# Load libraries / create tokenizer object 

In [5]:
import pandas as pd 
import string 
import numpy as np 
from nltk import word_tokenize
from nltk.corpus import stopwords   
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter 
import math
import re 

# Preprocess data 

In [6]:
ps = PorterStemmer()
stop = set(stopwords.words('english'))

def read_txt(file_path): 
    # read file contents by line 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    text = re.sub(r'\n', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    text = text.lower()
    tokenized_words = word_tokenize(text)
    tokenized_words = [w for w in tokenized_words if w not in stop]
    tokenized_words = [ps.stem(word) for word in tokenized_words]
    return ' '.join(tokenized_words)

websites = [] 
websites.append(read_txt("movie.txt"))
websites.append(read_txt("movie2.txt"))
websites.append(read_txt("movie3.txt"))
print(websites)

['one movi ai top head irobot one smith sinc ive seen dont rememb exact plot rememb rogu ai memor convers one robot smith mani way believ current ai technolog alreadi surpass robot movie1 matrix seri matrix take place dystopian world human trap simul made crazili advanc ai system use natur languag process talk humans2 blade blade also set dystopian futur ai call replic creat bore work howev ai becam advanc gain feel caus fight right human also use nlp commun humanswhen young read seri call higher institut villain educ hive artifici intellig becam sentient attack creator kind like termin book form childreni think movi big hero 6 relev ai one charact baymax use ai heal patient start kdrama main charact softwar engin make comput vision app let blind peopl ask question like count money front object front app tell isimit game alan ture make comput search decrypt code enigma machinei think iron man relat ai nlp iron man suit jarvi understand human languag aii think hbo silicon valley specif 

# TF / IDF Frequency

In [7]:
doc_occurrences = {}
def get_frequencies(website): # takes the corpus, counts its frequencies occurred in doc and stores in dictionary 
    word_freq = {} # stores the frequency of each word in a doc 
    for word in website: 
        if word in word_freq: 
            word_freq[word] += 1 
        else: 
            word_freq[word] = 1 
    return word_freq 

def get_doc_occurrences(website_dict, websites, doc_occurrences): # get the number of websites a word appears in 
    for key in website_dict: 
        if key not in doc_occurrences: 
            ct = 0 
            for website in websites: 
                if key in website: 
                    ct = ct + 1 
            doc_occurrences[key] = ct 
    return doc_occurrences


def calculate_tf_idf(website, websites, n, doc_occurrences): 
    word_freqs = get_frequencies(website)
    tf_idf = {}
    doc_occurrences = get_doc_occurrences(word_freqs, websites, doc_occurrences)
    tf_idf = {} 
    for key in word_freqs: 
        doc_occurs = doc_occurrences[key]
        print(doc_occurs, word_freqs[key])
        tf_idf[key] = word_freqs[key] * math.log(n / doc_occurs)
    print(tf_idf)
    return tf_idf 


for website in websites: 
    calculate_tf_idf(website, websites, len(websites), doc_occurrences=doc_occurrences)











3 109
3 114
3 166
3 315
3 82
3 41
3 142
3 139
3 105
3 44
3 48
3 38
3 95
3 25
3 74
3 57
3 8
3 93
3 29
3 61
3 17
3 13
2 2
3 26
1 1
3 3
3 20
2 1
3 3
3 2
3 2
2 1
2 1
{'o': 0.0, 'n': 0.0, 'e': 0.0, ' ': 0.0, 'm': 0.0, 'v': 0.0, 'i': 0.0, 'a': 0.0, 't': 0.0, 'p': 0.0, 'h': 0.0, 'd': 0.0, 'r': 0.0, 'b': 0.0, 's': 0.0, 'c': 0.0, 'x': 0.0, 'l': 0.0, 'g': 0.0, 'u': 0.0, 'w': 0.0, 'y': 0.0, '1': 0.8109302162163288, 'k': 0.0, 'z': 1.0986122886681098, '2': 0.0, 'f': 0.0, '6': 0.4054651081081644, 'q': 0.0, 'j': 0.0, '0': 0.0, '“': 0.4054651081081644, '”': 0.4054651081081644}
3 196
3 183
3 388
3 78
3 135
3 156
3 154
3 123
3 108
3 74
3 7
3 27
3 89
3 42
3 47
3 19
3 208
3 22
3 45
3 57
3 95
3 24
3 16
3 45
3 11
2 1
1 3
3 2
3 1
3 2
{'a': 0.0, 'i': 0.0, ' ': 0.0, 'c': 0.0, 'o': 0.0, 'n': 0.0, 't': 0.0, 'r': 0.0, 'l': 0.0, 'h': 0.0, 'j': 0.0, 'b': 0.0, 's': 0.0, 'g': 0.0, 'p': 0.0, 'y': 0.0, 'e': 0.0, 'k': 0.0, 'v': 0.0, 'u': 0.0, 'm': 0.0, 'f': 0.0, 'w': 0.0, 'd': 0.0, 'x': 0.0, '4': 0.4054651081081644, '5'

# actual TF / IDF Frequency

In [None]:
vectorizer = TfidfVectorizer(norm='l2')
website_content = [] 
def file_to_st(file_path): 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    return text 

website_content.append(file_to_st("movie.txt"))
website_content.append(file_to_st("movie2.txt"))
website_content.append(file_to_st("movie3.txt"))

x = vectorizer.fit_transform(websites)
tokens = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data = x.T.toarray(),index = tokens, columns = ['movie.txt', 'movie2.txt', 'movie3.txt'])
print(tfidf_df)




# SVD 

In [15]:
def svd_init(m, rank): 
    u, s, v = np.linalg.svd(m, full_matrices=False)
    vTranspose = v.T 
    w = np.zeros((m.shape[0], rank)) 
    h = np.zeroes((rank, m.shape[1]))

    w[:, 0] = np.sqrt(s[0]) * np.abs(u[:, 0])
    h[0, :] = np.sqrt(s[0]) * np.abs(v[:, 0].T)

    for i in range(1, rank): 
        u_i = u[:, i]
        v_i = v[:, i]
        ui_p = (u_i >= 0) * u_i 
        ui_n = (u_i < 0) * -u_i
        vi_p = (v_i >= 0) * v_i
        vi_n = (v_i <  0) * -v_i

        uip_n = np.linalg.norm(ui_p, 2)
        uin_n = np.linalg.norm(ui_n, 2)
        vip_n = np.linalg.norm(vi_p, 2)
        vin_n = np.linalg.norm(vi_n, 2)

        p_norm = uip_n * vip_n 
        n_norm = uin_n * vin_n 

        



svd_init(tfidf_df, 10)


520
