# Load libraries / create tokenizer object 

In [1]:
import pandas as pd 
import string 
import numpy as np 
import os 
from nltk import word_tokenize
from nltk.corpus import stopwords   
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter 
import math
import re 

# load corpus


In [5]:
def load_corpus(path): 
    corpus_paths = []
    for filename in os.listdir(path): 
        file_path = path + filename 
        corpus_paths.append(file_path)
    return corpus_paths 

corpus_dir = './corpus/'
file_paths = load_corpus(corpus_dir)
for path in file_paths: 
    print(path)




./corpus/en.wikipedia.org_wiki_Inter-process_communication.txt
./corpus/github.com_dkorenci_topic_coverage_blob_main_data.readme.txt.txt
./corpus/playcode.io_javascript.txt
./corpus/chatgpt.com_c_561150ab-359f-4949-bfba-f1d10d660259.txt
./corpus/www.geeksforgeeks.org_python-os-listdir-method_.txt
./corpus/stackoverflow.com_questions_461791_is-it-possible-to-write-to-a-file-on-a-disk-using-javascript?rq=3.txt
./corpus/www.reddit.com_r_learnprogramming_comments_6sftac_how_can_i_transfer_data_between_java_and_python_.txt
./corpus/www.linkedin.com_feed_.txt


# Preprocess data 

In [6]:
ps = PorterStemmer()
stop = set(stopwords.words('english'))

def read_txt(file_path): 
    # read file contents by line 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    text = re.sub(r'\n', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    text = text.lower()
    tokenized_words = word_tokenize(text)
    tokenized_words = [w for w in tokenized_words if w not in stop]
    tokenized_words = [ps.stem(word) for word in tokenized_words]
    return ' '.join(tokenized_words)



websites_preprocessed_data = [] 
for path in file_paths: 
    websites_preprocessed_data.append(read_txt(path))

print(websites_preprocessed_data)

['interprocess commun wikipedia jump contentmain menusearchcr accountlog inperson toolscont hidetopapproachesapplicationstoggl applic subsectionremot procedur call interfacesplatform commun stackoper system commun stackdistribut object modelsse alsoreferencesextern linksinterprocess communication29 languagesarticletalkreadeditview historytoolsfrom wikipedia free encyclopedia articl includ list gener refer lack suffici correspond inlin citat pleas help improv articl introduc precis citat august 2015 learn remov messagea grid comput system connect mani person comput internet via interprocess network communicationin comput scienc interprocess commun ipc also spell interprocess commun mechan provid oper system process manag share data typic applic use ipc categor client server client request data server respond client requests1 mani applic client server commonli seen distribut computingipc import design process microkernel nanokernel reduc number function provid kernel function obtain comm

# TF / IDF Frequency

In [4]:
doc_occurrences = {}
def get_frequencies(website): # takes the corpus, counts its frequencies occurred in doc and stores in dictionary 
    word_freq = {} # stores the frequency of each word in a doc 
    for word in website: 
        if word in word_freq: 
            word_freq[word] += 1 
        else: 
            word_freq[word] = 1 
    return word_freq 

def get_doc_occurrences(website_dict, websites, doc_occurrences): # get the number of websites a word appears in 
    for key in website_dict: 
        if key not in doc_occurrences: 
            ct = 0 
            for website in websites: 
                if key in website: 
                    ct = ct + 1 
            doc_occurrences[key] = ct 
    return doc_occurrences


def calculate_tf_idf(website, websites, n, doc_occurrences): 
    word_freqs = get_frequencies(website)
    tf_idf = {}
    doc_occurrences = get_doc_occurrences(word_freqs, websites, doc_occurrences)
    tf_idf = {} 
    for key in word_freqs: 
        doc_occurs = doc_occurrences[key]
        print(doc_occurs, word_freqs[key])
        tf_idf[key] = word_freqs[key] * math.log(n / doc_occurs)
    print(tf_idf)
    return tf_idf 


for website in websites_preprocessed_data: 
    calculate_tf_idf(website, websites_preprocessed_data, len(websites_preprocessed_data), doc_occurrences=doc_occurrences)











3 109
3 114
3 166
3 315
3 82
3 41
3 142
3 139
3 105
3 44
3 48
3 38
3 95
3 25
3 74
3 57
3 8
3 93
3 29
3 61
3 17
3 13
2 2
3 26
1 1
3 3
3 20
2 1
3 3
3 2
3 2
2 1
2 1
{'o': 0.0, 'n': 0.0, 'e': 0.0, ' ': 0.0, 'm': 0.0, 'v': 0.0, 'i': 0.0, 'a': 0.0, 't': 0.0, 'p': 0.0, 'h': 0.0, 'd': 0.0, 'r': 0.0, 'b': 0.0, 's': 0.0, 'c': 0.0, 'x': 0.0, 'l': 0.0, 'g': 0.0, 'u': 0.0, 'w': 0.0, 'y': 0.0, '1': 0.8109302162163288, 'k': 0.0, 'z': 1.0986122886681098, '2': 0.0, 'f': 0.0, '6': 0.4054651081081644, 'q': 0.0, 'j': 0.0, '0': 0.0, '“': 0.4054651081081644, '”': 0.4054651081081644}
3 196
3 183
3 388
3 78
3 135
3 156
3 154
3 123
3 108
3 74
3 7
3 27
3 89
3 42
3 47
3 19
3 208
3 22
3 45
3 57
3 95
3 24
3 16
3 45
3 11
2 1
1 3
3 2
3 1
3 2
{'a': 0.0, 'i': 0.0, ' ': 0.0, 'c': 0.0, 'o': 0.0, 'n': 0.0, 't': 0.0, 'r': 0.0, 'l': 0.0, 'h': 0.0, 'j': 0.0, 'b': 0.0, 's': 0.0, 'g': 0.0, 'p': 0.0, 'y': 0.0, 'e': 0.0, 'k': 0.0, 'v': 0.0, 'u': 0.0, 'm': 0.0, 'f': 0.0, 'w': 0.0, 'd': 0.0, 'x': 0.0, '4': 0.4054651081081644, '5'

# actual TF / IDF Frequency

In [4]:
vectorizer = TfidfVectorizer(norm='l2', smooth_idf=True)
website_content = [] 
def file_to_st(file_path): 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    return text 

website_content.append(file_to_st("movie.txt"))
website_content.append(file_to_st("movie2.txt"))
website_content.append(file_to_st("movie3.txt"))

x = vectorizer.fit_transform(websites)
tokens = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data = x.T.toarray(),index = tokens, columns = ['movie.txt', 'movie2.txt', 'movie3.txt'])
print(tfidf_df)
print(type(tfidf_df))




         movie.txt  movie2.txt  movie3.txt
2001      0.035696    0.000000    0.024381
2077      0.000000    0.000000    0.032057
40k       0.000000    0.000000    0.032057
550       0.000000    0.037198    0.000000
abil      0.000000    0.000000    0.032057
...            ...         ...         ...
world     0.027721    0.021970    0.037867
worldth   0.000000    0.000000    0.032057
would     0.027721    0.021970    0.018934
xfile     0.046936    0.000000    0.000000
young     0.046936    0.000000    0.000000

[520 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>


# SVD 

In [9]:
# rank < min(m, n) for matrix m x n 
def svd_init(m, rank): 
    # convert from dataframe to array 
    arr = m.values
    u, s, v = np.linalg.svd(arr, full_matrices=False)
    v = v.T 
    w = np.zeros((arr.shape[0], rank)) 
    h = np.zeros((rank, arr.shape[1]))

    w[:, 0] = np.sqrt(s[0]) * np.abs(u[:, 0])
    h[0, :] = np.sqrt(s[0]) * np.abs(v[:, 0].T)

    for i in range(1, rank): 
        print(u.shape)
        x = u[:, i]
        y = v[:, i]
        x_p = (x >= 0) * x
        x_n = (x < 0) * -x
        y_p = (y >= 0) * y
        y_n = (y <  0) * -y

        xp_norm = np.linalg.norm(x_p, 2)
        xn_norm = np.linalg.norm(x_n, 2)
        yp_norm = np.linalg.norm(y_p, 2)
        yn_norm = np.linalg.norm(y_n, 2)

        p_norm = xp_norm * yp_norm 
        n_norm = xn_norm * yn_norm 
        if p_norm > n_norm: 
            u_i = x_p / xp_norm 
            v_i = y_p / yp_norm 
            sigma = p_norm 
        else: 
            u_i = x_n / xn_norm 
            v_i = y_n / yn_norm 
            sigma = n_norm 
        w[:, i] = np.sqrt(s[i] * sigma) * u_i
        h[i, :] = np.sqrt(s[i] * sigma) * v_i.T 

    threshold = 1e-10 

    w[w < threshold] = 0
    h[h < threshold] = 0

    return w, h 
        



w, h = svd_init(tfidf_df, 3)
print("w: ", w, w.shape)
print("h: ", h, h.shape)


(520, 3)
(520, 3)
w:  [[0.02820312 0.02614555 0.02046649]
 [0.01532784 0.         0.03106882]
 [0.01532784 0.         0.03106882]
 ...
 [0.03235515 0.00781373 0.        ]
 [0.02175588 0.04807016 0.        ]
 [0.02175588 0.04807016 0.        ]] (520, 3)
h:  [[0.69198507 0.71030221 0.71380236]
 [0.65880846 0.         0.        ]
 [0.         0.         0.55858965]] (3, 3)


# NMF