# Load libraries / create tokenizer object 

In [2]:
import pandas as pd 
import string 
import numpy as np 
import os 
from nltk import word_tokenize
from nltk.corpus import stopwords   
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF 
from collections import Counter 
import math
import re 

# train / test split


In [6]:
data = pd.read_csv('c4filteredsample_tagged.csv')

for col in data.columns: 
    if "text" not in col: 
        data = data.drop(col, axis=1)


data = data.replace({r'\n' : ' '}, regex=True) 
data.fillna('', inplace=True)
data = data.replace({r'\'' : ' '}, regex=True) 

print(data)

                                                     text
0       Man continuously undergoes selection through t...
1       Today we are celebrating the precious letter P...
2       Comics Archive | I made inky. 07/30/12 – Comic...
3       This article is about current Disney record la...
4       This entry was posted on November 3, 2011 at 4...
...                                                   ...
149995  17 HOU W 22-13 2 0 0 0.0 0 No injury listed. 1...
149996  Item description Caption on back of photograph...
149997  In a bizarre coincidence recently, DC Comics l...
149998  The Men of Hampton Park dedicate themselves to...
149999  Welcome to Post-weekend Poetry and the twelfth...

[150000 rows x 1 columns]


# load corpus


In [1]:
def load_corpus(path): 
    corpus_paths = []
    for filename in os.listdir(path): 
        file_path = path + filename 
        corpus_paths.append(file_path)
    return corpus_paths 

corpus_dir = './corpus/'
file_paths = load_corpus(corpus_dir)
for path in file_paths: 
    print(path)




./corpus/www.aa.com_booking_find-flights.txt
./corpus/www.linkedin.com_jobs_search_?currentJobId=3915603480&distance=25&f_WT=1%2C3%2C2&geoId=102448103&keywords=software%20engineer&origin=JOBS_HOME_SEARCH_CARDS.txt
./corpus/spacelift.io_blog_kubernetes-cronjob.txt
./corpus/stackoverflow.com_questions_45019753_how-to-retrain-a-machine-learning-model-in-python-till-we-get-desired-outcome.txt
./corpus/towardsdatascience.com_topic-modeling-articles-with-nmf-8c6b2a227a45.txt
./corpus/www.frontiersin.org_articles_10.3389_fsoc.2022.886498_full.txt
./corpus/www.jetblue.com_flights.txt
./corpus/kubernetes.io_docs_concepts_workloads_controllers_cron-jobs_.txt
./corpus/kubernetes.io_docs_tasks_job_automated-tasks-with-cron-jobs_.txt


# Preprocess data 

In [4]:
ps = PorterStemmer()
stop = set(stopwords.words('english'))

def read_txt(file_path): 
    # read file contents by line 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\d', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    text = text.lower()
    tokenized_words = word_tokenize(text)
    tokenized_words = [w for w in tokenized_words if w not in stop]
    tokenized_words = [ps.stem(word) for word in tokenized_words]
    return ' '.join(tokenized_words)



websites_preprocessed_data = [] 
for path in file_paths: 
    websites_preprocessed_data.append(read_txt(path))


# TF / IDF Frequency (from scratch)

In [None]:
doc_occurrences = {}
def get_frequencies(website): # takes the corpus, counts its frequencies occurred in doc and stores in dictionary 
    word_freq = {} # stores the frequency of each word in a doc 
    for word in website: 
        if word in word_freq: 
            word_freq[word] += 1 
        else: 
            word_freq[word] = 1 
    return word_freq 

def get_doc_occurrences(website_dict, websites, doc_occurrences): # get the number of websites a word appears in 
    for key in website_dict: 
        if key not in doc_occurrences: 
            ct = 0 
            for website in websites: 
                if key in website: 
                    ct = ct + 1 
            doc_occurrences[key] = ct 
    return doc_occurrences


def calculate_tf_idf(website, websites, n, doc_occurrences): 
    word_freqs = get_frequencies(website)
    tf_idf = {}
    doc_occurrences = get_doc_occurrences(word_freqs, websites, doc_occurrences)
    tf_idf = {} 
    for key in word_freqs: 
        doc_occurs = doc_occurrences[key]
        print(doc_occurs, word_freqs[key])
        tf_idf[key] = word_freqs[key] * math.log(n / doc_occurs)
    print(tf_idf)
    return tf_idf 


for website in websites_preprocessed_data: 
    calculate_tf_idf(website, websites_preprocessed_data, len(websites_preprocessed_data), doc_occurrences=doc_occurrences)











# actual TF / IDF Frequency

In [23]:
vectorizer = TfidfVectorizer(norm='l2', smooth_idf=True, ngram_range=(1, 2))
website_content = [] 
def file_to_st(file_path): 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    return text 

 
x = vectorizer.fit_transform(websites_preprocessed_data)
tokens = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data = x.T.toarray(),index = tokens, columns = file_paths)
print(tfidf_df)
print(type(tfidf_df))




                    ./corpus/www.aa.com_booking_find-flights.txt  \
aacom                                                    0.06198   
aacom submit                                             0.03099   
aadvantag                                                0.03099   
aadvantag accountc                                       0.03099   
abac                                                     0.00000   
...                                                          ...   
zscore bigquerya                                         0.00000   
zscore number                                            0.00000   
zydziunait                                               0.00000   
zydziunait automat                                       0.00000   
京icp备号                                                   0.00000   

                    ./corpus/www.linkedin.com_jobs_search_?currentJobId=3915603480&distance=25&f_WT=1%2C3%2C2&geoId=102448103&keywords=software%20engineer&origin=JOBS_HOME_SEARCH_CARD

# SVD 

In [None]:
# rank < min(m, n) for matrix m x n 
def svd_init(m, rank): 
    # convert from dataframe to array 
    arr = m.values
    u, s, v = np.linalg.svd(arr, full_matrices=False)
    v = v.T 
    w = np.zeros((arr.shape[0], rank)) 
    h = np.zeros((rank, arr.shape[1]))

    w[:, 0] = np.sqrt(s[0]) * np.abs(u[:, 0])
    h[0, :] = np.sqrt(s[0]) * np.abs(v[:, 0].T)

    for i in range(1, rank): 
        print(u.shape)
        x = u[:, i]
        y = v[:, i]
        x_p = (x >= 0) * x
        x_n = (x < 0) * -x
        y_p = (y >= 0) * y
        y_n = (y <  0) * -y

        xp_norm = np.linalg.norm(x_p, 2)
        xn_norm = np.linalg.norm(x_n, 2)
        yp_norm = np.linalg.norm(y_p, 2)
        yn_norm = np.linalg.norm(y_n, 2)

        p_norm = xp_norm * yp_norm 
        n_norm = xn_norm * yn_norm 
        if p_norm > n_norm: 
            u_i = x_p / xp_norm 
            v_i = y_p / yp_norm 
            sigma = p_norm 
        else: 
            u_i = x_n / xn_norm 
            v_i = y_n / yn_norm 
            sigma = n_norm 
        w[:, i] = np.sqrt(s[i] * sigma) * u_i
        h[i, :] = np.sqrt(s[i] * sigma) * v_i.T 

    threshold = 1e-10 

    w[w < threshold] = 0
    h[h < threshold] = 0

    return w, h 
        



w, h = svd_init(tfidf_df, 3)
print("w: ", w, w.shape)
print("h: ", h, h.shape)


# NMF

In [20]:
nmf_model = NMF(n_components=4, random_state=42)


terms = vectorizer.get_feature_names_out()
W = nmf_model.fit_transform(x)
H = nmf_model.components_ 


topics = [] 

for index, topic in enumerate(H):
    topics.append([terms[i] for i in topic.argsort()[-3:]])




0 [0.         0.         0.64368637 0.008001  ]
1 [0.01520028 0.02687482 0.61630582 0.        ]
2 [0.62285755 0.02338112 0.00872876 0.        ]
3 [0.         0.62006743 0.         0.0041045 ]
4 [0.        0.5749474 0.        0.       ]
5 [0.         0.6752744  0.00446505 0.00246688]
6 [0.         0.         0.         1.07282832]
7 [0.75865586 0.         0.         0.        ]
8 [0.69128653 0.         0.         0.        ]


# mapping back to documents

In [24]:
topic_doc_map = {i: [] for i in range(nmf_model.n_components)}

for doc_index, topic_contributions in enumerate(W):
    dominant_topic = np.argmax(topic_contributions)
    topic_doc_map[dominant_topic].append(doc_index)

for topic, doc_indices in topic_doc_map.items():
    print(f"Topic {topic + 1}:")
    print("Documents:")
    for doc_index in doc_indices:
        print(f"Document: {file_paths[doc_index]}")
    print()

Topic 1:
Documents:
Document: ./corpus/spacelift.io_blog_kubernetes-cronjob.txt
Document: ./corpus/kubernetes.io_docs_concepts_workloads_controllers_cron-jobs_.txt
Document: ./corpus/kubernetes.io_docs_tasks_job_automated-tasks-with-cron-jobs_.txt

Topic 2:
Documents:
Document: ./corpus/stackoverflow.com_questions_45019753_how-to-retrain-a-machine-learning-model-in-python-till-we-get-desired-outcome.txt
Document: ./corpus/towardsdatascience.com_topic-modeling-articles-with-nmf-8c6b2a227a45.txt
Document: ./corpus/www.frontiersin.org_articles_10.3389_fsoc.2022.886498_full.txt

Topic 3:
Documents:
Document: ./corpus/www.aa.com_booking_find-flights.txt
Document: ./corpus/www.linkedin.com_jobs_search_?currentJobId=3915603480&distance=25&f_WT=1%2C3%2C2&geoId=102448103&keywords=software%20engineer&origin=JOBS_HOME_SEARCH_CARDS.txt

Topic 4:
Documents:
Document: ./corpus/www.jetblue.com_flights.txt

