# Load libraries / create tokenizer object 

In [2]:
import pandas as pd 
import string 
import numpy as np 
import os 
from nltk import word_tokenize
from nltk.corpus import stopwords   
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF 
from collections import Counter 
import math
import re 

# train / test split


In [6]:
data = pd.read_csv('c4filteredsample_tagged.csv')

for col in data.columns: 
    if "text" not in col: 
        data = data.drop(col, axis=1)


data = data.replace({r'\n' : ' '}, regex=True) 
data.fillna('', inplace=True)
data = data.replace({r'\'' : ' '}, regex=True) 

print(data)

                                                     text
0       Man continuously undergoes selection through t...
1       Today we are celebrating the precious letter P...
2       Comics Archive | I made inky. 07/30/12 – Comic...
3       This article is about current Disney record la...
4       This entry was posted on November 3, 2011 at 4...
...                                                   ...
149995  17 HOU W 22-13 2 0 0 0.0 0 No injury listed. 1...
149996  Item description Caption on back of photograph...
149997  In a bizarre coincidence recently, DC Comics l...
149998  The Men of Hampton Park dedicate themselves to...
149999  Welcome to Post-weekend Poetry and the twelfth...

[150000 rows x 1 columns]


# load corpus


In [1]:
def load_corpus(path): 
    corpus_paths = []
    for filename in os.listdir(path): 
        file_path = path + filename 
        corpus_paths.append(file_path)
    return corpus_paths 

corpus_dir = './corpus/'
file_paths = load_corpus(corpus_dir)
for path in file_paths: 
    print(path)




./corpus/www.aa.com_booking_find-flights.txt
./corpus/www.linkedin.com_jobs_search_?currentJobId=3915603480&distance=25&f_WT=1%2C3%2C2&geoId=102448103&keywords=software%20engineer&origin=JOBS_HOME_SEARCH_CARDS.txt
./corpus/spacelift.io_blog_kubernetes-cronjob.txt
./corpus/stackoverflow.com_questions_45019753_how-to-retrain-a-machine-learning-model-in-python-till-we-get-desired-outcome.txt
./corpus/towardsdatascience.com_topic-modeling-articles-with-nmf-8c6b2a227a45.txt
./corpus/www.frontiersin.org_articles_10.3389_fsoc.2022.886498_full.txt
./corpus/www.jetblue.com_flights.txt
./corpus/kubernetes.io_docs_concepts_workloads_controllers_cron-jobs_.txt
./corpus/kubernetes.io_docs_tasks_job_automated-tasks-with-cron-jobs_.txt


# Preprocess data 

In [None]:
ps = PorterStemmer()
stop = set(stopwords.words('english'))

def read_txt(file_path): 
    # read file contents by line 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\d', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    text = text.lower()
    tokenized_words = word_tokenize(text)
    tokenized_words = [w for w in tokenized_words if w not in stop]
    tokenized_words = [ps.stem(word) for word in tokenized_words]
    return ' '.join(tokenized_words)



websites_preprocessed_data = [] 
for path in file_paths: 
    websites_preprocessed_data.append(read_txt(path))


print(websites_preprocessed_data)


# TF / IDF Frequency (from scratch)

In [None]:
doc_occurrences = {}
def get_frequencies(website): # takes the corpus, counts its frequencies occurred in doc and stores in dictionary 
    word_freq = {} # stores the frequency of each word in a doc 
    for word in website: 
        if word in word_freq: 
            word_freq[word] += 1 
        else: 
            word_freq[word] = 1 
    return word_freq 

def get_doc_occurrences(website_dict, websites, doc_occurrences): # get the number of websites a word appears in 
    for key in website_dict: 
        if key not in doc_occurrences: 
            ct = 0 
            for website in websites: 
                if key in website: 
                    ct = ct + 1 
            doc_occurrences[key] = ct 
    return doc_occurrences


def calculate_tf_idf(website, websites, n, doc_occurrences): 
    word_freqs = get_frequencies(website)
    tf_idf = {}
    doc_occurrences = get_doc_occurrences(word_freqs, websites, doc_occurrences)
    tf_idf = {} 
    for key in word_freqs: 
        doc_occurs = doc_occurrences[key]
        print(doc_occurs, word_freqs[key])
        tf_idf[key] = word_freqs[key] * math.log(n / doc_occurs)
    print(tf_idf)
    return tf_idf 


for website in websites_preprocessed_data: 
    calculate_tf_idf(website, websites_preprocessed_data, len(websites_preprocessed_data), doc_occurrences=doc_occurrences)











# actual TF / IDF Frequency

In [59]:
vectorizer = TfidfVectorizer(norm='l2', smooth_idf=True, ngram_range=(2, 3), stop_words='english')
website_content = [] 
def file_to_st(file_path): 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    return text 

 
x = vectorizer.fit_transform(websites_preprocessed_data)
tokens = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data = x.T.toarray(),index = tokens, columns = file_paths)

print(x)


  (0, 87)	0.0312569026559189
  (0, 12371)	0.0312569026559189
  (0, 18643)	0.0312569026559189
  (0, 22755)	0.0312569026559189
  (0, 13457)	0.0312569026559189
  (0, 14100)	0.0312569026559189
  (0, 11531)	0.0312569026559189
  (0, 9839)	0.0312569026559189
  (0, 8729)	0.0312569026559189
  (0, 101)	0.0312569026559189
  (0, 12378)	0.0312569026559189
  (0, 2693)	0.0312569026559189
  (0, 15597)	0.0312569026559189
  (0, 129)	0.0312569026559189
  (0, 3676)	0.0312569026559189
  (0, 21180)	0.0312569026559189
  (0, 9240)	0.0312569026559189
  (0, 8719)	0.0312569026559189
  (0, 91)	0.0312569026559189
  (0, 12373)	0.0312569026559189
  (0, 8586)	0.0312569026559189
  (0, 18740)	0.0312569026559189
  (0, 4490)	0.0312569026559189
  (0, 15382)	0.0312569026559189
  (0, 8725)	0.0312569026559189
  :	:
  (8, 4137)	0.013793040708072423
  (8, 18958)	0.013793040708072423
  (8, 20066)	0.013793040708072423
  (8, 18970)	0.013793040708072423
  (8, 10477)	0.013793040708072423
  (8, 17515)	0.013793040708072423
  (8, 8855

# SVD 

In [50]:
# rank < min(m, n) for matrix m x n 
def svd_init(m, rank): 
    # convert from dataframe to array 
    arr = m.values
    u, s, v = np.linalg.svd(arr, full_matrices=False)
    v = v.T 
    w = np.zeros((arr.shape[0], rank)) 
    h = np.zeros((rank, arr.shape[1]))

    w[:, 0] = np.sqrt(s[0]) * np.abs(u[:, 0])
    h[0, :] = np.sqrt(s[0]) * np.abs(v[:, 0].T)

    for i in range(1, rank): 
        print(u.shape)
        x = u[:, i]
        y = v[:, i]
        x_p = (x >= 0) * x
        x_n = (x < 0) * -x
        y_p = (y >= 0) * y
        y_n = (y <  0) * -y

        xp_norm = np.linalg.norm(x_p, 2)
        xn_norm = np.linalg.norm(x_n, 2)
        yp_norm = np.linalg.norm(y_p, 2)
        yn_norm = np.linalg.norm(y_n, 2)

        p_norm = xp_norm * yp_norm 
        n_norm = xn_norm * yn_norm 
        if p_norm > n_norm: 
            u_i = x_p / xp_norm 
            v_i = y_p / yp_norm 
            sigma = p_norm 
        else: 
            u_i = x_n / xn_norm 
            v_i = y_n / yn_norm 
            sigma = n_norm 
        w[:, i] = np.sqrt(s[i] * sigma) * u_i
        h[i, :] = np.sqrt(s[i] * sigma) * v_i.T 

    threshold = 1e-10 

    w[w < threshold] = 0
    h[h < threshold] = 0

    return w, h 
        



w, h = svd_init(tfidf_df, 3)
print("w: ", w, w.shape)
print("h: ", h, h.shape)


(37997, 9)
(37997, 9)
w:  [[1.04316829e-05 4.75269088e-04 1.37509973e-02]
 [1.04316829e-05 4.75269088e-04 1.37509973e-02]
 [1.04316830e-05 4.75269088e-04 1.37509973e-02]
 ...
 [4.36836389e-06 3.11687796e-03 0.00000000e+00]
 [4.36836389e-06 3.11687796e-03 0.00000000e+00]
 [4.36836389e-06 3.11687796e-03 0.00000000e+00]] (37997, 3)
h:  [[5.38055983e-04 5.40236319e-04 1.42902535e-01 2.59402116e-04
  3.23789135e-04 1.20511361e-03 3.96638816e-06 7.96448511e-01
  7.90426762e-01]
 [1.94221178e-02 1.10095831e-02 5.34009730e-03 4.76365390e-01
  5.67451136e-01 6.81260031e-01 1.36646055e-02 0.00000000e+00
  0.00000000e+00]
 [5.55184223e-01 4.64096898e-01 1.46886400e-02 8.76632064e-03
  0.00000000e+00 0.00000000e+00 6.89797225e-01 0.00000000e+00
  0.00000000e+00]] (3, 9)


# NMF

In [60]:
nmf_model = NMF(n_components=4, random_state=60)


terms = vectorizer.get_feature_names_out()
W = nmf_model.fit_transform(x)
H = nmf_model.components_ 


topics = [] 

for index, topic in enumerate(H):
    topics.append([terms[i] for i in topic.argsort()[-3:]])
    print([terms[i] for i in topic.argsort()[-4:]])


print(W)



['object use', 'pod use', 'pod secur', 'kubernet object']
['min read', 'text googl', 'topic model', 'et al']
['site new', 'open anoth', 'meet access', 'new window']
['open option list', 'open option', 'list convers', 'option list convers']
[[1.78744765e-04 1.10787074e-04 5.61728657e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.11269572e+00]
 [1.74038903e-01 1.38208852e-03 0.00000000e+00 2.69484638e-03]
 [0.00000000e+00 5.38750178e-01 1.08859770e-04 0.00000000e+00]
 [0.00000000e+00 5.08268889e-01 0.00000000e+00 0.00000000e+00]
 [5.57761085e-05 6.68618847e-01 1.94308088e-04 4.34908439e-04]
 [0.00000000e+00 0.00000000e+00 6.59341490e-01 7.86561752e-04]
 [7.96706614e-01 0.00000000e+00 1.94687868e-04 0.00000000e+00]
 [7.87029555e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]]


# mapping back to documents

In [61]:
topic_doc_map = {i: [] for i in range(nmf_model.n_components)}

for doc_index, topic_scores in enumerate(W):
    max_topic_score = np.argmax(topic_scores)
    topic_doc_map[max_topic_score].append(doc_index)

for topic, doc_indices in topic_doc_map.items():
    print(f"Topic {topic + 1}:")
    for doc_index in doc_indices:
        print(f"{file_paths[doc_index]}")
    print()

Topic 1:
./corpus/spacelift.io_blog_kubernetes-cronjob.txt
./corpus/kubernetes.io_docs_concepts_workloads_controllers_cron-jobs_.txt
./corpus/kubernetes.io_docs_tasks_job_automated-tasks-with-cron-jobs_.txt

Topic 2:
./corpus/stackoverflow.com_questions_45019753_how-to-retrain-a-machine-learning-model-in-python-till-we-get-desired-outcome.txt
./corpus/towardsdatascience.com_topic-modeling-articles-with-nmf-8c6b2a227a45.txt
./corpus/www.frontiersin.org_articles_10.3389_fsoc.2022.886498_full.txt

Topic 3:
./corpus/www.aa.com_booking_find-flights.txt
./corpus/www.jetblue.com_flights.txt

Topic 4:
./corpus/www.linkedin.com_jobs_search_?currentJobId=3915603480&distance=25&f_WT=1%2C3%2C2&geoId=102448103&keywords=software%20engineer&origin=JOBS_HOME_SEARCH_CARDS.txt

