# Load libraries / create tokenizer object 

In [61]:
import pandas as pd 
import string 
import numpy as np 
from nltk import word_tokenize
from nltk.corpus import stopwords   
from nltk.stem import PorterStemmer
from collections import Counter 
import math
import re 

# Preprocess data 

In [77]:
ps = PorterStemmer()
stop = set(stopwords.words('english'))

def read_txt(file_path): 
    # read file contents by line 
    file = open(file_path, "r", encoding="utf-8")
    text = file.read() 
    text = re.sub(r'\n', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    text = text.lower()
    tokenized_words = word_tokenize(text)
    tokenized_words = [w for w in tokenized_words if w not in stop]
    tokenized_words = [ps.stem(word) for word in tokenized_words]
    return tokenized_words

websites = [] 
websites.append(read_txt("movie.txt"))
websites.append(read_txt("movie2.txt"))
websites.append(read_txt("movie3.txt"))
print(websites)

# TF / IDF Frequency

In [75]:
doc_occurrences = {}
def get_frequencies(website): # takes the corpus, counts its frequencies occurred in doc and stores in dictionary 
    word_freq = {} # stores the frequency of each word in a doc 
    for word in website: 
        if word in word_freq: 
            word_freq[word] += 1 
        else: 
            word_freq[word] = 1 
    return word_freq 

def get_doc_occurrences(website_dict, websites, doc_occurrences): # get the number of websites a word appears in 
    for key in website_dict: 
        if key not in doc_occurrences: 
            ct = 0 
            for website in websites: 
                if key in website: 
                    ct = ct + 1 
            doc_occurrences[key] = ct 
    return doc_occurrences


def calculate_tf_idf(website, websites, n, doc_occurrences): 
    word_freqs = get_frequencies(website)
    tf_idf = {}
    doc_occurrences = get_doc_occurrences(word_freqs, websites, doc_occurrences)
    tf_idf = {} 
    for key in word_freqs: 
        doc_occurs = doc_occurrences[key]
        print(doc_occurs, word_freqs[key])
        tf_idf[key] = word_freqs[key] * math.log(n / doc_occurs)
    print(tf_idf)
    return tf_idf 


for website in websites: 
    calculate_tf_idf(website, websites, len(websites), doc_occurrences=doc_occurrences)











1 11
1 41
1 64
1 2
1 2
1 1
1 2
1 4
1 1
1 2
1 3
1 2
1 1
1 2
1 1
1 1
1 1
1 16
1 4
1 2
1 2
1 1
1 5
1 2
1 1
1 1
1 3
1 3
1 2
1 2
1 2
1 4
1 24
1 1
1 2
1 2
1 1
1 7
1 2
1 18
1 5
1 6
1 3
1 2
1 1
1 2
1 6
1 4
1 3
1 7
1 2
1 8
1 1
1 5
1 1
1 2
1 1
1 7
1 1
1 3
1 1
1 22
1 4
1 1
1 1
1 1
1 1
1 1
1 3
1 1
1 1
1 6
1 10
1 2
1 1
1 1
1 2
1 11
1 4
1 1
1 1
1 1
1 27
1 3
1 2
1 3
1 21
1 9
1 1
1 1
1 1
1 2
1 2
1 6
1 2
1 4
1 9
1 4
1 3
1 4
1 2
1 2
1 3
1 3
1 3
1 2
1 2
1 4
1 2
1 2
1 2
1 19
1 3
1 7
1 2
1 2
1 2
1 3
1 1
1 10
1 11
1 3
1 2
1 6
1 5
1 1
1 2
1 2
1 2
1 4
1 1
1 3
1 2
1 2
1 2
1 1
1 1
1 1
1 2
1 1
1 1
1 1
1 1
1 1
1 2
1 3
1 7
1 1
1 1
1 1
1 1
1 2
1 2
1 14
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 3
1 1
1 3
1 1
1 1
1 2
1 2
1 3
1 1
1 2
1 1
1 1
1 1
1 6
1 1
1 2
1 2
1 1
1 1
1 1
1 2
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 2
1 1
1 1
1 6
1 4
1 1
1 1
1 1
1 3
1 1
1 2
1 1
1 1
1 4
1 4
1 3
1 2
1 1
1 1
1 1
1 2
1 1
1 1
1 2
1 2
1 7
1 8
1 4
1 2
1 1
1 4
1 1
1 1
1 1
1 1
1 1
1 2
1 1
1 1
1 1
1 1
1 1
1 1
1 4
1 1
1 1
1 1
1 1
1 1
1 1
1 2
1 2
1 1
1