In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import pandas as pd
import string
import math
import re

# Don't collapse Pandas Dataframes:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
## TFIDF stands for "Term Frequency - Inverse Document Frequency"
## The TFIDF weight is a measure used to evaluate how important a 'word' is to a 'document' in a 'collection of documents'

In [3]:
# opening the file in read mode
document1 = open('document-1.txt','r')
document1 = document1.read()
document2 = open('document-2.txt','r')
document2 = document2.read()

list_of_documents = []
list_of_documents.append(document1)
list_of_documents.append(document2)
# print(list_of_documents)

In [4]:
training_data = []
stop_words = set(stopwords.words('english'))

for doc in list_of_documents:
    doc = doc.translate(str.maketrans('', '', string.punctuation)) ## Remove Punctuation Characters
    doc = doc.replace('\n','') ## Remove Line Breaks
    doc = re.sub(r'[0-9]+', '', doc) ## Remove Numerical Characters
    doc = doc.lower() ## Lowercase Characters
    doc = word_tokenize(doc)
    doc = [word for word in doc if word not in stop_words] ## Remove Stop words
    doc = ' '.join(doc)
    training_data.append(doc)

# print(doc)

In [5]:
def create_doc_info(list_of_documents):
    doc_info = []
    doc_id = 0
    for doc in list_of_documents:
        # print(doc)
        doc_id += 1
        doc_length = count_words(doc)
        info = {'doc_id' : doc_id,
            'doc_length' : doc_length}
        doc_info.append(info)
    return doc_info

def count_words(sent):
    count = 0
    words = word_tokenize(sent)
    for word in words:
        count += 1
    return count

In [6]:
doc_info = create_doc_info(training_data)
print(doc_info)

[{'doc_id': 1, 'doc_length': 1070}, {'doc_id': 2, 'doc_length': 1237}]


In [7]:
def create_freq_dict(sents):
    i = 0
    freqDict_list = []
    for sent in sents:
        i += 1
        freq_dict = {}
        words = word_tokenize(sent)
        for word in words:
            if word in freq_dict:
                freq_dict[word] += 1
            else:
                freq_dict[word] = 1
            temp = {'doc_id' : i,
                'freq_dict' : freq_dict}
        freqDict_list.append(temp)
    return freqDict_list

In [8]:
freqDict_list = create_freq_dict(training_data)
print(freqDict_list)

[{'doc_id': 1, 'freq_dict': {'vehicles': 11, 'guidepzwikilanguage': 1, 'policy': 1, 'language': 1, 'english': 1, '•': 2, '日本語': 1, 'русскийnavigationmain': 1, 'indexthe': 1, 'game': 2, 'worldvehiclesvehicles': 1, 'guidepage': 1, 'updated': 1, 'version': 2, 'drivingspiffopngspiffo': 1, 'knows': 1, 'driving': 1, 'vehiclesvehicles': 2, 'form': 1, 'driveable': 1, 'transport': 1, 'found': 3, 'project': 2, 'zomboid': 1, 'located': 2, 'throughout': 1, 'knox': 1, 'country': 1, 'outside': 1, 'homes': 1, 'parking': 2, 'lotscontents': 1, 'types': 3, 'controls': 3, 'car': 7, 'keys': 2, 'hotwiring': 2, 'tools': 1, 'vehicle': 42, 'mechanics': 26, 'menu': 10, 'engine': 18, 'quality': 9, 'installinguninstalling': 2, 'parts': 11, 'repairsreplacements': 1, 'specialty': 2, 'repairs': 1, 'dashboard': 3, 'refuelingtypesvehicles': 1, 'come': 1, 'three': 3, 'different': 3, 'along': 2, 'installed': 2, 'corresponding': 2, 'type': 3, 'recipe': 4, 'magazine': 2, 'must': 4, 'read': 1, 'player': 5, 'add': 1, 'remo

In [9]:
def computeTF(doc_info, freqDict_list):
    TF_scores = []
    for tempDict in freqDict_list:
        id = tempDict['doc_id']
        for k in tempDict['freq_dict']:
            temp = {'doc_id':id,
                    'TF_score':tempDict['freq_dict'][k]/doc_info[id-1]['doc_length'],
                    'key':k}
            TF_scores.append(temp)
    return TF_scores

def computeIDF(doc_info, freqDict_list):
    IDF_scores = []
    counter = 0
    for dict in freqDict_list:
        counter += 1
        for k in dict['freq_dict'].keys():
            count = sum([k in tempDict['freq_dict'] for tempDict in freqDict_list])
            temp = {'doc_id':counter, 'IDF_score':math.log(len(doc_info)/count),'key':k}
            
            IDF_scores.append(temp)
            
    return IDF_scores

In [10]:
## TF = number of times the term appears in the doc/total number of words in the doc
TF_scores = computeTF(doc_info, freqDict_list)
TF_scores_df = pd.DataFrame(TF_scores)
TF_scores_df.sort_values(['TF_score'], ascending=True).head(20)

Unnamed: 0,doc_id,TF_score,key
543,2,0.000808,carsvehicles
747,2,0.000808,lotshow
748,2,0.000808,zomboiddriving
749,2,0.000808,easy
752,2,0.000808,visible
761,2,0.000808,automatically
762,2,0.000808,otherwise
763,2,0.000808,n
764,2,0.000808,outthe
746,2,0.000808,figure


In [11]:
## IDF = ln(number of docs/number docs the term appears in)
IDF_scores = computeIDF(doc_info, freqDict_list)
IDF_scores_df = pd.DataFrame(IDF_scores)
IDF_scores_df.sort_values(['IDF_score'], ascending=True).head(20)

Unnamed: 0,doc_id,IDF_score,key
0,1,0.0,vehicles
255,1,0.0,condition
720,2,0.0,ground
253,1,0.0,various
252,1,0.0,information
250,1,0.0,menuthe
511,1,0.0,air
248,1,0.0,radios
247,1,0.0,windows
245,1,0.0,small


In [12]:
def computeTFIDF(TF_scores, IDF_scores):
    TFIDF_scores = []
    for j in IDF_scores:
        for i in TF_scores:
            if j['key'] == i['key'] and j['doc_id'] == i['doc_id']:
                temp = {'doc_id' : j['doc_id'],
                       'TFIDF_score' : j['IDF_score']*i['TF_score'],
                       'key':i['key']}
        TFIDF_scores.append(temp)
    return TFIDF_scores

In [13]:
## TFIDF is the product of the TF and IDF scores of the term
## The goal is to 'weigh down' the frequent terms while 'scaling up' the rare terms
## The lower the TFIDF score, the more frequent the term is
TFIDF_scores = computeTFIDF(TF_scores, IDF_scores)
TFIDF_scores_df = pd.DataFrame(TFIDF_scores)
print('Frequent Terms:')
TFIDF_scores_df.sort_values(['TFIDF_score'], ascending=True).head(20)

Frequent Terms:


Unnamed: 0,doc_id,TFIDF_score,key
0,1,0.0,vehicles
848,2,0.0,containers
849,2,0.0,container
854,2,0.0,rightclicking
856,2,0.0,gasoline
321,1,0.0,give
857,2,0.0,add
318,1,0.0,multiplier
860,2,0.0,empty
863,2,0.0,even


In [14]:
## TFIDF is the product of the TF and IDF scores of the term
## The goal is to 'weigh down' the frequent terms while 'scaling up' the rare terms
## The higher the TFIDF score, the rarer the term is
print('Rare Terms:')
TFIDF_scores_df.sort_values(['TFIDF_score'], ascending=False).head(20)

Rare Terms:


Unnamed: 0,doc_id,TFIDF_score,key
41,1,0.01166,engine
69,1,0.009717,models
453,1,0.006478,metal
42,1,0.00583,quality
566,2,0.005603,cars
576,2,0.005603,find
261,1,0.005182,part
219,1,0.005182,screwdriver
568,2,0.005043,fill
455,1,0.004535,sheet
