In [1]:
import string, os, nltk
import numpy as np
import math
import pandas as pd
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt to /home/sbokhari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sbokhari/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# process the stop words
with open('stop_words.txt') as f:
    doc = f.readlines()
    processed_doc = []
    for line in doc:
        new_line = line.strip()
        new_line = new_line.translate(str.maketrans('', '', string.punctuation)).lower();
        processed_doc.append(new_line)
    stop_words = []
    for line in processed_doc:
        stop_words.extend(line.split())
    print(stop_words)

FileNotFoundError: [Errno 2] No such file or directory: 'stop_words.txt'

In [None]:
# collect the text from the files
cwd = os.getcwd()
files = os.listdir("./documents")
dataset = []
for file in files:
    path = f"{cwd}/documents/{file}"
    with open(path) as f:
        dataset.append(f.read())
print('Number of documents \n', len(dataset), dataset) # has all the files text in a single list

Number of documents 
 8 ['Given a character sequence and a defined document unit, \ntokenization is the task of chopping it up into pieces, called tokens, \nperhaps at the same time throwing away certain characters, \nsuch as punctuation. \n\n', 's is a spam page.\ntokens\nstopwords\nindex\npostings\nclassification\nsupervised\ntokens\nstopwords\nindex\npostings\nclassification\nsupervised\ntokens\nstopwords\nindex\npostings\nclassification\nsupervised\ntokens\nstopwords\nindex\npostings\nclassification\nsupervised\n', 'In text classification, we are given a description \nof a document and a fixed set of classes.\n\n', 'Using a supervised learning method or learning algorithm, \nwe wish to learn a classifier or classification function \nthat maps documents to classes.\n\n', 'For English, an alternative to making every token lowercase \nis to just make some tokens lowercase. The simplest heuristic\nis to convert to lowercase words at the beginning of a \nsentence and all words occurring

In [None]:
def preprocess(string):
    sentences = []
    #lemmatization process
    wordnet = WordNetLemmatizer()
    #stemming - convert words into their stem
    porter_stemmer = PorterStemmer()

    x = [i.lower() for i in word_tokenize(string) if i.isalpha()]
    for word in x: 
        if word not in stop_words and len(word) > 2:
            sentences.append(porter_stemmer.stem(wordnet.lemmatize(word)))
            # sentences.append(word)

    new_string = " ".join(sentences)
    # print(new_string)
    return new_string

#preprossing the data
clean_dataset = []
for each_document in dataset:
    clean_doc = preprocess(each_document)
    clean_dataset.append(clean_doc)
print(clean_dataset)


['given charact sequenc defin document unit token task chop into piec call token perhap same time throw away certain charact such punctuat', 'spam page token stopword index post classif supervis token stopword index post classif supervis token stopword index post classif supervis token stopword index post classif supervis', 'text classif given descript document fix set class', 'use supervis learn method learn algorithm wish learn classifi classif function map document class', 'english altern make everi token lowercas just make some token lowercas simplest heurist convert lowercas word begin sentenc all word occur titl all uppercas which most all word capit', 'index document each term occur creat invert index consist dictionari post', 'token normal process canonic token match occur despit superfici differ charact sequenc token', 'gain speed benefit index retriev time build index advanc major step collect document index token text turn each document into list token linguist preprocess pr

In [None]:
# standard formula to find normalized term frequency
def termFrequency(term, document):
    normalizeDocument = document.lower().split()
    return normalizeDocument.count(term.lower()) / float(len(normalizeDocument))

def calculate_tf(documents):
    tf_doc = []
    for txt in documents:
        sentence = txt.split()
        norm_tf= dict.fromkeys(set(sentence), 0) #create a new dictinary 
        for word in sentence:
            norm_tf[word] = termFrequency(word, txt) 
        tf_doc.append(norm_tf)
        df = pd.DataFrame([norm_tf])
        idx = 0
        new_col = ["TF"]    
        df.insert(loc = idx, column = 'Document', value = new_col)
        print(df)
    return tf_doc

term_freq = calculate_tf(clean_dataset) # store result for further access

  Document    perhap     token   certain      time      unit     given  \
0       TF  0.045455  0.090909  0.045455  0.045455  0.045455  0.045455   

       such      into   sequenc  ...  punctuat      task      same      chop  \
0  0.045455  0.045455  0.045455  ...  0.045455  0.045455  0.045455  0.045455   

      throw   charact  document      piec      away      call  
0  0.045455  0.090909  0.045455  0.045455  0.045455  0.045455  

[1 rows x 21 columns]
  Document  supervis      spam     token   classif  stopword      page  \
0       TF  0.153846  0.038462  0.153846  0.153846  0.153846  0.038462   

       post     index  
0  0.153846  0.153846  
  Document  given  document    fix    set  class   text  classif  descript
0       TF  0.125     0.125  0.125  0.125  0.125  0.125    0.125     0.125
  Document  supervis      wish  document     class  classifi    method  \
0       TF  0.071429  0.071429  0.071429  0.071429  0.071429  0.071429   

      learn   classif  algorithm  function 

In [None]:
# check how important a word is to a document in a collection of files
def inverseDocumentFrequency(term, allDocuments):
    numDocumentsWithThisTerm = 0
    for doc in range (0, len(allDocuments)):
        if term.lower() in allDocuments[doc].lower().split():
            numDocumentsWithThisTerm = numDocumentsWithThisTerm + 1
 
    if numDocumentsWithThisTerm > 0:
        return 1.0 + math.log(float(len(allDocuments)) / numDocumentsWithThisTerm)
    else:
        return 1.0
    
def compute_idf(documents):
    idf_dict = {}
    for doc in documents:
        sentence = doc.split()
        for word in sentence:
            idf_dict[word] = inverseDocumentFrequency(word, documents)
    return idf_dict
idf_dict = compute_idf(clean_dataset)
print("Total number of Unique words =", len(idf_dict))
compute_idf(clean_dataset)

Total number of Unique words = 87


{'given': 2.386294361119891,
 'charact': 2.386294361119891,
 'sequenc': 2.386294361119891,
 'defin': 3.0794415416798357,
 'document': 1.4700036292457357,
 'unit': 3.0794415416798357,
 'token': 1.4700036292457357,
 'task': 3.0794415416798357,
 'chop': 3.0794415416798357,
 'into': 2.386294361119891,
 'piec': 3.0794415416798357,
 'call': 3.0794415416798357,
 'perhap': 3.0794415416798357,
 'same': 3.0794415416798357,
 'time': 2.386294361119891,
 'throw': 3.0794415416798357,
 'away': 3.0794415416798357,
 'certain': 3.0794415416798357,
 'such': 3.0794415416798357,
 'punctuat': 3.0794415416798357,
 'spam': 3.0794415416798357,
 'page': 3.0794415416798357,
 'stopword': 3.0794415416798357,
 'index': 1.9808292530117262,
 'post': 2.386294361119891,
 'classif': 1.9808292530117262,
 'supervis': 2.386294361119891,
 'text': 2.386294361119891,
 'descript': 3.0794415416798357,
 'fix': 3.0794415416798357,
 'set': 3.0794415416798357,
 'class': 2.386294361119891,
 'use': 3.0794415416798357,
 'learn': 3.079

In [None]:
# saving into a text file
with open('index.txt', 'w') as f:
    for key, value in idf_dict.items():
        f.write('%s:%s\n' % (key,value))

In [None]:
# tf-idf score across all docs for the query string
def calculate_tfidf_with_alldocs(documents , query):
    tf_idf = []
    index = 0
    query_tokens = query.split()
    df = pd.DataFrame(columns=['doc'] + query_tokens)
    for doc in documents:
        df['doc'] = np.arange(0 , len(documents))
        doc_num = term_freq[index]
        sentence = doc.split()
        for word in sentence:
            for text in query_tokens:
                if(text == word):
                    # idx = sentence.index(word)
                    tf_idf_score = doc_num[word] * idf_dict[word]
                    tf_idf.append(tf_idf_score)
                    df.iloc[index, df.columns.get_loc(word)] = tf_idf_score
        index += 1
    df.fillna(0 , axis = 1, inplace = True)
    return tf_idf , df

In [None]:
tf_idf , df = calculate_tfidf_with_alldocs(clean_dataset , "tokens")
print(df)

   doc  tokens
0    0       0
1    1       0
2    2       0
3    3       0
4    4       0
5    5       0
6    6       0
7    7       0


In [None]:
tf_idf , df = calculate_tfidf_with_alldocs(clean_dataset , "token")
print(df)

   doc     token
0  0.0  0.133637
1  1.0  0.226154
2  2.0  0.000000
3  3.0  0.000000
4  4.0  0.101380
5  5.0  0.000000
6  6.0  0.339232
7  7.0  0.142258


In [None]:
tf_idf , df = calculate_tfidf_with_alldocs(clean_dataset , "index")
print(df)

   doc     index
0  0.0  0.000000
1  1.0  0.304743
2  2.0  0.000000
3  3.0  0.000000
4  4.0  0.000000
5  5.0  0.360151
6  6.0  0.000000
7  7.0  0.255591


In [None]:
tf_idf , df = calculate_tfidf_with_alldocs(clean_dataset , "classification")
print(df)

   doc  classification
0    0               0
1    1               0
2    2               0
3    3               0
4    4               0
5    5               0
6    6               0
7    7               0


In [None]:
tf_idf , df = calculate_tfidf_with_alldocs(clean_dataset , "classif")
print(df)

   doc   classif
0  0.0  0.000000
1  1.0  0.304743
2  2.0  0.247604
3  3.0  0.141488
4  4.0  0.000000
5  5.0  0.000000
6  6.0  0.000000
7  7.0  0.000000


In [100]:
tf_idf , df = calculate_tfidf_with_alldocs(clean_dataset , "token classif")
print(df)

   doc     token   classif
0  0.0  0.133637  0.000000
1  1.0  0.226154  0.304743
2  2.0  0.000000  0.247604
3  3.0  0.000000  0.141488
4  4.0  0.101380  0.000000
5  5.0  0.000000  0.000000
6  6.0  0.339232  0.000000
7  7.0  0.142258  0.000000
