In [24]:
import nltk
nltk.download('shakespeare')
nltk.download('wordnet')
from nltk.corpus import shakespeare
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import random 
import math
from pprint import pprint

[nltk_data] Downloading package shakespeare to /home/oem/nltk_data...
[nltk_data]   Package shakespeare is already up-to-date!
[nltk_data] Downloading package wordnet to /home/oem/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
#creating corpus dictionary
corpus_dict={}
for file_id in shakespeare.fileids():
    corpus_dict[file_id]={} 

# Preprocsesing Each Document

In [26]:
stop_words = stopwords.words('english')
ps=PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [36]:
def normalize_text(terms_list):
    return [w.lower() for w in terms_list]

def removeAlphaNumeric(terms_list):
    return [w for w in terms_list if w.isalpha()]

def removeStopWords (terms_list):
    return [w for w in terms_list if w not in stop_words]

def doStemming (terms_list):
    return [ps.stem(w) for w in terms_list]

def doLemmatization (terms_list):
    return [lemmatizer.lemmatize(w) for w in terms_list]

def preProcessWords(terms_list):
    
    terms_list = normalize_text(terms_list)
    terms_list = removeAlphaNumeric(terms_list)
    terms_list  = removeStopWords(terms_list)
    terms_list = doStemming(terms_list)
    terms_list = doLemmatization(terms_list)
    
    return terms_list

for file_id in shakespeare.fileids():
    doc  = corpus_dict[file_id]
    play_terms = shakespeare.words(file_id)
    doc['words'] = preProcessWords(play_terms)

# Build Vocabulary set for each document

In [57]:
def getVocabulary (doc_terms_list):
    
    return set(doc_terms_list)

for file_id in shakespeare.fileids():
    doc=corpus_dict[file_id]
    doc['vocab'] = getVocabulary(doc['words'])

# Build Word Frequency List for Each Document

In [58]:
def getWordCounts(doc_terms_list):
    counts={}
    for w in doc_terms_list:
        count = counts.get(w,0)
        counts[w]=count+1
    return counts

for file_id in shakespeare.fileids():
    doc = corpus_dict[file_id]
    doc['counts'] = getWordCounts(doc['words'])

# Build Term Frequency List for Each Document

In [59]:
def getTermFrequency(doc_vocab_list, doc_count_dict, T):
    '''
    Input:
        doc_vocab_list: Document Vocabulary as Iterable
        doc_count_dict: Document Word Frequency as Dictionary
        T: Total Number of Words in the Document
    Output:
        Returns a Dictionary, having
            Each Term as a Key
            Term Frequency of each Term as its value
            TF = C/T
                C: Count of the Term
                T: Total Words In Document
    '''
    tf = {}
    for w in doc_vocab_list:
        tf[w] = doc_count_dict[w] / T
    return tf;

for file_id in shakespeare.fileids():
    doc = corpus_dict[file_id]
    doc['tf'] = getTermFrequency(doc['vocab'],doc['counts'], len(doc['words']))

# Build Corpus Vocabulary List

In [74]:
def getTotalVocabList(corpus_dict):
    total_corpus_vocab = set()
    for k,v in corpus_dict.items():
        total_corpus_vocab = total_corpus_vocab.union(corpus_dict[k]['vocab'])
    return total_corpus_vocab

vocab_list = []

for file_id in shakespeare.fileids():
    doc=corpus_dict[file_id]
    vocab_list.append(doc['vocab'])
    
from itertools import chain
corpus_total_vocab = set(chain.from_iterable(vocab_list))

# Build Document Frequency List

In [75]:
def getDocumentFrequency(corpus_dict, total_corpus_vocab):
    '''
    from the given docs dicts,
    construct Document Frequency list
    and returns it
    '''
    df = {}
    for w in total_corpus_vocab:
        df_count = 0
        for file_id,doc in corpus_dict.items():
            if w in corpus_dict[file_id]['vocab']:
                df_count = df_count + 1
        df[w] = df_count
    return df
            
corpus_df = getDocumentFrequency(corpus_dict, corpus_total_vocab)

# Build Inverse Document Frequency

In [76]:
  def getInverseDocumentFrequency(corpus_df, total_corpus_vocab):
    '''
    using provided document frequency
    constructs IDF for each vocab in total_vocab
    and returns it as dict
    '''
    idf = {}
    for w in total_corpus_vocab:
        idf[w] = corpus_df[w] / len(shakespeare.fileids())
    return idf

corpus_idf = getInverseDocumentFrequency(corpus_df, corpus_total_vocab)

# Build Wieghted TF-IDF

In [90]:
def getWeightedTFIDFOfATerm(tc, idf):
   
    wv = 0
    if(tc > 0):
        wv = 1 + np.log10(tc)
    #print('tc: ', tc, 'idf: ', idf, 'wf-idf: ', wv * idf)
    return wv * idf

def getWeightedTFIDFOfADocument(doc_vocab, doc_count, idf):
   
    wv_tf_idf = {}
    for w in doc_vocab:
        wv_tf_idf[w] = getWeightedTFIDFOfATerm(doc_count[w], idf[w])
    return wv_tf_idf

for file_id,doc in corpus_dict.items():
    vocab = corpus_dict[file_id]['vocab']
    counts = corpus_dict[file_id]['counts']
    doc = corpus_dict[file_id]
    doc['wv_tf_idf'] = getWeightedTFIDFOfADocument(vocab, counts, corpus_idf)

# Build Weighted TF-IDF DataFrame

In [91]:
col_headers = shakespeare.fileids()
tfidf_matrix = pd.DataFrame(columns=col_headers)

for w in corpus_total_vocab:
    wv = []
    for file_id,doc in corpus_dict.items():
        wv.append(doc['wv_tf_idf'].get(w, 0))
    tfidf_matrix.loc[w] = wv

In [92]:
tfidf_matrix.sort_index(inplace=True)

In [93]:
tfidf_matrix

Unnamed: 0,a_and_c.xml,dream.xml,hamlet.xml,j_caesar.xml,macbeth.xml,merchant.xml,othello.xml,r_and_j.xml
abandon,0.00000,0.000000,0.000000,0.00000,0.00000,0.000000,0.125000,0.000000
abat,0.00000,0.500000,0.650515,0.00000,0.00000,0.500000,0.000000,0.500000
abbey,0.00000,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.125000
abhor,0.62500,0.000000,0.625000,0.00000,0.62500,0.000000,0.923201,0.813144
abi,0.00000,0.162629,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
younker,0.00000,0.000000,0.000000,0.00000,0.00000,0.125000,0.000000,0.000000
youth,1.69897,1.845098,2.204120,1.30103,1.30103,1.954243,1.698970,1.954243
zeal,0.00000,0.000000,0.000000,0.00000,0.00000,0.125000,0.000000,0.000000
zone,0.00000,0.000000,0.125000,0.00000,0.00000,0.000000,0.000000,0.000000


# Build Query Vocabulary
    Randomly selecting query words from Corpus Vocabulary
    Out-of-Vocabulary (OOV) is not considered in this test case

In [94]:
max_query_words = 20
print('max_query_words: ', max_query_words)

max_query_words:  20


In [95]:
query_words = random.choices(list(corpus_total_vocab), k=max_query_words)
print('query_words: ', query_words)

query_words:  ['earthquak', 'confessor', 'bewhor', 'gobbo', 'composur', 'tribe', 'patienc', 'seedsman', 'gash', 'pomegran', 'distress', 'avenu', 'afor', 'alexa', 'grimli', 'treacheri', 'laudabl', 'haggard', 'demon', 'reckon']


In [97]:
query_doc = {}
query_doc['words'] = preProcessWords(query_words)
query_doc['vocab'] = getVocabulary(query_doc['words'])
query_doc['counts'] = getWordCounts(query_doc['words'])
query_doc['tf'] = getTermFrequency(query_doc['vocab'], query_doc['counts'], len(query_doc['words']))
query_doc['wv_tf_idf'] = getWeightedTFIDFOfADocument(query_doc['vocab'], query_doc['counts'], corpus_idf)

# for terms that don't exist in query words, assign 0 as Weighted TF-IDF for those terms
for w in corpus_total_vocab:
    count = query_doc['wv_tf_idf'].get(w, 0)
    query_doc['wv_tf_idf'][w] = count

In [98]:
def cosine_distance(a, b):
    return 1 - cosine_similarity(a,b)


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [99]:
query_tf_df_matrix = pd.DataFrame(columns=['Wv-TF-IDF'])

for file_id,doc in corpus_dict.items():
    cos_sim= []
    doc_val = np.array(tfidf_matrix[file_id]).T
    pprint(doc_val.shape)
    query_val = np.array(list(query_doc['wv_tf_idf'].values()))
    query_tf_df_matrix.loc[file_id] = cosine_similarity(doc_val, query_val)

(7554,)
(7554,)
(7554,)
(7554,)
(7554,)
(7554,)
(7554,)
(7554,)


In [100]:
query_tf_df_matrix.sort_values(by='Wv-TF-IDF',ascending=False, inplace=True)

In [101]:
query_tf_df_matrix

Unnamed: 0,Wv-TF-IDF
r_and_j.xml,0.020376
a_and_c.xml,0.017167
othello.xml,0.016528
hamlet.xml,0.015866
macbeth.xml,0.014134
merchant.xml,0.014108
dream.xml,0.008535
j_caesar.xml,0.007037
