In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
import json
from hunspell import Hunspell
import warnings
warnings.filterwarnings('ignore')

# %load_ext autotime

In [2]:
# Read the document metadata and expose the three columns used by later cells
# as plain Python lists.
data = pd.read_csv('metadata/final_1.csv')
filenames, years, titles = (
    data[column].to_list() for column in ('filename', 'years', 'title')
)

In [14]:
# Only needs to run when the dataset has to be rebuilt from the raw text files.
# Holds one ('txt/<filename>', title) tuple per document.
dataset = []

def sorted_alphanumeric(data):
    """Return the items of ``data`` sorted in natural (human) order.

    Runs of ASCII digits are compared as integers and all other text is
    compared case-insensitively, so ``'2.txt'`` sorts before ``'10.txt'``.
    """
    def natural_key(name):
        # The capturing group makes re.split keep the digit runs in its result.
        return [
            int(chunk) if chunk.isdigit() else chunk.lower()
            for chunk in re.split('([0-9]+)', name)
        ]

    return sorted(data, key=natural_key)

# Map every metadata filename to its title once, instead of scanning the
# `filenames` list with .index() for each file on disk. setdefault keeps the
# FIRST row of a duplicated filename, which is the row list.index() returned.
title_by_filename = {}
for _name, _title in zip(filenames, titles):
    title_by_filename.setdefault(_name, _title)

# Rebuild from scratch so that re-running this cell never appends duplicates.
dataset = []
for filename in sorted_alphanumeric(os.listdir('txt')):
    if filename not in title_by_filename:
        # Same exception type that list.index() raised, with a clearer message.
        raise ValueError("%r is present in 'txt/' but has no row in the metadata" % filename)
    dataset.append(('txt/' + filename, title_by_filename[filename]))

# NOTE(review): other cells label document i with filenames[i] (metadata order),
# while `dataset` is in sorted-directory order -- confirm the two orders agree.
N = len(dataset)

In [31]:
# Only runs when need to reload the dataset
def print_doc(id):
    """Print the ``dataset`` entry at index ``id`` followed by its file contents.

    Parameters
    ----------
    id : int
        Position in the module-level ``dataset`` list; the entry's first
        element is used as the path of the file to read.

    The file is read as UTF-8 and stripped of leading/trailing whitespace.
    Nothing is returned.
    """
    entry = dataset[id]
    print(entry)
    # The context manager closes the file even if read() raises.
    with open(entry[0], 'r', encoding='utf8') as file:
        text = file.read().strip()
    print(text)

In [3]:
def convert_lower_case(data):
    """Lower-case ``data`` element-wise with ``np.char.lower``.

    The result is a NumPy array (zero-dimensional when ``data`` is a single
    string), not a Python ``str``.
    """
    lowered = np.char.lower(data)
    return lowered

In [4]:
def remove_stop_words(data):
    """Drop English stop words and one-character tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Tokens found in NLTK's English stop-word list, and tokens of length 1 or
    less, are discarded.

    Returns
    -------
    str
        The kept tokens, each preceded by a single space (so a non-empty
        result starts with a space; an input with no kept tokens gives ``""``).
    """
    # A set gives constant-time membership tests; a list would be scanned
    # from the start for every token.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(str(data)) if w not in stop_words and len(w) > 1]
    # One join instead of repeated string concatenation inside the loop.
    return "".join(" " + w for w in kept)

In [5]:
def remove_punctuation(data):
    """Replace punctuation and newlines in ``data`` with spaces and delete commas.

    Each symbol is replaced by a space; after each symbol one pass collapses
    double spaces into single ones. Commas are removed outright (not replaced
    by a space) so that grouped numbers such as ``1,000`` stay one token.

    Returns the result of ``np.char.replace`` (a NumPy array, zero-dimensional
    for a single input string).
    """
    # The backslash is written as "\\": a bare "\]" is an invalid escape
    # sequence that Python only tolerates with a warning. The resulting
    # character set is unchanged (it still contains both "\" and "]").
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [6]:
def remove_apostrophe(data):
    """Delete every apostrophe (``'``) from ``data`` using ``np.char.replace``."""
    apostrophe, nothing = "'", ""
    return np.char.replace(data, apostrophe, nothing)

In [7]:
def stemming(data):
    """Stem every token of ``data`` with the English Snowball stemmer.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.
    Returns a string in which each stemmed token is preceded by one space.
    """
    snowball = SnowballStemmer(language='english')
    stems = (snowball.stem(token) for token in word_tokenize(str(data)))
    return "".join(" " + stem for stem in stems)

In [8]:
def stemming2(data):
    """Replace every token of ``data`` with its first Hunspell stem, if any.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.
    For each token the ``en_US`` Hunspell dictionary is asked for stems; the
    first stem is used when at least one is returned, otherwise the token is
    kept unchanged.

    Returns a string in which each resulting token is preceded by one space.
    """
    # NOTE(review): a new Hunspell('en_US') object is built on every call.
    hobj = Hunspell('en_US')
    pieces = []
    for w in word_tokenize(str(data)):
        # Ask Hunspell once per token and reuse the answer.
        stems = hobj.stem(w)
        pieces.append(stems[0] if len(stems) > 0 else w)
    return "".join(" " + piece for piece in pieces)

In [10]:
def convert_numbers(data):
    """Spell out integer tokens of ``data`` as English words.

    ``data`` is converted with ``str()`` and tokenized with ``word_tokenize``.
    A token that ``int()`` accepts is replaced by ``num2words(int(token))``;
    every other token is kept as is. Hyphens in the rebuilt text are then
    replaced by spaces.

    Returns the result of ``np.char.replace`` (a NumPy array wrapping the
    text), in which each token is preceded by one space.
    """
    pieces = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # Not an integer (ValueError from int()) or too large to spell
            # out: keep the token unchanged. Catching only these no longer
            # hides KeyboardInterrupt or genuine programming errors.
            pass
        pieces.append(w)
    new_text = "".join(" " + piece for piece in pieces)
    return np.char.replace(new_text, "-", " ")

In [11]:
def preprocess(data):
    """Run the full text-normalisation pipeline over ``data``.

    The steps are applied strictly in the order listed below; several are
    repeated on purpose because an earlier step can reintroduce what they
    remove. The value returned is whatever the final ``remove_stop_words``
    step returns.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming2,
        remove_punctuation,
        convert_numbers,
        stemming2,            # again: the spelled-out numbers need stemming too
        remove_punctuation,   # again: num2words emits hyphens and commas ("forty-one")
        remove_stop_words,    # again: num2words emits stop words (101 -> "one hundred and one")
    )
    for step in pipeline:
        data = step(data)
    return data

In [32]:
# Only runs when need to reload the dataset
# Token lists for every document, index-aligned with `dataset`.
processed_text = []
processed_title = []

for path, title in dataset[:N]:
    # Undecodable bytes are dropped (errors='ignore'); the context manager
    # closes the file even if reading fails.
    with open(path, 'r', encoding="utf8", errors='ignore') as file:
        text = file.read().strip()

    processed_text.append(word_tokenize(str(preprocess(text))))
    processed_title.append(word_tokenize(str(preprocess(title))))

In [19]:
# Only runs when need to reload the dataset
# Cache both token collections on disk so preprocessing need not be repeated.
for _cache_path, _payload in (
    ('extracted_data/processed_text.data', processed_text),
    ('extracted_data/processed_title.data', processed_title),
):
    with open(_cache_path, 'wb') as f:
        pickle.dump(_payload, f)

In [12]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can execute arbitrary code: only use it on files written by
    # this notebook, never on data from an untrusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


processed_text = _unpickle('extracted_data/processed_text.data')
processed_title = _unpickle('extracted_data/processed_title.data')

In [15]:
# Only runs when need to reload the dataset
# Pass 1: for every token, collect the set of document indices whose body or
# title contains it. dict.setdefault replaces the bare try/except that was
# used for "first time seen" handling and could mask unrelated errors.
doc_ids_by_token = {}
for i in range(N):
    for tokens in (processed_text[i], processed_title[i]):
        for w in tokens:
            doc_ids_by_token.setdefault(w, set()).add(i)

# Pass 2: document frequency = number of distinct documents per token.
# Built as a separate dict so DF never holds a mix of sets and ints; key
# order (first appearance) is unchanged.
DF = {w: len(doc_ids) for w, doc_ids in doc_ids_by_token.items()}

In [16]:
# Only runs when need to reload the dataset
# Vocabulary in DF's key order; a token's position here is its column index
# in the vectorised matrix.
total_vocab = list(DF)
total_vocab_size = len(total_vocab)
def get_doc_freq(word):
    """Return the document frequency of ``word`` from ``DF``, or 0 if absent."""
    # dict.get expresses "missing -> 0" directly; a bare `except:` would also
    # swallow unrelated errors such as DF not being defined yet.
    return DF.get(word, 0)

In [23]:
# Only runs when need to reload the dataset
doc = 0

TF_IDF, idf_all = {}, {}
files_list = []
total_count_all = 0

for i in range(N):
    body_tokens = processed_text[i]
    # Term statistics are taken over body + title together: `counter` holds
    # the per-word counts, `words_count` the total number of words.
    combined = body_tokens + processed_title[i]
    counter = Counter(combined)
    words_count = len(combined)
    tfs, counters = {}, {}

    # Only words that occur in the body get a score in this cell.
    for token in np.unique(body_tokens):
        occurrences = counter[token]
        tfs[token] = occurrences / words_count
        counters[token] = occurrences
        # Smoothed IDF; the +1 in the numerator is there to avoid negative values.
        idf_all[token] = np.log((N + 1) / (get_doc_freq(token) + 1))
        TF_IDF[doc, token] = tfs[token] * idf_all[token]

    total_count = sum(counters.values())
    files_list.append({
        'name': filenames[i],
        "words_counting": counters,
        "total_count": total_count,
        "TF_scores": tfs,
    })

    total_count_all += total_count
    doc += 1

all_list = {
    'name': 'ALL_Documents',
    'total_count': total_count_all,
    'DF': DF,
    'IDF_scores': idf_all,
}

1447
2908


In [36]:
# Only runs when need to reload the dataset
doc = 0

TF_IDF_title = {}
files_list_title = []
# NOTE(review): these two accumulators are initialised but never updated in this cell.
total_count_all_title = 0
idf_all_title = {}

for i in range(N):
    title_tokens = processed_title[i]
    # Same term statistics as for the body: counts over title + body together.
    combined = title_tokens + processed_text[i]
    counter = Counter(combined)
    words_count = len(combined)
    tfs, counters = {}, {}

    # Only words that occur in the title get a score in this cell.
    for token in np.unique(title_tokens):
        occurrences = counter[token]
        tfs[token] = occurrences / words_count
        counters[token] = occurrences
        idf = np.log((N + 1) / (get_doc_freq(token) + 1))
        TF_IDF_title[doc, token] = tfs[token] * idf

    total_count = sum(counters.values())
    files_list_title.append({
        'name': filenames[i],
        "words_counting": counters,
        "total_count": total_count,
        "TF_scores": tfs,
    })
    doc += 1


In [111]:
# Only runs when need to reload the dataset
# Write the corpus-level and per-document statistics as indented JSON.
for _json_path, _payload in (
    ('extracted_data/all_docs_info.json', all_list),
    ('extracted_data/all_text_info.json', files_list),
    ('extracted_data/all_title_info.json', files_list_title),
):
    with open(_json_path, 'w') as f:
        f.write(json.dumps(_payload, indent=4))

In [37]:
# Only runs when need to reload the dataset
# Keep untouched copies of both score tables so the weighting step can be
# re-run from clean inputs.
tf_idf_tmp, tf_idf_title_tmp = (
    copy.deepcopy(scores) for scores in (TF_IDF, TF_IDF_title)
)
# tf_idf_tmp

In [38]:
# Only runs when need to reload the dataset
# Blend the two score tables: body scores weigh `alpha`, title scores `1 - alpha`.
alpha = 0.4
TF_IDF_title = copy.deepcopy(tf_idf_title_tmp)
TF_IDF = {key: body_score * alpha for key, body_score in tf_idf_tmp.items()}

for key, title_score in TF_IDF_title.items():
    # NOTE(review): a (doc, token) pair that exists only in the title table is
    # skipped, so title-only terms get no score at all -- confirm this is intended.
    if key in TF_IDF:
        TF_IDF[key] += title_score * (1 - alpha)

TF_IDF_tmp = copy.deepcopy(TF_IDF)


In [39]:
# Only runs when need to reload the dataset
# Column of every vocabulary token, computed once. Looking each token up with
# total_vocab.index() costs a full list scan per score; the dict is O(1).
# setdefault keeps the first position, which is what list.index() returns.
vocab_index = {}
for _column, _vocab_token in enumerate(total_vocab):
    vocab_index.setdefault(_vocab_token, _column)

# Dense document-term matrix: one row per document, one column per vocabulary token.
vectorized_array = np.zeros((N, total_vocab_size))
for (doc_index, token), score in TF_IDF_tmp.items():
    column = vocab_index.get(token)
    if column is None:
        # Token outside the vocabulary: leave the matrix untouched.
        continue
    # Any other failure (e.g. a document index outside the matrix) now
    # surfaces instead of being silently swallowed by a bare `except: pass`.
    vectorized_array[doc_index, column] = score

In [11]:
# Reload the document-frequency table from the corpus-level JSON file.
with open('extracted_data/all_docs_info.json', 'r') as docs_info_file:
    _all_docs_info = json.load(docs_info_file)
DF_tmp = _all_docs_info.get('DF')
    
def cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    If either vector has zero length (norm 0) the similarity is defined as
    ``0.0``. Dividing by zero there would yield NaN, and NaN values are
    placed last by ``argsort`` -- i.e. they would be ranked as the best
    matches by a caller that takes the tail of the sorted order.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def get_doc_freg_for_vector(token):
    """Return the document frequency of ``token`` from ``DF_tmp``, or 0 if absent."""
    # dict.get states the "missing -> 0" rule without a bare `except:` that
    # would also hide unrelated errors.
    return DF_tmp.get(token, 0)

def gen_vector(tokens):
    """Build the TF-IDF vector of a query over the ``DF_tmp`` vocabulary.

    Parameters
    ----------
    tokens : list of str
        Pre-processed query tokens.

    Returns
    -------
    numpy.ndarray
        One entry per key of ``DF_tmp`` (in key order). The entry of a query
        token is ``tf * idf`` with ``tf = count / len(tokens)`` and
        ``idf = log((len(processed_text) + 1) / (df + 1))``; tokens that are
        not in the vocabulary are ignored and every other entry is 0.
    """
    # Position of each vocabulary term, built once per call. Searching a list
    # with .index() for every query token is a full scan each time.
    vocab_index = {term: position for position, term in enumerate(DF_tmp)}
    Q = np.zeros(len(vocab_index))

    words_count = len(tokens)
    n_docs = len(processed_text)

    for token, occurrences in Counter(tokens).items():
        position = vocab_index.get(token)
        if position is None:
            # Out-of-vocabulary query term: it has no column to fill.
            continue
        tf = occurrences / words_count
        idf = math.log((n_docs + 1) / (DF_tmp[token] + 1))
        Q[position] = tf * idf
    return Q

In [40]:
# Only runs when need to reload the dataset
# Group the (doc_index, token) -> score pairs by document.
scores_by_doc = {}
for (doc_index, token), score in TF_IDF.items():
    scores_by_doc.setdefault(doc_index, {})[token] = score

# Emit one entry per document index, in order, up to the last scored document.
# Flushing an entry only when the document id changes would (a) never append
# the last document and (b) skip documents without any scored token, shifting
# every later entry. List positions are used as document indices when the
# scores are read back (filenames[idx]), so each position k must describe
# document k.
tfidf_list = []
if scores_by_doc:
    for doc_index in range(max(scores_by_doc) + 1):
        tfidf_list.append({
            "name": filenames[doc_index],
            "scores": scores_by_doc.get(doc_index, {}),
        })


In [41]:
# Only runs when need to reload the dataset
# Persist the per-document scores (JSON) and the dense matrix (NumPy binary).
with open('extracted_data/tf_idf_scores.json', 'w') as scores_file:
    serialized_scores = json.dumps(tfidf_list, indent=4)
    scores_file.write(serialized_scores)
with open('extracted_data/tf_idf_vectorized.npy', 'wb') as matrix_file:
    np.save(matrix_file, vectorized_array)

In [13]:
# Reload the saved artefacts. Note that TF_IDF is rebound here to whatever the
# JSON file contains (the saved list of {"name", "scores"} entries), not the
# (doc, token)-keyed dict used while computing the scores.
with open("extracted_data/tf_idf_scores.json", 'r', encoding="utf8") as scores_file:
    raw_scores = scores_file.read()
TF_IDF = json.loads(raw_scores)
vectorized_array = np.load('extracted_data/tf_idf_vectorized.npy')
# TF_IDF

In [14]:
def matching_score(k, query):
    """Print the ``k`` documents with the highest summed TF-IDF for ``query``.

    The query is run through ``preprocess`` and tokenized. For every entry of
    ``TF_IDF`` the scores of the terms that also occur in the query are added
    up; entries without any matching term are left out. The file names of the
    ``k`` best entries are printed as a list, best first.

    Parameters
    ----------
    k : int
        Number of results to print.
    query : str
        Free-text query.

    Nothing is returned; all output goes to stdout.
    """
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)

    # Set membership is O(1); the token list is kept only for display.
    query_terms = set(tokens)
    query_weights = {}

    for idx, file in enumerate(TF_IDF):
        scores = file.get('scores')
        # Walk the document's terms in their stored order so the additions
        # happen in a stable order.
        for key, value in scores.items():
            if key in query_terms:
                # dict.get replaces a bare try/except used for the first hit.
                query_weights[idx] = query_weights.get(idx, 0) + value

    ranked = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")

    l = [filenames[doc_index] for doc_index, _ in ranked[:k]]

    print(l)
    

# Demo: print the 10 best matches for a sample query.
matching_score(10, "Ho Chi Minh the Greatest General")

Matching Score

Query: Ho Chi Minh the Greatest General

['ho', 'chi', 'minh', 'great', 'general']

['42.txt', '8.txt', '73.txt', '87.txt', '26.txt', '27.txt', '105.txt', '44.txt', '90.txt', '35.txt']


In [16]:
def cosine_similarity(k, query):
    """Print the ``k`` documents most similar to ``query`` by cosine similarity.

    The query is run through ``preprocess``, tokenized and turned into a
    vector with ``gen_vector``; that vector is compared with every row of
    ``vectorized_array`` via ``cosine_sim``. File names are printed on one
    line, best match first.

    Parameters
    ----------
    k : int
        Number of results to print; ``0`` prints none.
    query : str
        Free-text query.

    Nothing is returned; all output goes to stdout.
    """
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("\nQuery:", query)
    print("")
    print(tokens)

    query_vector = gen_vector(tokens)
    d_cosines = [cosine_sim(query_vector, d) for d in vectorized_array]

    # A zero-length document or query vector makes the similarity NaN, and
    # argsort places NaN last -- which is where the best matches are read
    # from. Treat "undefined" as "no similarity" instead.
    similarities = np.nan_to_num(np.array(d_cosines, dtype=float))

    # Reverse first, then cut: same order as taking the last k and reversing,
    # but k == 0 now gives an empty result ([-0:] would select everything).
    out = similarities.argsort()[::-1][:k]

    print("")

    for idx in out:
        print(filenames[idx], end=" ")

#     for i in out:
#         print(i, dataset[i][0])

# Demo: print the 10 best matches for a sample query. cosine_similarity has no
# return statement, so Q is bound to None here.
Q = cosine_similarity(10, "Ho Chi Minh the Greatest General")

Cosine Similarity

Query: Ho Chi Minh the Greatest General

['ho', 'chi', 'minh', 'great', 'general']

42.txt 8.txt 35.txt 87.txt 26.txt 90.txt 73.txt 27.txt 105.txt 81.txt 