In [1]:
# Mount Google Drive at /content/drive so files stored there can be read from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/NLP/Project

/content/drive/MyDrive/NLP/Project


### Install Required Packages

In [3]:
!pip install hazm
!pip install tokenizers

Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 5.2 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 55.5 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 62.3 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394488 sha256=bb05f8fd46fe4fe0f3e7df0475501c18809233e4eab95ec2377c06dfead7b3a3
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86_64.whl size=154784 sha256=04555dc81726f72941125f79e7e6f708d0a3f7fd24896185042016ec2acb4342
  Stored

### Import Required Packages

In [56]:
import glob
from hazm import *
import codecs
import tqdm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import yaml
import glob
import linecache
import matplotlib.pyplot as plt
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from gensim.models import FastText
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Module-level text-processing helpers (class names presumably come from `from hazm import *`).
# NOTE(review): none of the three objects is referenced in the visible cells of this
# notebook - confirm they are still needed before keeping them.
normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()
import pandas as pd

### Read Data

In [28]:
# Build the corpus: every entry of `poems` is two consecutive non-blank lines of a
# poet file joined by a single space. A trailing unpaired line in a file is dropped.
# NOTE(review): the position of an entry in `poems` is used as its document index
# later on, so the file order returned by glob matters - confirm it matches the
# order used when the models were trained.
poets = glob.glob('./*.txt')
poems = []

for poem_file in poets:
    with open(poem_file, encoding='utf-8', mode='r') as fp:
        # Blank (whitespace-only) lines are ignored and do not take part in pairing.
        stripped_lines = [raw.strip() for raw in fp if raw.strip() != '']
    # Pair lines (0, 1), (2, 3), ...; zip stops before an unpaired last line.
    for first, second in zip(stripped_lines[0::2], stripped_lines[1::2]):
        poems.append(first + ' ' + second)

In [72]:
def _load_query_set(query_dir, poem_index_by_text, n_queries=50):
    """Load one evaluation set (query texts plus their relevant corpus indexes).

    For every query number ``i`` in ``1..n_queries`` the file
    ``<query_dir>retrieved/<i>.txt`` is read; each non-blank line that exactly
    matches a corpus entry contributes that entry's index. Queries without any
    matching line are skipped, so both returned lists stay aligned.

    Args:
        query_dir: Directory of the set, ending with a slash
            (e.g. ``'./Evaluation/queries1/'``).
        poem_index_by_text: Mapping from corpus text to its index in ``poems``.
        n_queries: Number of numbered query files to read.

    Returns:
        ``(queries, indexes)``: the stripped query strings and, for each of
        them, the list of relevant corpus indexes.

    Raises:
        FileNotFoundError: If one of the numbered ``retrieved`` files is missing.
    """
    loaded_queries = []
    loaded_indexes = []
    for i in range(1, n_queries + 1):
        indexes = []
        with open(query_dir + 'retrieved/' + str(i) + '.txt', encoding='utf-8', mode='r') as fp:
            for line in fp:
                verse = line.strip()
                # Lines that are blank or not part of the corpus are skipped on
                # purpose (best effort), instead of swallowing every exception.
                if verse and verse in poem_index_by_text:
                    indexes.append(poem_index_by_text[verse])
        if indexes:
            loaded_indexes.append(indexes)
            # Line i of queries.txt is the text of query i; strip the trailing
            # newline for both sets so the query strings are consistent.
            loaded_queries.append(linecache.getline(query_dir + 'queries.txt', i).strip())
    return loaded_queries, loaded_indexes


# Text -> first index in `poems` (same result as `poems.index`, without a linear
# scan of the corpus for every retrieved line).
poem_index_by_text = {}
for position, poem in enumerate(poems):
    poem_index_by_text.setdefault(poem, position)

queries = []
query_indexes = []

queries1_matches = glob.glob('./Evaluation/queries1/retrieved/*.txt')
queries2_matches = glob.glob('./Evaluation/queries2/retrieved/*.txt')

# Both evaluation sets are appended to the same lists, set 1 first.
for query_dir in ('./Evaluation/queries1/', './Evaluation/queries2/'):
    set_queries, set_indexes = _load_query_set(query_dir, poem_index_by_text)
    queries.extend(set_queries)
    query_indexes.extend(set_indexes)

In [123]:
def _rank_poems(model, query):
    """Return corpus indexes ordered from most to least similar to ``query``.

    The query is tokenized with ``word_tokenize``, embedded with
    ``model.infer_vector`` and compared against all ``len(poems)`` document
    vectors of the model.
    """
    tokens = word_tokenize(query)
    new_vector = model.infer_vector(doc_words=tokens)
    similarity = model.docvecs.most_similar([new_vector], topn=len(poems))
    # Each entry's first item is the document tag; cast it to int so it can be
    # compared with the integer indexes stored in `query_indexes`.
    return [int(sim[0]) for sim in similarity]


def _mean_reciprocal_rank(ranked_indexes, relevant_indexes):
    """Average 1/rank over all relevant documents of one query.

    A relevant index that does not occur in ``ranked_indexes`` contributes 0
    instead of raising ``ValueError``.
    """
    # 1-based rank of the first occurrence of every index (one pass instead of
    # a linear `list.index` search per relevant document).
    rank_of = {}
    for rank, doc_index in enumerate(ranked_indexes, start=1):
        rank_of.setdefault(doc_index, rank)

    total = 0.0
    for doc_index in relevant_indexes:
        rank = rank_of.get(doc_index)
        if rank is not None:
            total += 1 / rank
    return total / len(relevant_indexes)


def MMR(model_path):
    """Compute the reciprocal-rank score of every query for one Doc2Vec model.

    The name is kept for compatibility; the value computed per query is the
    mean of 1/rank over all of its relevant documents.

    Args:
        model_path: Path of a saved gensim ``Doc2Vec`` model.

    Returns:
        A list with one score per entry of the global ``queries``.
    """
    # Load once: the model is the same for every query, so reloading it from
    # disk inside the loop is redundant work.
    # NOTE(review): inference may consume the model's random state, so single
    # scores can differ marginally from a per-query reload - confirm if exact
    # reproducibility of earlier numbers matters.
    model = Doc2Vec.load(model_path)
    return [
        _mean_reciprocal_rank(_rank_poems(model, query), query_indexes[ind])
        for ind, query in enumerate(tqdm.tqdm(queries))
    ]


def evaluate_model(model_path, precisions_at_k_values):
    """Evaluate one Doc2Vec model on the global ``queries`` / ``query_indexes``.

    Args:
        model_path: Path of a saved gensim ``Doc2Vec`` model.
        precisions_at_k_values: Cut-off values ``k`` (positive integers).

    Returns:
        ``(mrr_values, precision_at_k)``: a list with one reciprocal-rank score
        per query, and a numpy array with one value per ``k``, averaged over
        all queries (all zeros when there are no queries). For a single query
        the value at ``k`` is ``hits_in_top_k / min(k, number_of_relevant)``,
        i.e. the hit count is normalized by the best achievable count.

    Raises:
        ZeroDivisionError: If a cut-off value is 0.
    """
    # Load once: the model is the same for every query, so reloading it from
    # disk inside the loop is redundant work.
    # NOTE(review): inference may consume the model's random state, so single
    # scores can differ marginally from a per-query reload - confirm if exact
    # reproducibility of earlier numbers matters.
    model = Doc2Vec.load(model_path)

    mrr_values = []
    precision_at_k = np.zeros(len(precisions_at_k_values))
    for ind, query in enumerate(tqdm.tqdm(queries)):
        ranked_indexes = _rank_poems(model, query)
        relevant_indexes = query_indexes[ind]
        mrr_values.append(_mean_reciprocal_rank(ranked_indexes, relevant_indexes))

        ground_truth = set(relevant_indexes)
        for ind_pak, precisions_at_k_value in enumerate(precisions_at_k_values):
            preds = set(ranked_indexes[:precisions_at_k_value])
            hits = len(ground_truth & preds)
            precision_at_k[ind_pak] += hits / min(precisions_at_k_value, len(ground_truth))

    # Average over queries; skipped for an empty query list to avoid 0/0.
    if queries:
        precision_at_k /= len(queries)
    return mrr_values, precision_at_k

In [132]:
# Score the PV-DBOW model with vector size 300: per-query MRR values plus the
# averaged precision at each listed cut-off.
dbow_300_mrr, dbow_300_pak = evaluate_model(
    model_path='./Models/PV_DBOW_vec300',
    precisions_at_k_values=[20, 40, 60, 80, 100],
)

100%|██████████| 99/99 [11:34<00:00,  7.01s/it]


In [133]:
# Report the mean MRR and the precision at each cut-off for the DBOW-300 model.
# The positions in dbow_300_pak follow the cut-off list passed to evaluate_model.
print(
    'DBOW with vector size 300 results:',
    'MRR: {}'.format(np.mean(dbow_300_mrr)),
    'precision at k = 20: {}'.format(dbow_300_pak[0]),
    'precision at k = 40: {}'.format(dbow_300_pak[1]),
    'precision at k = 60: {}'.format(dbow_300_pak[2]),
    'precision at k = 80: {}'.format(dbow_300_pak[3]),
    'precision at k = 100: {}'.format(dbow_300_pak[4]),
    sep='\n',
)

DBOW with vector size 300 results:
MRR: 0.052692182549789295
precision at k = 20: 0.21364975380361428
precision at k = 40: 0.27462429912750225
precision at k = 60: 0.3035470857379756
precision at k = 80: 0.3165068205315578
precision at k = 100: 0.33488324744464126


In [134]:
# Score the PV-DBOW model with vector size 100: per-query MRR values plus the
# averaged precision at each listed cut-off.
dbow_100_mrr, dbow_100_pak = evaluate_model(
    model_path='./Models/PV_DBOW_vec100',
    precisions_at_k_values=[20, 40, 60, 80, 100],
)

100%|██████████| 99/99 [09:09<00:00,  5.55s/it]


In [135]:
# Report the mean MRR and the precision at each cut-off for the DBOW-100 model.
# The positions in dbow_100_pak follow the cut-off list passed to evaluate_model.
print(
    'DBOW with vector size 100 results:',
    'MRR: {}'.format(np.mean(dbow_100_mrr)),
    'precision at k = 20: {}'.format(dbow_100_pak[0]),
    'precision at k = 40: {}'.format(dbow_100_pak[1]),
    'precision at k = 60: {}'.format(dbow_100_pak[2]),
    'precision at k = 80: {}'.format(dbow_100_pak[3]),
    'precision at k = 100: {}'.format(dbow_100_pak[4]),
    sep='\n',
)

DBOW with vector size 100 results:
MRR: 0.026599293137089478
precision at k = 20: 0.13029803053374678
precision at k = 40: 0.1600316454576327
precision at k = 60: 0.17870817278157067
precision at k = 80: 0.19168114112736995
precision at k = 100: 0.2103352911278365


In [136]:
# Score the PV-DM model with vector size 300: per-query MRR values plus the
# averaged precision at each listed cut-off.
dm_300_mrr, dm_300_pak = evaluate_model(
    model_path='./Models/PV_DM_vec300',
    precisions_at_k_values=[20, 40, 60, 80, 100],
)

100%|██████████| 99/99 [11:47<00:00,  7.15s/it]


In [141]:
# Report the mean MRR and the precision at each cut-off for the DM-300 model.
# The positions in dm_300_pak follow the cut-off list passed to evaluate_model.
print(
    'DM with vector size 300 results:',
    'MRR: {}'.format(np.mean(dm_300_mrr)),
    'precision at k = 20: {}'.format(dm_300_pak[0]),
    'precision at k = 40: {}'.format(dm_300_pak[1]),
    'precision at k = 60: {}'.format(dm_300_pak[2]),
    'precision at k = 80: {}'.format(dm_300_pak[3]),
    'precision at k = 100: {}'.format(dm_300_pak[4]),
    sep='\n',
)

DM with vector size 300 results:
MRR: 0.018768259720678276
precision at k = 20: 0.04359157662586688
precision at k = 40: 0.05194164071797719
precision at k = 60: 0.05836116298139203
precision at k = 80: 0.06448413370253436
precision at k = 100: 0.06722188208559116


In [138]:
# Score the PV-DM model with vector size 100: per-query MRR values plus the
# averaged precision at each listed cut-off.
dm_100_mrr, dm_100_pak = evaluate_model(
    model_path='./Models/PV_DM_vec100',
    precisions_at_k_values=[20, 40, 60, 80, 100],
)

100%|██████████| 99/99 [09:20<00:00,  5.67s/it]


In [142]:
# Report the mean MRR and the precision at each cut-off for the DM-100 model.
# The positions in dm_100_pak follow the cut-off list passed to evaluate_model.
print(
    'DM with vector size 100 results:',
    'MRR: {}'.format(np.mean(dm_100_mrr)),
    'precision at k = 20: {}'.format(dm_100_pak[0]),
    'precision at k = 40: {}'.format(dm_100_pak[1]),
    'precision at k = 60: {}'.format(dm_100_pak[2]),
    'precision at k = 80: {}'.format(dm_100_pak[3]),
    'precision at k = 100: {}'.format(dm_100_pak[4]),
    sep='\n',
)

DM with vector size 100 results:
MRR: 0.0002619389159030986
precision at k = 20: 0.000505050505050505
precision at k = 40: 0.0002525252525252525
precision at k = 60: 0.0002463661000246366
precision at k = 80: 0.0002463661000246366
precision at k = 100: 0.0002463661000246366
