In [1]:
from hazm import *
import openpyxl
from pathlib import Path
import collections
import math
import itertools


In [2]:

xlsx_file = Path('../IR_Spring2021_ph12_7k.xlsx')
wb_obj = openpyxl.load_workbook(xlsx_file)
sheet = wb_obj.active
N_documents = 0
normalizer = Normalizer()
stemmer = Stemmer()
tokens = []
doc_vectors = {}

In [3]:
punctuations = [':', '،', '.', ')', '(', '}', '{', '؟', '!', '-', '/', '؛', '#', '*', '\n', '\"',
                ']', '[', '«', '»', '٪', '+', '٠', '\\', '\"', '_', '\'']



In [5]:
%%time
for row in sheet.iter_rows():
    each_doc_vector = {}
    line = row[1].value
    if row[0].value == 'id':
        continue
    N_documents += 1
    line = normalizer.normalize(line)
    for p in punctuations:
        if p in line:
            line = line.replace(p, " ")
    extracted_tokens = word_tokenize(line)
    stem_tokens = []
    for t in extracted_tokens:
        stem_tokens.append(stemmer.stem(t))

    term_doc = []
    for w in stem_tokens:
        term_doc.append({'term': w, 'doc_id': int(row[0].value)})
    tokens.extend(term_doc)
    
    for t in stem_tokens:
        each_doc_vector[t] = each_doc_vector.get(t, 0) + 1
    doc_vectors[int(row[0].value)] = collections.OrderedDict(sorted(each_doc_vector.items()))


Wall time: 11 s


In [6]:
%%time
dictionary = {}
inverted_index = {}
idf = {}
doc_term_frq = []
for t in tokens:
    word = t['term']
    doc_id = t['doc_id']
    
    frq = dictionary.get(word, 0)
    dictionary[word] = frq + 1

    doc_list = inverted_index.get(word, [])
    doc_list.append(doc_id)
    inverted_index[word] = doc_list
    
    

Wall time: 2.96 s


In [7]:
%%time
# remove stop words
frq_sorted_dic = {k: v for k, v in sorted(dictionary.items(), key=lambda item: item[1])}
stop_words = dict(itertools.islice(reversed(frq_sorted_dic.items()), 20))
for s in stop_words:
    dictionary.pop(s)
    inverted_index.pop(s)

Wall time: 53.1 ms


In [8]:
%%time

inver_inex_dup = inverted_index.copy()
for i in inverted_index:
    inverted_index[i] = list(dict.fromkeys(inverted_index[i]))
inverted_index = collections.OrderedDict(sorted(inverted_index.items()))
dictionary = collections.OrderedDict(sorted(dictionary.items()))

print(inverted_index.keys())

odict_keys(['', '&', '&NID=', '&pageid=', ',', ',everywhere', ';', ';A', '<http', '<https', '=', '>', '?', '@HAMAYESHSALAMATVAZENDEGI', '@Motamed', '@gmail', '@isna', '@motahari', '@student', '@unesco', '@unescocps', 'A', 'A,C,D,E', 'AA', 'AB', 'AC', 'ACA', 'ACE', 'ACE۲', 'ACL', 'AC۲', 'AD', 'ADSL', 'AE', 'AF', 'AFC', 'AFM', 'AHF', 'AI', 'AIDS', 'AIIB', 'AMD', 'AMH', 'AMP', 'ANDEROONشد', 'ANOC', 'APKCombo', 'APKpure', 'APM', 'APP', 'APT', 'ARJ', 'ASFR', 'ASMRاس', 'ASR', 'ASTAR', 'ATM', 'ATR', 'About', 'Academic', 'Academy', 'Added', 'Advanced', 'Advances', 'AfC', 'Age', 'Agent', 'Aix', 'Analysis', 'Annals', 'Antiblock', 'App', 'Arabic', 'Archive', 'Associated', 'AstraZeneca', 'Attenuated', 'Awakend', 'Awakened', 'A۱', 'A۱C', 'A۲', 'A۳', 'A۶', 'A۷', 'A۸', 'A۹', 'B', 'BA', 'BAK', 'BBIBP', 'BE', 'BI', 'BICU', 'BMI', 'BMJ', 'BMT', 'BMW', 'BNT', 'BTS', 'Based', 'Baseline', 'Batch', 'Belt', 'BioNTech', 'Bit', 'Bjelovar', 'Blower', 'Bortezomib', 'Braskem', 'Break', 'Brown', 'Business', 'B۰', 

In [9]:
%%time

for term in inverted_index:
    df = len(inverted_index[term])
    idf[term] = math.log(N_documents / df, 10)
    print(idf[term])

1.0548128759810151
3.8450980400142565
3.8450980400142565
4.146128035678237
2.867374434725409
3.8450980400142565
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
3.8450980400142565
3.447158031342219
2.970036776622557
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
3.6690067809585756
2.47403017774252
4.146128035678237
2.2485009443877964
2.9156791142999636
2.455931955649724
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
2.5663444390614276
3.301029995663981
2.455931955649724
2.1823402083326826
2.970036776622557
3.8450980400142565
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.1461280356782

4.146128035678237
3.301029995663981
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
3.6690067809585756
4.146128035678237
4.146128035678237
3.5440680443502752
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
1.4276263468109638
2.73115468770742
2.8037053548560316
4.146128035678237
4.146128035678237
2.201645363528069
3.243038048686294
3.3679767852945943
3.6690067809585756
2.9420080530223127
3.066946789630613
1.2562663144200497
3.8450980400142565
3.6690067809585756
3.8450980400142565
4.14612803567823

4.146128035678237
3.5440680443502752
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
3.447158031342219
2.8037053548560316
3.301029995663981
3.6690067809585756
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
2.0633426653617875
2.6690067809585756
2.8450980400142565
3.6690067809585756
2.447158031342219
4.146128035678237
3.8450980400142565
3.243038048686294
2.9999999999999996
4.146128035678237
3.6690067809585756
3.301029995663981
2.823908740944318
3.191885526238913
3.5440680443502752
3.8450980400142565
4.146128035678237
4.146128035678237
1.6675615400843944
3.104735350520013
3.5440680443502752
3.6690067809585756
2.8450980400142565
3.447158031342219
1.8813102126687014
1.7825160557860937
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.6690067809585756
2.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
2.6837300377792817
4.146

2.5026753591920503
3.8450980400142565
3.8450980400142565
4.146128035678237
3.8450980400142565
4.146128035678237
3.447158031342219
4.146128035678237
2.8908555305749317
3.8450980400142565
3.447158031342219
3.191885526238913
3.8450980400142565
4.146128035678237
3.6690067809585756
4.146128035678237
3.8450980400142565
2.8450980400142565
4.146128035678237
3.6690067809585756
4.146128035678237
4.146128035678237
2.066946789630613
2.970036776622557
3.447158031342219
2.483370203996664
4.146128035678237
4.146128035678237
1.6146491186359828
3.1461280356782377
3.3679767852945943
3.6690067809585756
2.867374434725409
3.8450980400142565
2.7147642715192504
2.3136191229720016
2.47403017774252
2.9420080530223127
3.3679767852945943
4.146128035678237
3.5440680443502752
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
3.5440680443502752
4.146128035678237
4.146128035678237
4.146128035678237
2.066946789630613
1.6928096956312002
4.146128035678237
3.367

3.5440680443502752
3.1461280356782377
3.8450980400142565
4.146128035678237
3.8450980400142565
1.0597682050034898
4.146128035678237
4.146128035678237
3.447158031342219
2.602059991327962
4.146128035678237
3.447158031342219
3.447158031342219
2.9156791142999636
2.7481880270062002
3.032184683371401
4.146128035678237
4.146128035678237
1.8197921747494865
3.6690067809585756
3.3679767852945943
3.066946789630613
2.970036776622557
3.8450980400142565
3.301029995663981
4.146128035678237
2.73115468770742
2.1549019599857426
2.7481880270062002
4.146128035678237
2.9420080530223127
2.4648867983026506
1.913131925286084
2.2485009443877964
2.382700042115301
2.3072789449409825
2.970036776622557
4.146128035678237
2.9156791142999636
3.8450980400142565
2.7481880270062002
3.8450980400142565
2.276896315947262
2.73115468770742
2.455931955649724
4.146128035678237
4.146128035678237
4.146128035678237
2.654766341843965
3.6690067809585756
3.6690067809585756
3.5440680443502752
2.9999999999999996
4.146128035678237
4.146

3.301029995663981
3.6690067809585756
3.447158031342219
3.6690067809585756
4.146128035678237
2.7147642715192504
4.146128035678237
3.8450980400142565
4.146128035678237
3.8450980400142565
4.146128035678237
3.8450980400142565
3.5440680443502752
4.146128035678237
4.146128035678237
2.0
3.6690067809585756
3.1461280356782377
4.146128035678237
3.6690067809585756
4.146128035678237
4.146128035678237
4.146128035678237
1.1870866433571443
3.8450980400142565
4.146128035678237
4.146128035678237
0.6118480304731565
4.146128035678237
4.146128035678237
4.146128035678237
3.5440680443502752
4.146128035678237
4.146128035678237
3.301029995663981
3.8450980400142565
2.823908740944318
3.3679767852945943
4.146128035678237
4.146128035678237
3.3679767852945943
3.243038048686294
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
3.6690067809585756
3.8450980400142565
4.146128035678237
3.8450980400142565
3.8450980400142565
3.6690067809585756
3.8450980400142565
1.7920195965308368
3.845098040014256

4.146128035678237
3.5440680443502752
2.6146491186359824
4.146128035678237
2.1870866433571443
4.146128035678237
3.5440680443502752
1.4938816946749147
4.146128035678237
4.146128035678237
3.8450980400142565
3.8450980400142565
3.6690067809585756
3.447158031342219
3.3679767852945943
4.146128035678237
3.8450980400142565
4.146128035678237
3.191885526238913
4.146128035678237
4.146128035678237
3.243038048686294
4.146128035678237
4.146128035678237
2.765916793966632
4.146128035678237
3.3679767852945943
3.301029995663981
4.146128035678237
1.7412943190583
3.243038048686294
3.6690067809585756
4.146128035678237
4.146128035678237
3.8450980400142565
2.7481880270062002
4.146128035678237
4.146128035678237
1.2403321553103692
3.3679767852945943
4.146128035678237
3.8450980400142565
2.9156791142999636
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
2.271066772286538
3.104735350520013
3.104735350520013
4.146128035678237
4.146128035678237
1.561796811310707
4.1461280356

3.5440680443502752
3.5440680443502752
4.146128035678237
3.6690067809585756
4.146128035678237
4.146128035678237
3.447158031342219
4.146128035678237
2.5550634286517386
3.243038048686294
4.146128035678237
1.5007057663291459
1.7825160557860937
4.146128035678237
4.146128035678237
3.3679767852945943
4.146128035678237
2.6409780573583315
3.6690067809585756
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
3.8450980400142565
2.47403017774252
3.191885526238913
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
3.243038048686294
4.146128035678237
2.970036776622557
2.3265841001363694
4.146128035678237
3.8450980400142565
4.146128035678237
3.6690067809585756
3.5440680443502752
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
3.032184683371401
3.8450980400142565
4.146128035678237
2.5228787452803374
4.146128035678237
4.146128035678237
2.970036776622557
3.845098

4.146128035678237
2.1332908109730657
3.6690067809585756
3.8450980400142565
3.8450980400142565
3.8450980400142565
2.823908740944318
4.146128035678237
2.5333441789585023
3.5440680443502752
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
3.032184683371401
0.8446639625349381
3.8450980400142565
3.032184683371401
4.146128035678237
3.243038048686294
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.032184683371401
3.8450980400142565
4.146128035678237
3.8450980400142565
1.739587855244283
2.9156791142999636
4.146128035678237
1.9156791142999638
3.8450980400142565
2.73115468770742
4.146128035678237
4.146128035678237
1.7147642715192506
4.146128035678237
3.8450980400142565
2.8450980400142565
4.146128035678237
2.5898255349109505
4.146128035678237
3.032184683371401
2.3607982006674706
3.447158031342219
4.146128035678237
4.146128035678237
3.3679767852945943
4.146128035678237
3.6690067809585756
2.0157942671832316
4.146128035678237
3.8450980400142565
3.301

4.146128035678237
2.104735350520013
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
2.7147642715192504
2.867374434725409
3.8450980400142565
3.032184683371401
2.867374434725409
4.146128035678237
4.146128035678237
0.9966010219238901
3.5440680443502752
3.447158031342219
3.447158031342219
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
2.1461280356782377
4.146128035678237
3.032184683371401
2.8450980400142565
3.243038048686294
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
3.5440680443502752
2.3607982006674706
4.146128035678237
3.8450980400142565
3.447158031342219
3.8450980400142565
4.146128035678237
3.8450980400142565
4.146128035678237
3.8450980400142565
3.8450980400142565
3.5440680443502752
2.7147642715192504
3.5440680443502752
3.6690067809585756
3.6690067

3.6690067809585756
2.1208221704134678
4.146128035678237
3.8450980400142565
3.104735350520013
3.6690067809585756
3.3679767852945943
3.6690067809585756
3.1461280356782377
4.146128035678237
3.8450980400142565
4.146128035678237
2.2270499433021635
3.8450980400142565
4.146128035678237
3.301029995663981
3.301029995663981
4.146128035678237
2.3265841001363694
3.3679767852945943
3.447158031342219
1.576754126063192
3.1461280356782377
2.8908555305749317
3.8450980400142565
3.8450980400142565
4.146128035678237
1.8836769459478082
4.146128035678237
4.146128035678237
4.146128035678237
3.6690067809585756
2.6690067809585756
3.8450980400142565
3.191885526238913
3.6690067809585756
1.913131925286084
3.5440680443502752
4.146128035678237
2.4385578595803015
4.146128035678237
4.146128035678237
4.146128035678237
1.5934598195660445
4.146128035678237
2.9999999999999996
3.104735350520013
3.8450980400142565
3.6690067809585756
3.3679767852945943
2.73115468770742
3.8450980400142565
4.146128035678237
3.5440680443502752

4.146128035678237
1.408140709344807
3.8450980400142565
1.9817751798938008
4.146128035678237
3.6690067809585756
4.146128035678237
3.301029995663981
3.6690067809585756
2.5550634286517386
3.6690067809585756
3.6690067809585756
4.146128035678237
3.191885526238913
4.146128035678237
4.146128035678237
3.032184683371401
1.0311836199656532
3.8450980400142565
3.066946789630613
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
2.3200532329774117
4.146128035678237
4.146128035678237
3.8450980400142565
3.6690067809585756
4.146128035678237
2.0031132354241428
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.5440680443502752
2.216709109963945
4.146128035678237
2.421852166077449
4.146128035678237
3.6690067809585756
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
1.5386730124635692
4.146128035678237
3.845098

3.301029995663981
2.4929155219028942
3.6690067809585756
3.8450980400142565
2.447158031342219
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
1.016438143478937
4.146128035678237
2.4929155219028942
3.6690067809585756
4.146128035678237
3.3679767852945943
4.146128035678237
3.8450980400142565
2.823908740944318
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.5440680443502752
1.9156791142999638
1.8218455803805451
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
3.6690067809585756
2.045757490560675
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.6690067809585756
4.146128035678237
4.146128035678237
3.8450980400142565
3.032184683371401
4.146128035678237
3.243038048686294
3.066946789630613
3.6690067809585756
4.146128035678237
4.146128035678237
3.1461280356782377
3.8450980400142565
3.19188552

4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
3.6690067809585756
4.146128035678237
2.447158031342219
2.089223184341765
3.8450980400142565
3.5440680443502752
4.146128035678237
4.146128035678237
3.5440680443502752
4.146128035678237
3.1461280356782377
4.146128035678237
3.301029995663981
1.4419775188384387
3.191885526238913
4.146128035678237
3.066946789630613
3.6690067809585756
4.146128035678237
3.5440680443502752
3.6690067809585756
4.146128035678237
2.512659580098651
4.146128035678237
4.146128035678237
4.146128035678237
2.6276140958003507
4.146128035678237
4.146128035678237
3.5440680443502752
4.146128035678237
4.146128035678237
1.9234115645306544
2.271066772286538
3.3679767852945943
4.146128035678237
3.032184683371401
2.191885526238913
3.8450980400142565
3.243038048686294
4.146128035678237
2.7844001996606447
3.066946789630613
4.146128035678237
4.146128035678237
3.5440680443502752
2.9420080530223127
4.146128035678237
4.146128035678237
4.14612803

4.146128035678237
3.6690067809585756
3.8450980400142565
4.146128035678237
3.3679767852945943
4.146128035678237
1.5301779840218368
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
2.2653144433974464
3.6690067809585756
4.146128035678237
3.5440680443502752
4.146128035678237
3.6690067809585756
4.146128035678237
4.146128035678237
3.8450980400142565
3.6690067809585756
4.146128035678237
2.9420080530223127
3.8450980400142565
3.6690067809585756
4.146128035678237
3.447158031342219
2.6146491186359824
3.066946789630613
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.3679767852945943
4.146128035678237
3.301029995663981
3.6690067809585756
3.8450980400142565
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.447158031342219
4.146128035678237
4.146128035678237
4.146128035678237
4.14612803

4.146128035678237
4.146128035678237
3.104735350520013
4.146128035678237
4.146128035678237
4.146128035678237
3.5440680443502752
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
3.8450980400142565
3.5440680443502752
4.146128035678237
4.146128035678237
3.301029995663981
4.146128035678237
4.146128035678237
3.3679767852945943
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
2.3200532329774117
2.970036776622557
3.3679767852945943
4.146128035678237
3.8450980400142565
3.447158031342219
4.146128035678237
3.3679767852945943
3.301029995663981
3.8450980400142565
3.301029995663981
3.3679767852945943
3.447158031342219
4.146128035678237
3.5440680443502752
4.146128035678237
4.146128035678237
1.3593766135326768
3.5440680443502752
4.146128035678237
3.8450980400142565
3.6690067809585756
3.8450980400142565
3.845098

4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.5440680443502752
3.8450980400142565
4.146128035678237
2.7844001996606447
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
4.146128035678237
3.1461280356782377
3.8450980400142565
4.146128035678237
4.146128035678237
3.8450980400142565
3.6690067809585756
4.146128035678237
3.6690067809585756
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
3.5440680443502752
4.146128035678237
4.146128035678237
4.146128035678237
3.8450980400142565
4.146128035678237
4.146128035678237
4.146128035678237
3.5440680443502752
4.146128035678237
3.8450980400142565
3.104735350520013
4.1461280356

# Index Elimination

In [11]:
%%time

index_elimination_model = {}
for term in inverted_index.keys():
    index_elimination_model[term] = {}
    for doc in inverted_index[term]:
        index_elimination_model[term][doc] = inver_inex_dup[term].count(doc)

# print(index_elimination_model['بود'])

Wall time: 32.3 s


### Champion List

In [12]:
%%time

champion_size = 150
champion_list = {}
for term in index_elimination_model.keys():
    document_dict = index_elimination_model[term]
    sorted_result = {k: v for k, v in sorted(document_dict.items(), key=lambda item: item[1])}
    sorted_result = dict(itertools.islice(reversed(sorted_result.items()), champion_size))
    champion_list[term] = collections.OrderedDict(sorted(sorted_result.items()))
# print(champion_list['بود'])

Wall time: 587 ms


### Query

In [13]:
%%time
query = input("Serach: ")
query = normalizer.normalize(query)
for p in punctuations:
    if p in line:
        query = query.replace(p, " ")
extracted_tokens = word_tokenize(query)
stem_tokens = []
for t in extracted_tokens:
    stem_tokens.append(stemmer.stem(t))

Serach: شلوار ساعت دریا شلوار
Wall time: 52.9 s


In [14]:
%%time
query_frq = {}
for t in stem_tokens:
    query_frq[t] = query_frq.get(t, 0) + 1
print(query_frq)

{'شلوار': 2, 'ساع': 1, 'دریا': 1}
Wall time: 0 ns


# Similarity

### No Champion List Results

In [24]:
%%time
match_docs = []
for term in query_frq.keys():
    match_docs.extend(inverted_index[term])
match_docs = list(dict.fromkeys(match_docs))


Wall time: 0 ns


In [33]:
result = {}
# for doc in match_docs:
#     cosine = 0.0
#     doc_l2 = 0.0
# #     query_l2 = 0
#     for term in query_frq.keys():
#         doc_tf = index_elimination_model[term].get(doc, 1)
#         cosine += (1+math.log(doc_tf, 10))*(idf[term])*(1+math.log(query_frq[term], 10))
#         doc_l2 += ((1+math.log(doc_tf, 10))*(idf[term])) ** 2
#     cosine /= math.sqrt(doc_l2)
#     result[doc] = cosine
# print(result)
for term in query_frq.keys():
    wq = 1+math.log(query_frq[term], 10)
    for doc in index_elimination_model[term].keys():
        doc_tf = index_elimination_model[term].get(doc, 0)
        wd = (1+math.log(doc_tf, 10))*(idf[term])
        result[doc] = result.get(doc, 0) + (wd*wq)
        
for doc in result.keys():
    l2_length = 0
    for t in doc.keys():
        l2_length += math.log(doc.get(t, 0))**2
    l2_length = math.sqrt(l2_length)
    result[doc] = result.get(doc, 0)/l2_length
print(result)

AttributeError: 'int' object has no attribute 'keys'

### Champion List Results

In [16]:
%%time
match_docs = []
for term in query_frq.keys():
    match_docs.extend(champion_list[term])
match_docs = list(dict.fromkeys(match_docs))


Wall time: 0 ns


In [29]:
%%time
result = {}
for doc in match_docs:
    cosine = 0.0
    doc_l2 = 0.0
#     query_l2 = 0
    for term in query_frq.keys():
        doc_tf = index_elimination_model[term].get(doc, 1)
        cosine += (1+math.log(doc_tf, 10))*(idf[term])*(1+math.log(query_frq[term], 10))
        doc_l2 += ((1+math.log(doc_tf, 10))*(idf[term])) ** 2
   
    cosine /= math.sqrt(doc_l2)
    result[doc] = cosine

# print(result)

17.190377688237078
6.742620122764807
3.8696834045345594
5.272824785211096
Wall time: 6.98 ms


# Results

In [19]:
K = 20

### Normal Sort

In [36]:

sorted_result = {k: v for k, v in sorted(result.items(), key=lambda item: item[1])}
sorted_result = dict(itertools.islice(reversed(sorted_result.items()), K))
for doc in sorted_result.keys():
    for row in sheet.iter_rows():
        if row[0].value == 'id':
            continue
        if int(row[0].value) == doc:
            print(f"Document {doc} :\n{row[2].value}", end='\n____________________________________________\n')


33
2704
2148
5433
1045
5586
4440
4606
4789
1044
2714
5943
3451
867
1183
1572
4857
1041
6273
2768


### Heap

In [26]:
%%time
import heapq as hq
# List to hold values from dictionary
heap_dict=[]

# extract the values from dictionary
for i in result.values():
    heap_dict.append(i)
    
# heapify the values
hq._heapify_max(heap_dict)   

sorted_result = []
# mapping and reconstructing final dictionary
for i in range(K):
    value = hq._heappop_max(heap_dict)
    for key,val in result.items():
        if value == val:
            sorted_result.append(key)
            break
              
print(sorted_result)

[4731, 740, 4719, 3732, 2878, 1635, 4098, 4098, 4098, 850, 850, 850, 850, 850, 1154, 1154, 1154, 1154, 1154, 1154]
Wall time: 996 µs


In [27]:
for doc in sorted_result:
    for row in sheet.iter_rows():
        if row[0].value == 'id':
            continue
        if int(row[0].value) == doc:
            print(f"Document {doc} :\n{row[2].value}", end='\n____________________________________________\n')
#             print(doc)

Document 4731 :
https://www.isna.ir/news/bushehr-584/شرایط-کاری-کارگران-عسلویه-باید-تغییر-کند-عسلویه-۱۰۰-برابر-تهران
____________________________________________
Document 740 :
https://www.isna.ir/news/99091310436/برتری-پرگل-سپاهان-مقابل-پرسپولیس-در-لیگ-برتر-هندبال-زنان
____________________________________________
Document 4719 :
https://www.isna.ir/news/98042915326/سرقت-کابل-برق-از-دلایل-خاموشی-ها-در-قم
____________________________________________
Document 3732 :
https://www.isna.ir/news/99060605182/افزایش-۱۰-۵-درصدی-تردد-در-جاده-های-برون-شهری
____________________________________________
Document 2878 :
https://www.isna.ir/news/00061215485/ساعت-رسمی-کشور-یک-ساعت-به-عقب-کشیده-می-شود
____________________________________________
Document 1635 :
https://www.isna.ir/news/98112518446/تصمیم-فدراسیون-وزنه-برداری-در-دقیقه-۹۰-تیم-به-قهرمانی-آسیا-اعزام
____________________________________________
Document 4098 :
https://www.isna.ir/news/99101410462/چهارمین-تجمع-آبداران-کهگیلویه-و-بویراحمد-بعلت-پ