In [4]:
from hazm import *
import openpyxl
from pathlib import Path
import collections
import math
import itertools
import pickle

In [5]:

N_documents = 0
normalizer = Normalizer()
stemmer = Stemmer()
tokens = []
doc_vectors = {}

In [6]:
punctuations = [':', '،', '.', ')', '(', '}', '{', '؟', '!', '-', '/', '؛', '#', '*', '\n', '\"',
                ']', '[', '«', '»', '٪', '+', '٠', '\\', '\"', '_', '\'']



In [22]:
%%time

xlsx_file = Path('../IR_Spring2021_ph12_7k.xlsx')
wb_obj = openpyxl.load_workbook(xlsx_file)
sheet = wb_obj.active
for row in sheet.iter_rows():
    each_doc_vector = {}
    line = row[1].value
    if row[0].value == 'id':
        continue
    N_documents += 1
    line = normalizer.normalize(line)
    for p in punctuations:
        if p in line:
            line = line.replace(p, " ")
    extracted_tokens = word_tokenize(line)
    stem_tokens = []
    for t in extracted_tokens:
        stem_tokens.append(stemmer.stem(t))

    term_doc = []
    for w in stem_tokens:
        term_doc.append({'term': w, 'doc_id': int(row[0].value)})
    tokens.extend(term_doc)
    
    for t in stem_tokens:
        each_doc_vector[t] = each_doc_vector.get(t, 0) + 1
    doc_vectors[int(row[0].value)] = collections.OrderedDict(sorted(each_doc_vector.items()))


Wall time: 18.2 s


In [23]:
%%time
dictionary = {}
inverted_index = {}
idf = {}
doc_term_frq = []
for t in tokens:
    word = t['term']
    doc_id = t['doc_id']
    
    frq = dictionary.get(word, 0)
    dictionary[word] = frq + 1

    doc_list = inverted_index.get(word, [])
    doc_list.append(doc_id)
    inverted_index[word] = doc_list
    
    

Wall time: 1.95 s


In [24]:
%%time
# remove stop words
frq_sorted_dic = {k: v for k, v in sorted(dictionary.items(), key=lambda item: item[1])}
stop_words = dict(itertools.islice(reversed(frq_sorted_dic.items()), 20))
for s in stop_words:
    dictionary.pop(s)
    inverted_index.pop(s)

Wall time: 21.9 ms


In [None]:
%%time

inver_inex_dup = inverted_index.copy()
for i in inverted_index:
    inverted_index[i] = list(dict.fromkeys(inverted_index[i]))
inverted_index = collections.OrderedDict(sorted(inverted_index.items()))
dictionary = collections.OrderedDict(sorted(dictionary.items()))

print(inverted_index.keys())

In [None]:
%%time

for term in inverted_index:
    df = len(inverted_index[term])
    idf[term] = math.log(N_documents / df, 10)
    print(idf[term])

# Index Elimination

In [27]:
%%time

index_elimination_model = {}
for term in inverted_index.keys():
    index_elimination_model[term] = {}
    for doc in inverted_index[term]:
        index_elimination_model[term][doc] = inver_inex_dup[term].count(doc)

# print(index_elimination_model['بود'])

Wall time: 18.8 s


### Champion List

In [28]:
%%time

champion_size = 150
champion_list = {}
for term in index_elimination_model.keys():
    document_dict = index_elimination_model[term]
    sorted_result = {k: v for k, v in sorted(document_dict.items(), key=lambda item: item[1])}
    sorted_result = dict(itertools.islice(reversed(sorted_result.items()), champion_size))
    champion_list[term] = collections.OrderedDict(sorted(sorted_result.items()))
# print(champion_list['بود'])

Wall time: 498 ms


### File Writting

In [29]:
%%time
file = open('stop_words.obj', 'wb') 
pickle.dump(stop_words, file)
file.close()

file = open('index_elimination.obj', 'wb') 
pickle.dump(index_elimination_model, file)
file.close()

file = open('idf.obj', 'wb') 
pickle.dump(idf, file)
file.close()

file = open('doc_vec.obj', 'wb') 
pickle.dump(doc_vectors, file)
file.close()

file = open('champion.obj', 'wb') 
pickle.dump(champion_list, file)
file.close()

Wall time: 654 ms


### File Reading

In [7]:
readed = open('stop_words.obj', 'rb')
stop_words = pickle.load(readed)

readed = open('index_elimination.obj', 'rb')
index_elimination_model = pickle.load(readed)

readed = open('idf.obj', 'rb')
idf = pickle.load(readed)

readed = open('champion.obj', 'rb')
champion_list = pickle.load(readed)

readed = open('doc_vec.obj', 'rb')
doc_vectors = pickle.load(readed)


### Query

In [25]:
%%time
query = input("Serach: ")
query = normalizer.normalize(query)
for p in punctuations:
    if p in query:
        query = query.replace(p, " ")
extracted_tokens = word_tokenize(query)
stem_tokens = []
for t in extracted_tokens:
    stem_tokens.append(stemmer.stem(t))
for s in stop_words:
    if s in stem_tokens:
        for t in stem_tokens:
            if t == s:
                stem_tokens.remove(t)

Serach: پلی‌آف لیگ قهرمانان آسیا
Wall time: 12.4 s


In [26]:
%%time
query_frq = {}
for t in stem_tokens:
    query_frq[t] = query_frq.get(t, 0) + 1
print(query_frq)

{'پلی\u200cآف': 1, 'لیگ': 1, 'قهرمان': 1, 'آسیا': 1}
Wall time: 98.6 ms


# Similarity

### No Champion List Results

In [None]:
result = {}

for term in query_frq.keys():
    wq = 1+math.log(query_frq[term], 10)
    for doc in index_elimination_model[term].keys():
        doc_tf = index_elimination_model[term].get(doc, 0)
        wd = (1+math.log(doc_tf, 10))*(idf[term])
        result[doc] = result.get(doc, 0) + (wd*wq)
        
for doc in result.keys():
    l2_length = 0
    for t in doc_vectors[doc].values():
        l2_length += (1+math.log(t, 10))**2
    l2_length = math.sqrt(l2_length)
    result[doc] = result.get(doc, 0)/l2_length
print(result)

### Champion List Results

In [None]:
%%time

result = {}

for term in query_frq.keys():
    wq = 1+math.log(query_frq[term], 10)
    for doc in champion_list[term].keys():
        doc_tf = index_elimination_model[term].get(doc, 0)
        wd = (1+math.log(doc_tf, 10))*(idf[term])
        result[doc] = result.get(doc, 0) + (wd*wq)
        
for doc in result.keys():
    l2_length = 0
    for t in doc_vectors[doc].values():
        l2_length += (1+math.log(t, 10))**2
    l2_length = math.sqrt(l2_length)
    result[doc] = result.get(doc, 0)/l2_length
print(result)

# Results

In [28]:
K = 50

### Normal Sort

In [13]:
xlsx_file = Path('../IR_Spring2021_ph12_7k.xlsx')
wb_obj = openpyxl.load_workbook(xlsx_file)
sheet = wb_obj.active
sorted_result = {k: v for k, v in sorted(result.items(), key=lambda item: item[1])}
sorted_result = dict(itertools.islice(reversed(sorted_result.items()), K))
for doc in sorted_result.keys():
    for row in sheet.iter_rows():
        if row[0].value == 'id':
            continue
        if int(row[0].value) == doc:
            print(f"Document {doc} :\n{row[2].value}", end='\n____________________________________________\n')


Document 4514 :
https://www.isna.ir/news/98022915417/عرضه-آخرین-دستاوردهای-استارتاپ-های-حوزه-اقتصاد-دیجیتال-در-نمایشگاه
____________________________________________
Document 6641 :
https://www.isna.ir/news/98030200713/برگزاری-پاویون-شرکت-های-دانش-بنیان-در-دو-نمایشگاه-حوزه-سلامت
____________________________________________
Document 6637 :
https://www.isna.ir/news/98030200713/برگزاری-پاویون-شرکت-های-دانش-بنیان-در-دو-نمایشگاه-حوزه-سلامت
____________________________________________
Document 6635 :
https://www.isna.ir/news/98030200713/برگزاری-پاویون-شرکت-های-دانش-بنیان-در-دو-نمایشگاه-حوزه-سلامت
____________________________________________
Document 4528 :
https://www.isna.ir/news/98030200713/برگزاری-پاویون-شرکت-های-دانش-بنیان-در-دو-نمایشگاه-حوزه-سلامت
____________________________________________
Document 672 :
https://www.isna.ir/news/99082013716/صاحب-سوت-طلایی-فیلا-درگذشت
____________________________________________
Document 3374 :
https://www.isna.ir/news/99022417557/برپایی-پاویون-شرکت-های

### Heap

In [29]:
%%time
import heapq as hq
# List to hold values from dictionary
heap_dict=[]

# extract the values from dictionary
for i in result.values():
    heap_dict.append(i)
    
# heapify the values
hq._heapify_max(heap_dict)   

sorted_result = []
while len(sorted_result) < K:
    value = hq._heappop_max(heap_dict)
    for key,val in result.items():
        if value == val:
            sorted_result.append(key)
            break
              
print(sorted_result)

[1596, 632, 394, 154, 1583, 1012, 667, 1584, 1584, 1048, 832, 1020, 1244, 103, 615, 156, 1654, 1094, 1597, 1495, 1640, 1095, 873, 47, 73, 1672, 321, 318, 1119, 1021, 357, 102, 269, 973, 1619, 66, 1329, 312, 399, 1703, 287, 444, 98, 98, 1595, 306, 1192, 497, 1324, 494]
Wall time: 262 ms


In [30]:
xlsx_file = Path('../IR_Spring2021_ph12_7k.xlsx')
wb_obj = openpyxl.load_workbook(xlsx_file)
sheet = wb_obj.active
for doc in sorted_result:
    for row in sheet.iter_rows():
        if row[0].value == 'id':
            continue
        if int(row[0].value) == doc:
            print(f"Document {doc} :\n{row[2].value}", end='\n____________________________________________\n')
#             print(doc)

Document 1596 :
https://www.isna.ir/news/98110100757/مدیران-۴-تیم-آسیایی-ایران-پنجشنبه-با-دبیرکل-AFC-دیدار-می-کنند
____________________________________________
Document 632 :
https://www.isna.ir/news/99080704492/رویارویی-قهرمانان-سابق-لیگ-برتر-فوتبال-در-دیداری-تدارکاتی
____________________________________________
Document 394 :
https://www.isna.ir/news/99053022911/نکونام-بر-دستان-بازیکنان-فولاد
____________________________________________
Document 154 :
https://www.isna.ir/news/99030904913/نظر-موافق-سرمربی-تیم-ملی-فوتسال-با-حذف-پلی-آف-لیگ-برتر
____________________________________________
Document 1583 :
https://www.isna.ir/news/98102720937/جزییات-تصمیم-کمیته-مسابقات-AFC-درباره-ایران-بازی-در-زمین-بی-طرف
____________________________________________
Document 1012 :
https://www.isna.ir/news/99121713154/مجوز-حضور-جودوی-ایران-در-قهرمانی-آسیا-صادر-شد
____________________________________________
Document 667 :
https://www.isna.ir/news/99081912504/رادوشوویچ-امروز-در-محل-تمرین-پرسپولیس-حاضر-شد
_