In [3]:
import pandas as pd
import numpy as np
import heapq
import math
import os
import pickle 

In [4]:
# !pip install hazm
from hazm import *
from collections import Counter

In [5]:
data_IR=pd.read_csv("IR_CSV.csv")
punctuations=pd.read_csv("punctuations.csv")
chars_to_remove=''.join(punctuations["punctuations"].values.tolist()).replace(" ","")

In [6]:
data_IR = data_IR[data_IR['content'].notnull()]

# Phase 2

In [28]:
normalizer = Normalizer()
lemmatizer = Lemmatizer()


def tokenize(doc):
    nd=str(normalizer.normalize(doc))
    nd=nd.translate(str.maketrans('','',chars_to_remove))
    words=list(map(lambda t:lemmatizer.lemmatize(t),word_tokenize(nd)))    
    return dict(Counter(words))

def make_dictionary(data):
    dictionary={}
    lengths={}
    for i,doc in zip(data["id"],data["content"]):
        doc=str(doc)
        doc=doc.replace("انتهای پیام","")
        counts=tokenize(doc)
        l=0
        for k,v in counts.items():
            l+=(1+math.log(v,10))**2
            if k in dictionary.keys():
                dictionary[k][0]+=1
                dictionary[k][1][i]=1+math.log(v,10)
            else:
                dictionary[k]=[1,{i:1+math.log(v,10)}]
        lengths[i]=np.sqrt(l)
    #remove stop words
    for i in stopwords_list():
        if i in dictionary.keys():
            dictionary.pop(i)
    return dictionary,lengths


def make_champion(dictionary,r):
    champs={}
    for k,v in dictionary.items():
        best_r=dict(Counter(v[1]).most_common(r))
        champs[k]=[v[0],best_r]
    return champs

def normalize_query(q):# do the same for query
    q_doc=tokenize(q)
    for k,v in q_doc.items():
        q_doc[k]=1+math.log(v)
        
    #remove stop words
    for i in stopwords_list():
        if i in q_doc.keys():
            q_doc.pop(i)
    return q_doc

N=len(data_IR["content"])
def compute_scores(q,dictionary): #cosine similarity(slide 38/ lecture 6)
    sims={} 
    for k,v in q.items():
        idf=math.log(N/dictionary[k][0])
        for docid,w in dictionary[k][1].items():
            if docid in sims.keys():
                sims[docid]+=w*v*idf
            else:
                sims[docid]=w*v*idf
    for docid in sims.keys():
        sims[docid]=sims[docid]/lengths[docid]
    return sims

def best_k(scores, k):# make heap table and pop k
    best_scores =[]
    heap = []
    for docid,score in scores.items():
        heapq.heappush(heap,(-score, docid))
    for i in range(k):
        best_scores.append(heapq.heappop(heap))
    return best_scores

def query(q,k,dictionary,champion,champs=0): 
    q=normalize_query(q)
    best_r={}
    if champs:
        best_r=compute_scores(q,champion)
        if len(best_r)<k:
            best_r.update(compute_scores(q,dictionary))
    else:
        best_r=compute_scores(q,dictionary)
    return best_k(best_r,k)
    

In [29]:
def retrieve_docs(topk,data):
    results=[]
    for t in topk:
        results.append(data[data["id"]==t[1]]["url"])
    return results

In [30]:
%%time
my_dict,lengths=make_dictionary(data_IR)

Wall time: 24.9 s


In [8]:
%%time
my_champs=make_champion(my_dict,7)

Wall time: 728 ms


In [156]:
%%time
topk=query("بنیاد نخبگان و جهاد دانشگاهی",5,my_dict,my_champs,0)
retrieve_docs(topk,data_IR)

Wall time: 9.95 ms


[4228    https://www.isna.ir/news/99120100312/رویداد-خا...
 Name: url, dtype: object,
 2379    https://www.isna.ir/news/99102116017/ارائه-گزا...
 Name: url, dtype: object,
 4394    https://www.isna.ir/news/98020703648/ارزش-۱۰-ک...
 Name: url, dtype: object,
 2617    https://www.isna.ir/news/98022814809/تشکیل-۱۰-...
 Name: url, dtype: object,
 6060    https://www.isna.ir/news/99070906684/رتبه-اول-...
 Name: url, dtype: object]

In [159]:
%%time
topk=query("سازمان حمایت از مصرف کننده",5,my_dict,my_champs,0)
retrieve_docs(topk,data_IR)

Wall time: 21.2 ms


[3132    https://www.isna.ir/news/98110705292/بررسی-مشک...
 Name: url, dtype: object,
 5287    https://www.isna.ir/news/98110705292/بررسی-مشک...
 Name: url, dtype: object,
 2235    https://www.isna.ir/news/99081208080/بررسی-افز...
 Name: url, dtype: object,
 3911    https://www.isna.ir/news/99081208080/بررسی-افز...
 Name: url, dtype: object,
 3955    https://www.isna.ir/news/99090100757/قیمت-شیر-...
 Name: url, dtype: object]

[(-1.540911463934925, 3133),
 (-1.540911463934925, 5288),
 (-1.4152608628721397, 2236),
 (-1.4152608628721397, 3912),
 (-1.3485069978003228, 3956)]

In [158]:
%%time
topk=query("تحرک ذهنی زنان خانه‌دار",5,my_dict,my_champs,0)
retrieve_docs(topk,data_IR)

Wall time: 16.2 ms


[6717    https://www.isna.ir/news/98042915171/زنان-شاغل...
 Name: url, dtype: object,
 6721    https://www.isna.ir/news/98042915171/زنان-شاغل...
 Name: url, dtype: object,
 2732    https://www.isna.ir/news/98042614092/تحرک-نیرو...
 Name: url, dtype: object,
 6206    https://www.isna.ir/news/99082819819/راه-های-پ...
 Name: url, dtype: object,
 42    https://www.isna.ir/news/99011809817/پیام-یونس...
 Name: url, dtype: object]

In [138]:
%%time
topk=query("بیماران کرونایی مبتلا و مرگ",5,my_dict,my_champs,1)

Wall time: 3.85 ms


In [10]:
topk

[(-1.0817750115332747, 5470),
 (-1.0348375352019714, 6337),
 (-1.0348375352019714, 6341),
 (-0.9309592553610874, 5629),
 (-0.9309592553610874, 5633)]

In [12]:
retrieve_docs(topk,data_IR)

[5892    https://www.isna.ir/news/99052518443/آمار-فوتی...
 Name: url, dtype: object,
 5961    https://www.isna.ir/news/99061511083/دانش-آموز...
 Name: url, dtype: object,
 6277    https://www.isna.ir/news/99092620240/داروی-بیم...
 Name: url, dtype: object,
 5499    https://www.isna.ir/news/99012514275/تحلیل-وضع...
 Name: url, dtype: object,
 6178    https://www.isna.ir/news/99081811494/صعود-مرگ-...
 Name: url, dtype: object]

In [143]:
topk=query("سازمان حمایت از مصرف کننده",5,my_dict,my_champs,0)
retrieve_docs(topk,data_IR)

[3132    https://www.isna.ir/news/98110705292/بررسی-مشک...
 Name: url, dtype: object,
 5287    https://www.isna.ir/news/98110705292/بررسی-مشک...
 Name: url, dtype: object,
 2235    https://www.isna.ir/news/99081208080/بررسی-افز...
 Name: url, dtype: object,
 3911    https://www.isna.ir/news/99081208080/بررسی-افز...
 Name: url, dtype: object,
 3955    https://www.isna.ir/news/99090100757/قیمت-شیر-...
 Name: url, dtype: object]

In [142]:
retrieve_docs(topk,data_IR)

[4228    https://www.isna.ir/news/99120100312/رویداد-خا...
 Name: url, dtype: object,
 2379    https://www.isna.ir/news/99102116017/ارائه-گزا...
 Name: url, dtype: object,
 4394    https://www.isna.ir/news/98020703648/ارزش-۱۰-ک...
 Name: url, dtype: object,
 2617    https://www.isna.ir/news/98022814809/تشکیل-۱۰-...
 Name: url, dtype: object,
 6060    https://www.isna.ir/news/99070906684/رتبه-اول-...
 Name: url, dtype: object]

In [140]:
topk

[(-1.7357404059721542, 149),
 (-1.7031733170033982, 5964),
 (-1.6158117610958103, 6408),
 (-1.5916403585637995, 6005),
 (-1.5352852966576216, 684)]

In [41]:
topk=query("واکسیناسیون بیماری کرونا در جهان",10,my_dict,my_champs,1)

In [20]:
topk

[(-0.7857281437102277, 6458),
 (-0.7857281437102277, 6464),
 (-0.6169447948270286, 5964),
 (-0.4887679279028567, 6444),
 (-0.4645062162973372, 6411),
 (-0.35801965094416827, 1319),
 (-0.3448540985911378, 1320),
 (-0.3367207962136427, 6273),
 (-0.294274974464398, 1625),
 (-0.24770889941060906, 979)]

# Phase 3

In [24]:
import random
from tqdm import tqdm

In [9]:
names=os.listdir("IR00_dataset_ph3")
files=[]
for i, filename in enumerate(names):
     files.append(pd.read_excel("IR00_dataset_ph3"+"\\"+filename, sheet_name=None)["Sheet1"])

  after removing the cwd from sys.path.


In [26]:
idf={}
def calc_len(doc):
    return np.sqrt(sum(list(map(lambda x:x**2,doc.values()))))

def calc_docs(data):
    docs={}
    lengths={}
    idf={}
    for i,doc in tqdm(zip(data["id"],data["content"])):
        doc=str(doc)
        doc=doc.replace("انتهای پیام","")
        td=tokenize(doc)
        for k,v in td.items():
            idf[k]=idf.get(k,0)+v
            td[k]=1+math.log(v,10)
        for sw in stopwords_list():
            if sw in td.keys():
                td.pop(sw)
        docs[i]=td
        lengths[i]=calc_len(docs[i])
    N=len(docs)
    for docid,tlist in tqdm(docs.items()):
        for word,freq in tlist.items():
            tf=1+math.log(freq,10)
#             in_df=math.log(N/idf[word])
            docs[docid][word]=tf/lengths[docid]
    return docs,idf

In [22]:
offset=int(files[0].tail(1)["id"])
files[1]["id"]=files[1]["id"]+offset
offset=int(files[1].tail(1)["id"])
files[2]["id"]=files[2]["id"]+offset


In [23]:
data=pd.concat(files)

In [24]:
data["topic"].replace({"political": "politics", "sport": "sports"}, inplace=True)

In [25]:
data.index=data.id

In [26]:
data=data[data['content'].notnull()]
data=data[data['topic'].notnull()]

# Kmeans

In [36]:
%%time
# docs=calc_docs(data)

docs,idf=calc_docs(data)

NameError: name 'calc_docs' is not defined

In [32]:
# save docs and lengths
# file_pi = open('docs.obj', 'wb') 
# pickle.dump(docs, file_pi)

# file_pi = open('lengths.obj', 'wb') 
# pickle.dump(lengths, file_pi)

# file_pi = open('idf.obj', 'wb') 
# pickle.dump(idf, file_pi)


In [37]:
# read 
file_pi = open('docs.obj', 'rb') 
docs=pickle.load(file_pi)

# file_pi = open('lengths.obj', 'rb') 
# lengths=pickle.load(file_pi)

file_pi = open('idf.obj', 'rb') 
idf=pickle.load(file_pi)

In [38]:
def assign_clusters(centres,centre_len,docs):   
    
    clusters={k:[] for k in range(len(centres))}
    for docid in docs.keys():
        sim=[]
        for i in range(len(centres)):
            cosine=0
            d1=docs[docid]
            d2=centres[i]
            for k in d1.keys():
                if k in d2.keys():
#                     cosine+=(1+math.log(d1[k],10))*(1+math.log(d2[k],10))*math.log(N/idf[k],10)
                    cosine+=d1[k]*d2[k]
            if centre_len[i]!=0:
                sim.append(cosine/np.sqrt(centre_len[i]))
        best_centre=np.argmax(sim)
        clusters[best_centre].append(docid)

    return clusters


def calc_centres(clusters,docs):
    centres={}
    centre_len={k:0 for k in range(len(clusters))}
    for centre,followers in clusters.items():
        new_centre_tf={}
        for f in followers:
            for word,tf in docs[f].items():
                new_centre_tf[word]=new_centre_tf.get(word,0)+tf
        l=0
        for k in new_centre_tf.keys():
            if len(followers)!=0:
                new_centre_tf[k]/=len(followers)
            l+=new_centre_tf[k]**2
        centre_len[centre]=math.sqrt(l)

        centres[centre]=new_centre_tf
    return centres,centre_len


def kmeans(docs,k,iteration):
    c_ids = (np.random.uniform(0, len(docs), size = k).astype(int))
    centres=[docs[i] for i in c_ids]
    centre_len={}
    for i in range(len(centres)):
        centre_len[i]=calc_len(centres[i])
    for i in tqdm(range(iteration)):
        clusters=assign_clusters(centres,centre_len,docs)
        centres,centre_len=calc_centres(clusters,docs)
        for k,v in clusters.items():
            print(k,":",len(v))
    return centres,centre_len,clusters

In [37]:
%%time
centres,centre_len,clusters=kmeans(docs,10,10)

 10%|████████▎                                                                          | 1/10 [00:30<04:32, 30.30s/it]

0 : 6941
1 : 1239
2 : 3536
3 : 3467
4 : 3591
5 : 2141
6 : 4374
7 : 12640
8 : 6628
9 : 5504


 20%|████████████████▌                                                                  | 2/10 [02:06<09:13, 69.18s/it]

0 : 3980
1 : 2536
2 : 5599
3 : 4821
4 : 1867
5 : 2091
6 : 7821
7 : 8041
8 : 8829
9 : 4476


 30%|████████████████████████▉                                                          | 3/10 [03:33<08:59, 77.05s/it]

0 : 3883
1 : 2086
2 : 5649
3 : 5918
4 : 2137
5 : 2541
6 : 8025
7 : 6998
8 : 8521
9 : 4303


 40%|█████████████████████████████████▏                                                 | 4/10 [04:38<07:15, 72.59s/it]

0 : 3722
1 : 1859
2 : 5458
3 : 6799
4 : 2137
5 : 2766
6 : 8121
7 : 7595
8 : 8192
9 : 3412


 50%|█████████████████████████████████████████▌                                         | 5/10 [05:25<05:16, 63.21s/it]

0 : 3506
1 : 1772
2 : 5077
3 : 7481
4 : 2050
5 : 2963
6 : 8176
7 : 7795
8 : 7942
9 : 3299


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [06:10<03:48, 57.04s/it]

0 : 3375
1 : 1728
2 : 4928
3 : 7732
4 : 2021
5 : 3081
6 : 8186
7 : 8072
8 : 7664
9 : 3274


 70%|██████████████████████████████████████████████████████████                         | 7/10 [06:55<02:39, 53.05s/it]

0 : 3301
1 : 1670
2 : 4894
3 : 7822
4 : 1975
5 : 3141
6 : 8217
7 : 8307
8 : 7464
9 : 3270


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [07:57<01:51, 55.95s/it]

0 : 3230
1 : 1654
2 : 4864
3 : 7869
4 : 1992
5 : 3148
6 : 8254
7 : 8453
8 : 7334
9 : 3263


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [08:53<00:55, 55.89s/it]

0 : 3171
1 : 1684
2 : 4850
3 : 7899
4 : 1995
5 : 3161
6 : 8288
7 : 8515
8 : 7236
9 : 3262


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [09:44<00:00, 58.46s/it]

0 : 3110
1 : 1729
2 : 4825
3 : 7967
4 : 2028
5 : 3155
6 : 8321
7 : 8504
8 : 7160
9 : 3262
Wall time: 9min 44s





In [None]:
# %%time
# centres_5,centre_len_5,clusters_5=kmeans(docs,5,10)

In [None]:
# %%time
# centres_7,centre_len_7,clusters_7=kmeans(docs,7,10)


In [46]:
# # save centres and clusters
# kcentres = open('kmeans_centres1010.obj', 'wb') 
# pickle.dump(centres, kcentres)
# kclusters = open('kmeans_clusters1010.obj', 'wb') 
# pickle.dump(clusters, kclusters)
# kclens = open('kmeans_clens1010.obj', 'wb') 
# pickle.dump(centre_len, kclens)


In [39]:
# read centres and clusters
file = open('kmeans_centres1010.obj', 'rb') 
kcentres=pickle.load(file)
file = open('kmeans_clusters1010.obj', 'rb') 
kclusters=pickle.load(file)
# file = open('kmeans_clens1010.obj', 'rb') 
# kclens=pickle.load(file)

In [48]:
def RSS(centres,centre_len,clusters):
    rss=[0 for i in range(len(centres))]
    for k,v in clusters.items():
        for i in tqdm(v):
            for word in set(centres[k].keys()).union(set(docs[i].keys())):
                rss[k]+=abs(centres.get(word,0)-docs[i].get(word,0))**2
        if centre_len[k]!=0:
            rss[k]/=centre_len[k]
    return sum(rss)

In [49]:
# %%time
# RSS(kcentres,kclens,kclusters)

In [50]:
# models=[]
# rss=[]
# for i in tqdm(range(5)):
#     centres,centre_len,clusters=kmeans(docs,10,5)
#     models.append([centres,centre_len,clusters])
#     rss.append(RSS(centres,centre_len,clusters))

In [40]:
def query_kmeans(query,clusters,centres,b,K):
    q_doc=normalize_query(query)
    print(q_doc)
    heap=[]
    #compare with leaders
    for c in range(len(centres)):
        cosine=0
        for k in q_doc.keys():
            if k in centres[c].keys():
                cosine+=centres[c][k]*q_doc[k]
        heapq.heappush(heap,(-cosine, c))
    
    docs_to_search=[]
    
    for i in range(b):
        poped=heapq.heappop(heap)
        print(poped[1])
        docs_to_search.extend(clusters[poped[1]])
    heapk=[]
    for docid in docs_to_search:
        cosine=0
        for k in q_doc.keys():
            if k in docs[docid].keys():
                cosine+=docs[docid][k]*q_doc[k]
        heapq.heappush(heapk,(-cosine, docid))
    results=[]
    for i in range(K):
        results.append(heapq.heappop(heapk))
    return results

In [41]:
retrieve_docs(query_kmeans("توخل مادرید قهرمانان زیدان چلسی رئال",kclusters,kcentres,1,5),data)

{'توخل': 1.0, 'مادرید': 1.0, 'قهرمان': 1.0, 'زید': 1.0, 'چلسی': 1.0, 'رئال': 1.0}
6


[id
 29460    https://www.farsnews.ir/news/13990720000394/وض...
 Name: url, dtype: object,
 id
 29553    https://www.farsnews.ir/news/13990716000917/را...
 Name: url, dtype: object,
 id
 38197    https://www.farsnews.ir/news/13990910000168/سر...
 Name: url, dtype: object,
 id
 31301    https://www.farsnews.ir/news/13990529000612/صا...
 Name: url, dtype: object,
 id
 32914    https://www.farsnews.ir/news/13990417000584/اح...
 Name: url, dtype: object]

In [42]:
len(inverted_index)

140283

In [44]:
retrieve_docs(query_kmeans("اعزام کاروان های اردوی راهیان نور",kclusters,kcentres,1,5),data)

{'اعزام': 1.0, 'کاروان': 1.0, 'اردو': 1.0, 'نور': 1.0}
0


[id
 9702    https://www.isna.ir/news/98021106170/حضور-لرست...
 Name: url, dtype: object,
 id
 30319    https://www.farsnews.ir/news/13990627000294/90...
 Name: url, dtype: object,
 id
 31120    https://www.farsnews.ir/news/13990603000575/2-...
 Name: url, dtype: object,
 id
 35394    https://www.farsnews.ir/news/13990217000324/رش...
 Name: url, dtype: object,
 id
 32159    https://www.farsnews.ir/news/13990506000599/پی...
 Name: url, dtype: object]

In [130]:
retrieve_docs(query_kmeans("تعطیلی مدارس و مراکز آموزشی",clusters,centres,1,5),data)

{'تعطیل': 1.0, 'مدارس': 1.0, 'مراکز': 1.0, 'آموزش': 1.0}
7


[id
 7194    https://www.isna.ir/news/98120806789/مهدهای-کو...
 Name: url, dtype: object,
 id
 7193    https://www.isna.ir/news/98120806794/دانش-آموز...
 Name: url, dtype: object,
 id
 6066    https://www.isna.ir/news/99071108223/کرونا-مدا...
 Name: url, dtype: object,
 id
 6071    https://www.isna.ir/news/99071108223/کرونا-مدا...
 Name: url, dtype: object,
 id
 6145    https://www.isna.ir/news/99080905279/اطلاعیه-ج...
 Name: url, dtype: object]

In [132]:
retrieve_docs(query_kmeans("حقوق معلمان چند تومان است",clusters,centres,1,5),data)

{'حقوق': 1.0, 'معلمان': 1.0, 'تومان': 1.0, '#است': 1.0}
2


[id
 44943    https://www.farsnews.ir/news/13991002000138/نم...
 Name: url, dtype: object,
 id
 43017    https://www.farsnews.ir/news/13991114000553/جم...
 Name: url, dtype: object,
 id
 39629    https://www.farsnews.ir/news/14000202000663/تج...
 Name: url, dtype: object,
 id
 42481    https://www.farsnews.ir/news/13991126000321/تج...
 Name: url, dtype: object,
 id
 49551    https://www.farsnews.ir/news/13990623000559/تج...
 Name: url, dtype: object]

In [55]:
data.iloc[50053]

id                                                     50054
content    به گزارش خبرنگار حوزه دولت خبرگزاری فارس، محمو...
topic                                               politics
url        https://www.farsnews.ir/news/13990612000526/وا...
Name: 50054, dtype: object

In [56]:
retrieve_docs(query_kmeans("حقوق معلمان",kclusters,kcentres,1,5),data)

{'حقوق': 1.0, 'معلمان': 1.0}
2


[id
 39629    https://www.farsnews.ir/news/14000202000663/تج...
 Name: url, dtype: object,
 id
 44943    https://www.farsnews.ir/news/13991002000138/نم...
 Name: url, dtype: object,
 id
 43017    https://www.farsnews.ir/news/13991114000553/جم...
 Name: url, dtype: object,
 id
 44839    https://www.farsnews.ir/news/13991003000282/چر...
 Name: url, dtype: object,
 id
 42481    https://www.farsnews.ir/news/13991126000321/تج...
 Name: url, dtype: object]

# KNN

In [57]:
%%time
inverted_index,lengths=make_dictionary(data)

Wall time: 4min 21s


In [21]:
# save docs and lengths
# file_pi = open('inverted_index.obj', 'wb') 
# pickle.dump(inverted_index, file_pi)

# file_pi = open('ii_lengths.obj', 'wb') 
# pickle.dump(lengths, file_pi)

# read 
file_pi = open('inverted_index.obj', 'rb') 
inverted_index=pickle.load(file_pi)

file_pi = open('ii_lengths.obj', 'rb') 
lengths=pickle.load(file_pi)


In [68]:
cats=['sports', 'politics', 'economy', 'health', 'culture']

def normalize_doc(doc):
    doc=str(doc)
    doc=doc.replace("انتهای پیام","")
    doc=tokenize(doc)
    #remove stop words
    for i in stopwords_list():
        if i in doc.keys():
            doc.pop(i)
    return doc

N=len(data["content"])
def compute_scores(q,dictionary): #cosine similarity(slide 38/ lecture 6)
    sims={} 
    for k,v in q.items():
        idf=math.log(N/dictionary[k][0])
        for docid,w in dictionary[k][1].items():
            if docid in sims.keys():
                sims[docid]+=w*v*idf
            else:
                sims[docid]=w*v*idf
    for docid in sims.keys():
        sims[docid]=sims[docid]/lengths[docid]
    return sims

def knn(q,inverted_index, K,data):
    
    # compute all similarities
    sims={} 
    for k,v in q.items():
        if k not in inverted_index.keys():
            continue
        idf=math.log(N/inverted_index[k][0])
        for docid,w in inverted_index[k][1].items():
            if docid in sims.keys():
                sims[docid]+=w*v*idf
            else:
                sims[docid]=w*v*idf
    for docid in sims.keys():
        sims[docid]=sims[docid]/lengths[docid]
    
    if len(sims) < K:
        K = len(sims)
    
    #find k nearest using heap
    best_scores =[]
    heap = []
    for docid,score in sims.items():
        heapq.heappush(heap,(-score, docid))
    max_cat="none"
    maximum=0
    for i in range(K):
        category={i:0 for i in cats }
        poped=heapq.heappop(heap)
        cat=data.topic.loc[poped[1]]
        category[cat]=category[cat]+1
        if category[cat]>maximum:
            maximum=category[cat]
            max_cat=cat
    return max_cat

In [69]:
knn(normalize_doc("در ایران واکسن کرونا"),inverted_index,7,data)

'health'

In [144]:
knn(normalize_doc(" سرمایه گذاری در بازار بورس"),inverted_index,7,data)

'economy'

In [71]:
knn(normalize_doc("گالری هنری"),inverted_index,7,data)

'culture'

In [72]:
knn(normalize_doc("شطرنج بانوان"),inverted_index,7,data)

'sports'

In [224]:
set(data.topic)

{'culture', 'economy', 'health', 'politics', 'sports'}

Label datas from phase1

In [120]:
data_IR_normalized={}
for i in tqdm(range(len(data_IR))):
    data_IR_normalized[data_IR.iloc[i].id]=normalize_doc(data_IR.iloc[i])

100%|█████████████████████████████████████████████████████████████████████████████| 6996/6996 [00:29<00:00, 234.48it/s]


In [230]:
data_cats={c:[] for c in cats}
data_cats["none"]=[]
for k,v in tqdm(data_IR_normalized.items()):
    data_cats[knn(v,inverted_index,5,data)].append(k)

100%|██████████████████████████████████████████████████████████████████████████████| 6996/6996 [15:07<00:00,  7.71it/s]


In [121]:
# # save categories
# # file_pi = open('categories.obj', 'wb') 
# # pickle.dump(data_cats, file_pi)
# file_pi = open('data_labled.obj', 'wb') 
# pickle.dump(data_IR_normalized, file_pi)
# #read categories
# # file_pi = open('categories.obj', 'rb') 
# # data_cats = pickle.load(file_pi)
# file_pi = open('data_labled.obj', 'rb') 
# data_IR_normalized = pickle.load(file_pi)

In [77]:
for k,v in data_cats.items():
    print(k,":",len(v))

sports : 1808
politics : 2046
economy : 1465
health : 1214
culture : 461
none : 2


In [124]:
def query_knn(query,categories,docs,idf,K):
    sp = query.split(" ")
    cat = sp[0][4:]
    
    q_doc=normalize_query("".join(sp[1:]))
    print(q_doc)
    heap=[]
    for i in categories[cat]:
        cosine=0
        for k,v in q_doc.items():
            if k in docs[i].keys():
                cosine+=v*docs[i][k]*idf[k][0]
        heapq.heappush(heap,(-cosine, i))
    result=[]
    for i in range(K):
        result.append(heapq.heappop(heap))
    return result

In [152]:
topk=query_knn("cat:sports کرونا", data_cats,docs,inverted_index,5)
retrieve_docs(topk,data_IR)

{'کرونا': 1.0}


[298    https://www.isna.ir/news/99042619747/تکذیب-خبر...
 Name: url, dtype: object,
 683    https://www.isna.ir/news/99082315541/ابراهیم-ج...
 Name: url, dtype: object,
 200    https://www.isna.ir/news/99032717763/کرونایی-ه...
 Name: url, dtype: object,
 367    https://www.isna.ir/news/99052216709/رییس-فدرا...
 Name: url, dtype: object,
 318    https://www.isna.ir/news/99050402689/سرپرست-تی...
 Name: url, dtype: object]

In [151]:
topk=query_knn("cat:politics کرونا", data_cats,docs,inverted_index,5)
retrieve_docs(topk,data_IR)

{'کرونا': 1.0}


[2035    https://www.isna.ir/news/99061309882/تکذیب-شای...
 Name: url, dtype: object,
 2249    https://www.isna.ir/news/99082215136/نماینده-ا...
 Name: url, dtype: object,
 2290    https://www.isna.ir/news/99091108977/چنارانی-و...
 Name: url, dtype: object,
 1740    https://www.isna.ir/news/99011608234/محمدرضا-خ...
 Name: url, dtype: object,
 1883    https://www.isna.ir/news/99041410245/زاهدی-از-...
 Name: url, dtype: object]

In [150]:
topk=query_knn("cat:health کرونا", data_cats,docs,inverted_index,5)
retrieve_docs(topk,data_IR)

{'کرونا': 1.0}


[2035    https://www.isna.ir/news/99061309882/تکذیب-شای...
 Name: url, dtype: object,
 2249    https://www.isna.ir/news/99082215136/نماینده-ا...
 Name: url, dtype: object,
 2290    https://www.isna.ir/news/99091108977/چنارانی-و...
 Name: url, dtype: object,
 1740    https://www.isna.ir/news/99011608234/محمدرضا-خ...
 Name: url, dtype: object,
 1883    https://www.isna.ir/news/99041410245/زاهدی-از-...
 Name: url, dtype: object]

In [None]:
topk=query_knn("cat:health انقلاب", data_cats,docs,inverted_index,5)
retrieve_docs(topk,data_IR)

In [154]:
topk=query_knn("cat:culture انقلاب", data_cats,docs,inverted_index,5)
retrieve_docs(topk,data_IR)

{'انقلاب': 1.0}


[2879    https://www.isna.ir/news/98070100781/یک-فعال-د...
 Name: url, dtype: object,
 2708    https://www.isna.ir/news/98041709012/تحلیل-مرح...
 Name: url, dtype: object,
 2876    https://www.isna.ir/news/98062919996/اقتدار-کن...
 Name: url, dtype: object,
 3037    https://www.isna.ir/news/98091309931/بسیج-برنا...
 Name: url, dtype: object,
 2905    https://www.isna.ir/news/98071209011/اتفاقات-ا...
 Name: url, dtype: object]

In [153]:
topk=query_knn("cat:economy انقلاب", data_cats,docs,inverted_index,5)
retrieve_docs(topk,data_IR)

{'انقلاب': 1.0}


[3301    https://www.isna.ir/news/99020906760/اعتبار-۷۰...
 Name: url, dtype: object,
 2863    https://www.isna.ir/news/98061708728/پیگیری-فر...
 Name: url, dtype: object,
 5275    https://www.isna.ir/news/98110604396/10-هزار-و...
 Name: url, dtype: object,
 5276    https://www.isna.ir/news/98110604329/کلنگ-4000...
 Name: url, dtype: object,
 5274    https://www.isna.ir/news/98110604422/آیین-کلنگ...
 Name: url, dtype: object]

In [129]:
topk

[(-773.5279564343397, 299),
 (-740.3713727942745, 684),
 (-636.973485597178, 201),
 (-629.1959278361627, 368),
 (-602.9717559667836, 319)]

In [18]:
data=files[0]

In [20]:
data[data["id"]==9587]["url"]

9586    https://www.isna.ir/news/98012208857/ضرورت-تول...
Name: url, dtype: object