In [1]:
from parsivar import Normalizer, Tokenizer, FindStems
from stopwordsiso import stopwords
from positional_index.index import PositionalIndex
import json
import pandas as pd

## 1 - Loading the JSON data

In [2]:
# Load the news corpus into `docs`.
# The encoding is given explicitly: without it `open` falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and would
# garble or fail on the Persian text.
with open('./IR_data_news_12k.json', encoding='utf-8') as json_file:
    docs = json.load(json_file)

## 2 - Preprocessing the documents

In [3]:
# Stand-alone parsivar helpers plus the stop-word collection for language
# code 'fa'. The cells shown in this notebook do their preprocessing through
# PositionalIndex.preprocess and do not reference these objects directly.
normalizer, tokenizer, stemmer = Normalizer(), Tokenizer(), FindStems()
stop_words = stopwords('fa')

In [4]:
# Run PositionalIndex.preprocess over each document's 'content' once and
# keep the result on the same record under the 'tokens' key.
print('Started preprocessing ...')
for doc in docs.values():
    doc['tokens'] = PositionalIndex.preprocess(doc['content'])
print('Finished preprocessing')

Started preprocessing ...
Finished preprocessing


In [5]:
# Create an empty index and feed it every document record together with
# its id.
index = PositionalIndex()
for doc_id, doc in docs.items():
    index.add_from_dict(doc_id, doc)

In [6]:
# Look up the index entry stored for the term 'آسیا' ("Asia").
# Judging by the output below, the entry appears to pair each document id
# with a list of token positions -- confirm against PositionalIndex.
# NOTE(review): this displays the whole entry; consider showing only a
# small sample before sharing the notebook.
index.dictionary['آسیا']

[0: [5, 22, 34, 56], 8: [133, 139, 160, 165, 257, 307], 13: [145, 225, 237], 29: [502], 30: [14, 20, 53], 34: [107], 50: [436, 442], 54: [175, 187], 55: [296], 56: [56], 60: [51], 65: [20], 68: [12, 28, 97, 100, 154, 164, 171, 206, 212, 224, 242, 292, 311], 70: [10, 86], 71: [130], 84: [29], 86: [391], 91: [16], 93: [37, 47, 62], 104: [146], 122: [14, 39, 43], 130: [10, 58], 140: [192], 141: [60], 142: [46, 53, 92, 100, 134, 160, 179, 275], 143: [16, 83, 94, 107, 152], 148: [180, 250, 499, 545], 155: [170], 163: [1957, 2018, 2030, 2043, 2060], 181: [451], 182: [75], 184: [114], 202: [69, 78], 210: [12], 221: [11], 232: [31, 53, 98], 281: [96], 286: [65], 301: [8, 40], 304: [299], 305: [24], 320: [159], 322: [18, 221], 337: [15, 72, 86], 345: [13], 382: [16, 52], 387: [40, 183], 391: [54], 395: [40, 49, 90, 96], 406: [18, 26], 408: [89], 409: [6, 16, 44, 60, 71], 415: [159, 166, 186], 421: [14, 69, 82], 429: [53], 449: [283], 455: [11], 477: [140], 478: [178, 280, 301], 482: [48, 135], 

In [7]:
# Two-word query ("football federation"). The displayed result is a dict
# keyed by document id; the integer values are presumably per-document
# match counts -- confirm against PositionalIndex.query.
index.query('فدراسیون فوتبال')

{0: 1,
 1: 1,
 3: 4,
 7: 1,
 16: 1,
 33: 2,
 35: 1,
 36: 1,
 37: 1,
 45: 1,
 53: 2,
 79: 1,
 80: 2,
 81: 3,
 84: 1,
 86: 2,
 87: 1,
 89: 6,
 127: 3,
 128: 1,
 134: 1,
 137: 1,
 138: 1,
 139: 10,
 165: 2,
 171: 1,
 179: 2,
 182: 5,
 184: 2,
 191: 1,
 202: 8,
 204: 2,
 229: 1,
 259: 4,
 293: 1,
 302: 3,
 304: 1,
 311: 3,
 319: 1,
 324: 6,
 327: 1,
 338: 1,
 349: 3,
 352: 4,
 354: 2,
 375: 1,
 378: 1,
 385: 11,
 387: 2,
 401: 2,
 402: 2,
 404: 8,
 409: 2,
 417: 1,
 446: 4,
 458: 2,
 464: 1,
 472: 1,
 478: 1,
 483: 1,
 493: 1,
 499: 1,
 502: 4,
 505: 1,
 509: 1,
 510: 2,
 511: 1,
 513: 1,
 534: 2,
 555: 2,
 560: 4,
 577: 2,
 578: 1,
 582: 1,
 592: 1,
 629: 1,
 687: 1,
 699: 2,
 716: 1,
 719: 4,
 775: 3,
 789: 2,
 790: 3,
 797: 4,
 798: 2,
 805: 3,
 841: 4,
 847: 2,
 848: 2,
 856: 1,
 857: 1,
 860: 5,
 868: 2,
 870: 1,
 885: 2,
 894: 1,
 895: 1,
 912: 1,
 913: 1,
 923: 1,
 937: 3,
 940: 1,
 953: 1,
 964: 1,
 982: 6,
 987: 5,
 1004: 1,
 1010: 1,
 1011: 2,
 1020: 1,
 1030: 1,
 1053: 1,
 1054:

In [8]:
# Run the query string through the same preprocessing used for the
# documents; the result (a list of two terms, see output) feeds the next cell.
words = index.preprocess('فدراسیون فوتبال')
words

['فدراسیون', 'فوتبال']

In [9]:
# Fold the query terms left to right by hand. `stack` never holds more than
# one element: the running result so far. The first term is looked up with
# get_phrase_docs; every later term is combined with the running result via
# get_phrase_and (presumably an AND / adjacency step -- confirm against
# PositionalIndex).
stack = []
for word in words:
    if stack:
        n = index.get_phrase_and(stack.pop(), [word])
    else:
        n = index.get_phrase_docs([word])
    stack.append(n)
    # Show the running result after each term.
    print(stack[0])

{0: 1, 1: 1, 3: 4, 7: 1, 8: 2, 13: 15, 16: 1, 27: 11, 30: 1, 33: 2, 35: 1, 36: 1, 37: 1, 45: 3, 50: 3, 52: 1, 53: 2, 55: 1, 56: 1, 58: 1, 62: 2, 65: 5, 68: 6, 70: 2, 79: 1, 80: 2, 81: 3, 84: 1, 86: 6, 87: 1, 89: 6, 100: 5, 106: 14, 122: 2, 127: 3, 128: 2, 133: 1, 134: 1, 137: 1, 138: 1, 139: 10, 140: 8, 144: 8, 147: 2, 154: 17, 160: 11, 164: 4, 165: 2, 169: 4, 171: 1, 174: 2, 176: 1, 179: 4, 182: 5, 184: 3, 191: 1, 202: 8, 204: 4, 209: 7, 228: 2, 229: 4, 259: 4, 293: 1, 302: 4, 304: 1, 311: 3, 312: 4, 319: 1, 324: 15, 327: 1, 338: 1, 349: 3, 352: 4, 354: 2, 371: 6, 375: 1, 378: 8, 385: 17, 387: 2, 391: 1, 396: 3, 397: 2, 401: 2, 402: 2, 404: 8, 408: 3, 409: 2, 417: 1, 419: 1, 420: 2, 435: 10, 445: 1, 446: 4, 449: 3, 451: 1, 452: 2, 454: 6, 456: 8, 457: 1, 458: 13, 459: 8, 460: 1, 461: 9, 463: 3, 464: 1, 465: 6, 468: 1, 469: 10, 472: 11, 473: 4, 478: 1, 479: 3, 483: 1, 493: 1, 495: 5, 499: 1, 502: 4, 505: 1, 508: 4, 509: 1, 510: 2, 511: 1, 513: 1, 517: 4, 523: 1, 534: 2, 544: 12, 545: 2

In [30]:
# Query containing a standalone '!' token between the two terms. Compared
# with the plain two-word query, the result below drops documents that
# matched it, so '!' appears to act as a NOT operator -- confirm against
# PositionalIndex.query.
#
# For the '!' token to survive preprocessing it must not be treated as a
# stop word. The notebook's execution counts show that this cell was run
# after the stop-word adjustment in the last cell, so on a fresh top-to-bottom
# run that adjustment would not have happened yet. It is applied here so
# the cell stands on its own; the operation is idempotent.
PositionalIndex.stop_words = PositionalIndex.stop_words.difference({'!'})

text = 'فدراسیون' + ' ! ' + 'فوتبال'
index.query(text)

{8: 2,
 13: 15,
 27: 11,
 30: 1,
 50: 3,
 52: 1,
 55: 1,
 56: 1,
 58: 1,
 62: 2,
 65: 5,
 68: 6,
 70: 2,
 100: 5,
 106: 14,
 122: 2,
 133: 1,
 140: 8,
 144: 8,
 147: 2,
 154: 17,
 160: 11,
 164: 4,
 169: 4,
 174: 2,
 176: 1,
 209: 7,
 228: 2,
 312: 4,
 371: 6,
 391: 1,
 396: 3,
 397: 2,
 408: 3,
 419: 1,
 420: 2,
 435: 10,
 445: 1,
 449: 3,
 451: 1,
 452: 2,
 454: 6,
 456: 8,
 457: 1,
 459: 8,
 460: 1,
 461: 9,
 463: 3,
 465: 6,
 468: 1,
 469: 10,
 473: 4,
 479: 3,
 495: 5,
 508: 4,
 517: 4,
 523: 1,
 544: 12,
 545: 2,
 547: 1,
 549: 1,
 550: 1,
 554: 1,
 562: 10,
 568: 3,
 569: 2,
 570: 2,
 574: 2,
 576: 4,
 596: 1,
 638: 2,
 643: 1,
 650: 1,
 674: 8,
 695: 9,
 730: 16,
 732: 6,
 733: 11,
 739: 1,
 754: 1,
 756: 2,
 773: 3,
 783: 2,
 786: 8,
 800: 1,
 818: 4,
 819: 7,
 828: 1,
 850: 4,
 872: 1,
 890: 1,
 901: 1,
 911: 10,
 924: 5,
 926: 2,
 932: 2,
 938: 2,
 944: 7,
 960: 3,
 996: 6,
 1000: 2,
 1013: 7,
 1025: 1,
 1067: 1,
 1102: 2,
 1104: 2,
 1106: 2,
 1109: 4,
 1113: 2,
 1119: 2,
 1

In [29]:
# Check what preprocessing does to a query that contains a standalone '!'.
# The output below keeps the '!' token.
# NOTE(review): the execution counts indicate this ran after the cell that
# removes '!' from PositionalIndex.stop_words; on a fresh top-to-bottom run
# the output may differ -- confirm.
index.preprocess('کتاب های ! خواندنی')

['کتاب', '!', 'خواندنی']

In [28]:
# Stop treating '!' as a stop word so that it survives preprocessing and can
# be used as a token in queries.
# The token is passed inside a set: `difference` iterates its argument, and a
# bare string would be split into characters, which is correct only by
# accident for a one-character token.
# NOTE(review): the execution counts show this cell was run before the
# '!' cells that appear earlier in the notebook; in a top-to-bottom run it
# has to execute before any query that contains '!'.
PositionalIndex.stop_words = PositionalIndex.stop_words.difference({'!'})