In [2]:
from parsivar import Normalizer, Tokenizer, FindStems
from stopwordsiso import stopwords
from positional_index.index import PositionalIndex
import json
import pandas as pd

## 1 - Loading the JSON data

In [3]:
# Load the whole news corpus into memory as `docs`.
# The encoding is given explicitly: without it, open() decodes with the platform's
# default locale encoding, which is not UTF-8 on every system and can raise
# UnicodeDecodeError or garble the Persian text.
with open('./IR_data_news_12k.json', encoding='utf-8') as json_file:
    docs = json.load(json_file)

In [None]:
# Sanity check on the load: display the 'content' field of the document stored under key '0'.
docs['0']['content']

## 2 - Preprocessing the documents

In [4]:
# Create the parsivar helpers once so the preprocessing loop reuses the same objects
# for every document instead of rebuilding them per iteration.
normalizer = Normalizer()
tokenizer = Tokenizer()
stemmer = FindStems()
# Stop-word collection for language code 'fa', used below for membership tests.
# NOTE(review): tokens are normalized before being compared against these words, but
# the stop words themselves are not normalized — confirm both use the same character forms.
stop_words = stopwords('fa')

In [5]:
# Preprocess every document: normalize -> tokenize -> drop stop words -> stem.
# The result is stored on each document under the 'tokens' key; 'content' is left
# untouched, so re-running this cell recomputes the same values.
print('Started preprocessing ...')
for doc_id, doc in docs.items():
    normalized_text = normalizer.normalize(doc['content'])          # Normalization
    tokens = tokenizer.tokenize_words(normalized_text)              # Tokenization
    # Handling stop words: keep only tokens that are not in the stop-word collection.
    nonstop_tokens = [token for token in tokens if token not in stop_words]
    # Getting stems. dtype=object is pinned so that a document with no remaining
    # tokens still yields an empty *object* array: older pandas versions default an
    # empty Series to float64 and emit a warning about it.
    stemmed_tokens = pd.Series(nonstop_tokens, dtype=object).apply(stemmer.convert_to_stem).values
    doc['tokens'] = stemmed_tokens
print('Finished preprocessing.')

Started preprocessing ...
Finished preprocessing.


In [None]:
# Display the full record stored under key '0'; after the preprocessing cell has run
# it also carries the 'tokens' entry alongside 'content'.
docs['0']

In [6]:
# Build the positional index by feeding it each document record together with its id.
index = PositionalIndex()
for doc_id, doc in docs.items():
    index.add_from_dict(doc_id, doc)

In [7]:
# Spot-check the index: display the entry stored for the term 'آسیا' ("Asia").
# The displayed output pairs document ids with lists of positions.
index.dictionary['آسیا']

[0: [5, 56, 34, 22], 8: [133, 307, 257, 165, 160, 139], 13: [145, 237, 225], 29: [502], 30: [14, 53, 20], 34: [107], 50: [436, 442], 54: [175, 187], 55: [296], 56: [56], 60: [51], 65: [20], 68: [12, 311, 292, 242, 224, 212, 206, 171, 164, 154, 100, 97, 28], 70: [10, 86], 71: [130], 84: [29], 86: [391], 91: [16], 93: [37, 62, 47], 104: [146], 122: [14, 43, 39], 130: [10, 58], 140: [192], 141: [60], 142: [46, 275, 179, 160, 134, 100, 92, 53], 143: [16, 152, 107, 94, 83], 148: [180, 545, 499, 250], 155: [170], 163: [1957, 2060, 2043, 2030, 2018], 181: [451], 182: [75], 184: [114], 202: [69, 78], 210: [12], 221: [11], 232: [31, 98, 53], 281: [96], 286: [65], 301: [8, 40], 304: [299], 305: [24], 320: [159], 322: [18, 221], 337: [15, 86, 72], 345: [13], 382: [16, 52], 387: [40, 183], 391: [54], 395: [40, 96, 90, 49], 406: [18, 26], 408: [89], 409: [6, 71, 60, 44, 16], 415: [159, 186, 166], 421: [14, 82, 69], 429: [53], 449: [283], 455: [11], 477: [140], 478: [178, 301, 280], 482: [48, 135], 