In [1]:
from parsivar import Normalizer, Tokenizer, FindStems
from stopwordsiso import stopwords
from positional_index.index import PositionalIndex
import json
import pandas as pd

## 1 - Loading the JSON data

In [2]:
# Load the news corpus into a dict keyed by document id.
# The encoding is pinned to UTF-8 because the corpus holds Persian text and
# open() otherwise falls back to the platform locale (e.g. cp1252 on Windows),
# which either raises UnicodeDecodeError or silently garbles the content.
with open('./IR_data_news_12k.json', encoding='utf-8') as json_file:
    docs = json.load(json_file)

In [None]:
# Sanity check: show the 'content' field of the document stored under key '0'.
docs['0']['content']

## 2 - Preprocessing the documents

In [3]:
# Create the parsivar helpers once; the cells below reuse these same objects.
normalizer = Normalizer()
tokenizer = Tokenizer()
stemmer = FindStems()
# Stop-word collection for language code 'fa' (Persian) from stopwordsiso.
stop_words = stopwords('fa')

In [4]:
# Preprocess every document: normalize -> tokenize -> drop stop words -> stem.
# The stems are stored alongside the raw text under docs[doc_id]['tokens'];
# the 'content' field itself is left untouched, so re-running this cell is safe.
print('Started preprocessing ...')
for doc_id in docs:
    text = docs[doc_id]['content']
    normalized_text = normalizer.normalize(text)            # Normalization
    tokens = tokenizer.tokenize_words(normalized_text)      # Tokenization
    # Handling stop words: filtering happens on the surface forms, before stemming.
    nonstop_tokens = [token for token in tokens if token not in stop_words]
    # Getting stems. dtype=object is pinned so that a document with no
    # remaining tokens still yields an object array (older pandas versions
    # default an empty Series to float64 and emit a warning).
    stemmed_tokens = (
        pd.Series(nonstop_tokens, dtype=object)
        .apply(stemmer.convert_to_stem)
        .values
    )
    docs[doc_id]['tokens'] = stemmed_tokens
print('Finished preprocessing.')

Started preprocessing ...
Finished preprocessing.


In [None]:
# Show the whole record for document '0'; once the preprocessing cell has run
# it also carries a 'tokens' entry next to 'content'.
docs['0']

In [5]:
# Build the positional index by handing each document record to it,
# keyed by the document's id.
index = PositionalIndex()
for doc_id, doc in docs.items():
    index.add_from_dict(doc_id, doc)

In [None]:
# Spot-check the index: show what is stored under the term "Asia" (Persian).
# NOTE(review): presumably this is the term's postings entry - confirm against
# the PositionalIndex implementation.
index.dictionary['آسیا']

In [None]:
# Try the normalizer on a sample query string (roughly: "US sanctions ! Iran")
# and display the normalized text.
t = normalizer.normalize('تحریم های آمریکا ! ایران')
t

In [None]:
# Split the normalized sample query into word tokens.
ts = tokenizer.tokenize_words(t)

In [None]:
# Print the stem of every token of the sample query, one per line.
# map() is lazy, so each token is still stemmed right before it is printed.
for stem in map(stemmer.convert_to_stem, ts):
    print(stem)