In [1]:
!pip install hazm



In [2]:
from hazm import Normalizer, WordTokenizer, Stemmer
import nltk
import pandas as pd
import re
from string import punctuation as punctuation_str
import csv
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import math

In [3]:
!wget https://github.com/mohamad-dehghani/persian-pdf-books-dataset/raw/master/final_books.xlsx

--2023-11-12 20:26:35--  https://github.com/mohamad-dehghani/persian-pdf-books-dataset/raw/master/final_books.xlsx
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/mohamad-dehghani/persian-pdf-books-dataset/master/final_books.xlsx [following]
--2023-11-12 20:26:36--  https://raw.githubusercontent.com/mohamad-dehghani/persian-pdf-books-dataset/master/final_books.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1380625 (1.3M) [application/octet-stream]
Saving to: ‘final_books.xlsx.3’


2023-11-12 20:26:36 (20.0 MB/s) - ‘final_books.xlsx.3’ saved [1380625/1380625]



#Persian Dataset

In [4]:
df = pd.read_excel('/content/final_books.xlsx')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2824 entries, 0 to 2823
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     2824 non-null   object
 1   date      2824 non-null   object
 2   content   2441 non-null   object
 3   category  2824 non-null   object
 4   author    2824 non-null   object
 5   comments  2824 non-null   object
dtypes: object(6)
memory usage: 132.5+ KB


In [5]:
df.head()

Unnamed: 0,title,date,content,category,author,comments
0,بهترین درسهای زندگی‎,۲ دی ۱۳۹۸,تی دی جیکس یک کشیش، نویسنده، سخنران آمریکایی ...,روانشناسی,سارا رزولت,بدون دیدگاه
1,از سلاجقه تا صفویه,۱ دی ۱۳۹۸,کتاب حاضر که اینک در دسترس خوانندگان محترم قر...,تاریخ ایران,نصرت الله مشکوتی,بدون دیدگاه
2,سرگذشت رضاشاه,۲۹ آذر ۱۳۹۸,دکتر فتح الله بینا تحصیلات خود را در رشته پزش...,تاریخ ایران,فتح الله بینا,بدون دیدگاه
3,سیری نوین در فرگشت و اسرار مغز,۲۸ آذر ۱۳۹۸,مجموعه پیش رو مطالبیست که در گروه Evolution ت...,علم و دانش,مانی منوچهری,بدون دیدگاه
4,آسمان پرستاره,۲۶ آذر ۱۳۹۸,کتاب آسمان پرستاره نوشته‌ی کتی هایدن، به زبان...,کودکان و نوجوانان,کتی هایدن,بدون دیدگاه


In [6]:
def remove_symbols_and_numbers(content):
  content = re.sub(f'[{punctuation_str}؟!،,?،٪×÷»«><:–]', '', content)
  return content

In [7]:
def normalizer(text):
  normalizer = Normalizer()
  removed_symbol_text = remove_symbols_and_numbers(text)
  normalized_text = normalizer.normalize(removed_symbol_text)
  normalized_text = normalized_text.replace("\u200c", " ")
  normalized_text = normalized_text.replace("\u200e", " ")
  return normalized_text

In [8]:
def tokenizer(normalized_text):
  word_tokenizer = WordTokenizer()
  persian_stopwords = pd.read_fwf('https://raw.githubusercontent.com/sobhe/hazm/master/hazm/data/stopwords.dat', header=None)[0].to_list()
  tokens = word_tokenizer.tokenize(normalized_text)
  tokens = list(filter(lambda t: t not in persian_stopwords, tokens))
  return tokens

In [9]:
def stemmer(tokens):
  stemmer = Stemmer()
  stems = [stemmer.stem(token) for token in tokens]
  stems = list(filter(lambda t: len(t) >= 2, stems))
  return stems

In [10]:
def persian_preprocess(text):
  normalized_text = normalizer(text)
  tokens = tokenizer(normalized_text)
  stems = stemmer(tokens)
  return stems

In [11]:
def create_posting_list(turn, tokens, postingList):
  for j, word in enumerate(tokens):
    postingList["total"] += 1
    if word in postingList["words"].keys():
        if turn not in postingList["words"][word].keys():
            postingList["words"][word]["docs_id"][turn] = 1
            postingList["words"][word]["count"] += 1
        else:
            postingList["words"][word]["docs_id"][turn] += 1
            postingList["words"][word]["count"] += 1
    else:
        postingList["words"][word] = {"docs_id": {}, "count": 1}
        postingList["words"][word]["docs_id"][turn] = 1
  return postingList

In [12]:
start_doc=0
end_doc=10
#df.iloc[start_doc:end_doc]

In [13]:
columnsName = df.columns
columnsName

Index(['title', 'date', 'content', 'category', 'author', 'comments'], dtype='object')

In [14]:
#Preprocess
df['Preprocessed']=list
for index, row in tqdm (df.iloc[start_doc:end_doc].iterrows()):
  df.at[index, 'Preprocessed'] = []
  for column in columnsName:
    if pd.isna(row[column]):
      continue
    df.at[index, 'Preprocessed'].append(persian_preprocess(row[column]))
  df.at[index, 'Preprocessed'] = [item for sublist in df.at[index, 'Preprocessed'] for item in sublist]

10it [00:46,  4.65s/it]


In [15]:
#Check the result of preprocess
df.iloc[start_doc:end_doc]

Unnamed: 0,title,date,content,category,author,comments,Preprocessed
0,بهترین درسهای زندگی‎,۲ دی ۱۳۹۸,تی دی جیکس یک کشیش، نویسنده، سخنران آمریکایی ...,روانشناسی,سارا رزولت,بدون دیدگاه,"[درس, زندگ, ۱۳۹۸, جیکس, کش, نویسنده, سخنر, آمر..."
1,از سلاجقه تا صفویه,۱ دی ۱۳۹۸,کتاب حاضر که اینک در دسترس خوانندگان محترم قر...,تاریخ ایران,نصرت الله مشکوتی,بدون دیدگاه,"[سلاجقه, صفویه, ۱۳۹۸, کتاب, حاضر, اینک, دسترس,..."
2,سرگذشت رضاشاه,۲۹ آذر ۱۳۹۸,دکتر فتح الله بینا تحصیلات خود را در رشته پزش...,تاریخ ایران,فتح الله بینا,بدون دیدگاه,"[سرگذ, رضاشاه, ۲۹, آذر, ۱۳۹۸, دک, فتح, الله, ب..."
3,سیری نوین در فرگشت و اسرار مغز,۲۸ آذر ۱۳۹۸,مجموعه پیش رو مطالبیست که در گروه Evolution ت...,علم و دانش,مانی منوچهری,بدون دیدگاه,"[سیر, نوین, فرگ, اسرار, مغز, ۲۸, آذر, ۱۳۹۸, مج..."
4,آسمان پرستاره,۲۶ آذر ۱۳۹۸,کتاب آسمان پرستاره نوشته‌ی کتی هایدن، به زبان...,کودکان و نوجوانان,کتی هایدن,بدون دیدگاه,"[آس, پرستاره, ۲۶, آذر, ۱۳۹۸, کتاب, آس, پرستاره..."
5,تاریخ مبارک غازانی,۲۴ آذر ۱۳۹۸,تاریخ مبارک غازانى، اثر فارسی رشیدالدین فضل‌ا...,تاریخ ایران,رشید الدین فضل الله,بدون دیدگاه,"[تاریخ, مبارک, غازان, ۲۴, آذر, ۱۳۹۸, تاریخ, مب..."
6,وبلاگ نویسی شیرین با وردپرس,۲۳ آذر ۱۳۹۸,کتاب وبلاگ نویسی شیرین با WordPress به آموزش ...,طراحی وب سایت,مایا,بدون دیدگاه,"[وبلاگ, نویس, شیرین, وردپرس, ۲۳, آذر, ۱۳۹۸, کت..."
7,نقش روابط جنسی و راز کامیابی در زندگی,۲۲ آذر ۱۳۹۸,استاد من پروفسور ویلسون که اینک در بیماریهای ...,پزشکی و سلامت,هادی محمدپور,بدون دیدگاه,"[نق, روابط, جنس, راز, کامیاب, زندگ, ۲۲, آذر, ۱..."
8,معرفی کتاب الکترونیکی,۲۲ آذر ۱۳۹۸,چیست؟ از آغاز تاریخ بشریت تاکنون شکل کتاب ها ...,دسته‌بندی نشده,,بدون دیدگاه,"[معرف, کتاب, الکترونیک, ۲۲, آذر, ۱۳۹۸, چیس, آغ..."
9,فرهنگ ریشه واژگان فارسی,۲۰ آذر ۱۳۹۸,ساختار کتاب چنین است که بر سر هر رویه درآیه‌ی...,آموزش زبان,دکتر علی نورائی,بدون دیدگاه,"[فرهنگ, ریشه, واژگ, فارس, ۲۰, آذر, ۱۳۹۸, ساختا..."


In [16]:
#Create Posting List
persianPostingList = {"words": {}, "total": 0}
for index, row in tqdm (df.iloc[start_doc:end_doc].iterrows()):
      persianPostingList = create_posting_list(index, row["Preprocessed"], persianPostingList)

10it [00:00, 2702.34it/s]


In [17]:
print("\nPosting List:")
print(persianPostingList)


Posting List:
{'words': {'درس': {'docs_id': {0: 1}, 'count': 1}, 'زندگ': {'docs_id': {0: 1, 5: 1, 7: 1, 8: 1}, 'count': 6}, '۱۳۹۸': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}, 'count': 10}, 'جیکس': {'docs_id': {0: 1}, 'count': 5}, 'کش': {'docs_id': {0: 1}, 'count': 2}, 'نویسنده': {'docs_id': {0: 1, 5: 1, 9: 1}, 'count': 3}, 'سخنر': {'docs_id': {0: 1}, 'count': 3}, 'آمریکا': {'docs_id': {0: 1, 9: 1}, 'count': 3}, 'سمینار': {'docs_id': {0: 1}, 'count': 1}, 'انگیزش': {'docs_id': {0: 1}, 'count': 3}, 'مذهب': {'docs_id': {0: 1, 5: 1}, 'count': 2}, 'دلیل': {'docs_id': {0: 1}, 'count': 1}, 'ایر': {'docs_id': {0: 1, 1: 1, 2: 1, 5: 1}, 'count': 9}, 'سیاه': {'docs_id': {0: 1}, 'count': 1}, 'پوس': {'docs_id': {0: 1}, 'count': 1}, 'لقب': {'docs_id': {0: 1}, 'count': 1}, 'گرفته_اس': {'docs_id': {0: 1}, 'count': 1}, 'زندگینامه': {'docs_id': {0: 1}, 'count': 1}, 'شاهد': {'docs_id': {0: 1, 8: 1}, 'count': 2}, 'فعال': {'docs_id': {0: 1}, 'count': 2}, 'مختلف': {'docs_id': 

In [18]:
#Sorting Posting List by count

sorted_words = sorted(persianPostingList['words'].items(), key=lambda x: x[1]['count'], reverse=True)
sorted_persianPostingList = {'words': dict(sorted_words), 'total': persianPostingList['total']}
print(sorted_persianPostingList)

{'words': {'کتاب': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 8: 1, 9: 1}, 'count': 30}, 'تاریخ': {'docs_id': {1: 1, 2: 1, 5: 1, 8: 1}, 'count': 19}, 'زب': {'docs_id': {5: 1, 6: 1, 9: 1}, 'count': 13}, 'دیدگاه': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}, 'count': 12}, 'واژه': {'docs_id': {9: 1}, 'count': 11}, '۱۳۹۸': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}, 'count': 10}, 'ایر': {'docs_id': {0: 1, 1: 1, 2: 1, 5: 1}, 'count': 9}, 'آذر': {'docs_id': {2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1}, 'count': 8}, 'فرهنگ': {'docs_id': {8: 1, 9: 1}, 'count': 8}, 'غاز': {'docs_id': {5: 1}, 'count': 7}, 'وردپرس': {'docs_id': {6: 1}, 'count': 7}, 'زندگ': {'docs_id': {0: 1, 5: 1, 7: 1, 8: 1}, 'count': 6}, 'استفاده': {'docs_id': {4: 1, 6: 1, 8: 1}, 'count': 6}, 'جیکس': {'docs_id': {0: 1}, 'count': 5}, 'مختلف': {'docs_id': {0: 1, 3: 1, 8: 1}, 'count': 5}, 'پا': {'docs_id': {0: 1, 1: 1, 2: 1}, 'count': 5}, 'الله': {'docs_id'

In [19]:
#Save Persian data frame after the preprocess
df.to_csv('persian.csv')

In [20]:
#Save Posting List
with open("/content/persian_outputPosting.txt", "w") as txt_file:
        txt_file.write(str(sorted_persianPostingList))

In [21]:
#Size of files
!ls -l --block-size=MB

total 58MB
-rw-r--r-- 1 root  root   2MB Nov 12 17:57 final_books.xlsx
-rw-r--r-- 1 root  root   2MB Nov 12 19:43 final_books.xlsx.1
-rw-r--r-- 1 root  root   2MB Nov 12 20:25 final_books.xlsx.2
-rw-r--r-- 1 root  root   2MB Nov 12 20:26 final_books.xlsx.3
drwxrwxr-x 2 14584 14584  1MB May  6  2013 MovieSummaries
-rw-r--r-- 1 root  root  49MB Jun 11  2015 MovieSummaries.tar.gz
-rw-r--r-- 1 root  root   4MB Nov 12 20:27 persian.csv
-rw-r--r-- 1 root  root   1MB Nov 12 20:27 persian_outputPosting.txt
drwxr-xr-x 1 root  root   1MB Nov  9 14:25 sample_data


#English Dataset

In [22]:
!wget http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
!tar -xf MovieSummaries.tar.gz

--2023-11-12 20:27:23--  http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 48002242 (46M) [application/x-gzip]
Saving to: ‘MovieSummaries.tar.gz.1’


2023-11-12 20:27:38 (3.11 MB/s) - ‘MovieSummaries.tar.gz.1’ saved [48002242/48002242]



In [23]:
df2 = pd.read_csv("/content/MovieSummaries/plot_summaries.txt", delimiter = "\t",names=["id","text"])
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      42303 non-null  int64 
 1   text    42303 non-null  object
dtypes: int64(1), object(1)
memory usage: 661.1+ KB


In [24]:
df2.head()

Unnamed: 0,id,text
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [27]:
def convertToLowerCase(row):
    return row.lower()

In [28]:
# def deleteNumbers(row):
#     return re.sub(r'\d+', '', row)

In [29]:
def deletePunctuation(row):
    return re.sub(r'[^\w\s]', '', row)

In [30]:
def deleteWhiteSpace(row):
    return row.strip()

In [31]:
def english_normalizer(text):
  to_lower_case_text = convertToLowerCase(text)
  deleted_punct_words = deletePunctuation(to_lower_case_text)
  deleted_white_space = deleteWhiteSpace(deleted_punct_words)
  return deleted_white_space

In [32]:
def english_tokenizer(normalized_text):
  english_stopwords = set(stopwords.words('english'))
  tokens = word_tokenize(normalized_text)
  tokens = list(filter(lambda t: t not in english_stopwords, tokens))
  return tokens


In [33]:
def english_stemmer(tokens):
  stems = [PorterStemmer().stem(token) for token in tokens]
  return stems

In [34]:
def english_preprocess(text):
  normalized_text = english_normalizer(text)
  tokens = english_tokenizer(normalized_text)
  stems = english_stemmer(tokens)
  return stems

In [35]:
#df2.iloc[start_doc:end_doc]

In [36]:
#Preprocess
df2['Preprocessed'] = list

for index, row in tqdm(df2.iloc[start_doc:end_doc].iterrows()):
  df.at[index, 'Preprocessed'] = []
  df2.at[index, 'Preprocessed'] = english_preprocess(row["text"])

10it [00:00, 139.23it/s]


In [37]:
df2.iloc[start_doc:end_doc]

Unnamed: 0,id,text,Preprocessed
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...","[shlykov, hardwork, taxi, driver, lyosha, saxo..."
1,31186339,The nation of Panem consists of a wealthy Capi...,"[nation, panem, consist, wealthi, capitol, twe..."
2,20663735,Poovalli Induchoodan is sentenced for six yea...,"[pooval, induchoodan, sentenc, six, year, pris..."
3,2231378,"The Lemon Drop Kid , a New York City swindler,...","[lemon, drop, kid, new, york, citi, swindler, ..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...,"[seventhday, adventist, church, pastor, michae..."
5,5272176,The president is on his way to give a speech. ...,"[presid, way, give, speech, travel, man, show,..."
6,1952976,"{{plot}} The film opens in 1974, as a young gi...","[plot, film, open, 1974, young, girl, dahlia, ..."
7,24225279,"The story begins with Hannah, a young Jewish t...","[stori, begin, hannah, young, jewish, teen, co..."
8,2462689,Infuriated at being told to write one final co...,"[infuri, told, write, one, final, column, laid..."
9,20532852,A line of people drool at the window of the s...,"[line, peopl, drool, window, shop, market, but..."


In [38]:
#Create Posting List

englishPostingList = {"words": {}, "total": 0}
for index, row in tqdm (df2.iloc[start_doc:end_doc].iterrows()):
      englishPostingList = create_posting_list(index, row["Preprocessed"], englishPostingList)

10it [00:00, 2338.35it/s]


In [39]:
#Sorting Posting List by count
en_sorted_words = sorted(englishPostingList['words'].items(), key=lambda x: x[1]['count'], reverse=True)
sorted_englishPostingList = {'words': dict(en_sorted_words), 'total': englishPostingList['total']}
print(sorted_englishPostingList)

{'words': {'kid': {'docs_id': {3: 1, 7: 1}, 'count': 36}, 'katniss': {'docs_id': {1: 1}, 'count': 24}, 'dahlia': {'docs_id': {6: 1}, 'count': 21}, 'cecilia': {'docs_id': {6: 1}, 'count': 19}, 'induchoodan': {'docs_id': {2: 1}, 'count': 18}, 'charley': {'docs_id': {3: 1}, 'count': 18}, 'peeta': {'docs_id': {1: 1}, 'count': 16}, 'thoma': {'docs_id': {5: 1}, 'count': 15}, 'hannah': {'docs_id': {7: 1}, 'count': 15}, 'kill': {'docs_id': {1: 1, 2: 1, 5: 1, 8: 1}, 'count': 14}, 'find': {'docs_id': {1: 1, 3: 1, 4: 1, 5: 1, 6: 1}, 'count': 14}, 'home': {'docs_id': {3: 1, 5: 1, 6: 1, 7: 1}, 'count': 14}, 'domin': {'docs_id': {7: 1}, 'count': 14}, 'girl': {'docs_id': {1: 1, 2: 1, 6: 1}, 'count': 13}, 'menon': {'docs_id': {2: 1}, 'count': 12}, 'school': {'docs_id': {6: 1, 7: 1}, 'count': 12}, 'natasha': {'docs_id': {6: 1}, 'count': 12}, 'rue': {'docs_id': {1: 1}, 'count': 11}, 'return': {'docs_id': {1: 1, 2: 1, 3: 1, 4: 1, 6: 1, 7: 1}, 'count': 11}, 'money': {'docs_id': {3: 1, 7: 1, 8: 1}, 'count'

In [40]:
print(englishPostingList)

{'words': {'shlykov': {'docs_id': {0: 1}, 'count': 1}, 'hardwork': {'docs_id': {0: 1}, 'count': 1}, 'taxi': {'docs_id': {0: 1}, 'count': 1}, 'driver': {'docs_id': {0: 1, 7: 1}, 'count': 2}, 'lyosha': {'docs_id': {0: 1}, 'count': 1}, 'saxophonist': {'docs_id': {0: 1}, 'count': 1}, 'develop': {'docs_id': {0: 1, 8: 1}, 'count': 2}, 'bizarr': {'docs_id': {0: 1, 4: 1}, 'count': 2}, 'loveh': {'docs_id': {0: 1}, 'count': 1}, 'relationship': {'docs_id': {0: 1}, 'count': 1}, 'despit': {'docs_id': {0: 1, 6: 1, 7: 1}, 'count': 4}, 'prejudic': {'docs_id': {0: 1}, 'count': 1}, 'realiz': {'docs_id': {0: 1, 6: 1, 8: 1, 9: 1}, 'count': 4}, 'arent': {'docs_id': {0: 1}, 'count': 1}, 'differ': {'docs_id': {0: 1, 7: 1}, 'count': 2}, 'nation': {'docs_id': {1: 1, 8: 1}, 'count': 2}, 'panem': {'docs_id': {1: 1}, 'count': 1}, 'consist': {'docs_id': {1: 1}, 'count': 1}, 'wealthi': {'docs_id': {1: 1}, 'count': 1}, 'capitol': {'docs_id': {1: 1}, 'count': 2}, 'twelv': {'docs_id': {1: 1}, 'count': 1}, 'poorer': {'

In [41]:
#Save Persian data frame after the preprocess
df2.to_csv('english.csv')

In [42]:
#Save Posting List
with open("/content/english_outputPosting.txt", "w") as txt_file:
        txt_file.write(str(sorted_englishPostingList))

In [43]:
!ls -l --block-size=MB

total 183MB
-rw-r--r-- 1 root  root  78MB Nov 12 20:27 english.csv
-rw-r--r-- 1 root  root   1MB Nov 12 20:27 english_outputPosting.txt
-rw-r--r-- 1 root  root   2MB Nov 12 17:57 final_books.xlsx
-rw-r--r-- 1 root  root   2MB Nov 12 19:43 final_books.xlsx.1
-rw-r--r-- 1 root  root   2MB Nov 12 20:25 final_books.xlsx.2
-rw-r--r-- 1 root  root   2MB Nov 12 20:26 final_books.xlsx.3
drwxrwxr-x 2 14584 14584  1MB May  6  2013 MovieSummaries
-rw-r--r-- 1 root  root  49MB Jun 11  2015 MovieSummaries.tar.gz
-rw-r--r-- 1 root  root  49MB Jun 11  2015 MovieSummaries.tar.gz.1
-rw-r--r-- 1 root  root   4MB Nov 12 20:27 persian.csv
-rw-r--r-- 1 root  root   1MB Nov 12 20:27 persian_outputPosting.txt
drwxr-xr-x 1 root  root   1MB Nov  9 14:25 sample_data


#Boolean model

In [44]:
def getBooleanModel(start_doc, end_doc, postingList):
    boolean_model_all = {}
    for i in range(start_doc, end_doc):
        boolean_model_all.setdefault(i, {})
        for word in postingList["words"]:
            if i in postingList["words"][word]["docs_id"]:
                boolean_model_all[i][word] = 1
            else:
                boolean_model_all[i][word] = 0
    return boolean_model_all

In [45]:
def queryStemmer(word):
  norm = convertToLowerCase(word)
  stemmer = Stemmer()
  stem_word = stemmer.stem(norm)
  return stem_word

In [46]:
def ready_query(query, postingList, booleanModel):
  query_dict = {}
  for word in query:
    if "!" in word:
      word = word.replace('!','')
    norm_word = queryStemmer(word)
    query_dict[norm_word] = []
    for i in range(start_doc, end_doc):
      if ((word in postingList["words"].keys()) and (booleanModel[i][word] == 1)):
        query_dict[norm_word].append(i)
      if ((norm_word in postingList["words"]) and (booleanModel[i][norm_word] == 1)):
        query_dict[norm_word].append(i)
  return query_dict

In [47]:
def and_find_docs(query, start_doc, end_doc, query_dict):
  doc_ids = {i for i in range(start_doc, end_doc)}
  for word in query:
    if "!" in word:
      word = word.replace('!','')
      word = queryStemmer(word)
      doc_ids = doc_ids - set(query_dict[word])

    else:
      word = queryStemmer(word)
      doc_ids = doc_ids & set(query_dict[word])

  return doc_ids

In [48]:
def or_find_docs(query, start_doc, end_doc, query_dict):
  final_doc_id = set()
  doc_ids = {i for i in range(start_doc, end_doc)}
  for word in query:
    if "!" in word:
      word = word.replace('!','')
      word = queryStemmer(word)
      final_doc_id.update(list(doc_ids - set(query_dict[word])))

    else:
      word = queryStemmer(word)
      final_doc_id.update(list(doc_ids & set(query_dict[word])))

  return final_doc_id

In [49]:
persianBooleanModel = getBooleanModel(start_doc, end_doc, sorted_persianPostingList)
englishBooleanModel = getBooleanModel(start_doc, end_doc, sorted_englishPostingList)

In [None]:
print("1:Persian 2:English")
l=str(input())

query = str(input("enter query:"))

docs = []
allDocs = []

if l=='1':
  if "+" in query:
    query = query.split("+")
    allDocs = ready_query(query, sorted_persianPostingList, persianBooleanModel)
    docs = or_find_docs(query, start_doc, end_doc, allDocs)
  elif "." in query:
    query = query.split(".")
    allDocs = ready_query(query, sorted_persianPostingList, persianBooleanModel)
    docs = and_find_docs(query, start_doc, end_doc, allDocs)
else:
  if "+" in query:
    query = query.split("+")
    allDocs = ready_query(query, sorted_englishPostingList, englishBooleanModel)
    docs = or_find_docs(query, start_doc, end_doc, allDocs)
  elif "." in query:
    query = query.split(".")
    allDocs = ready_query(query, sorted_englishPostingList, englishBooleanModel)
    docs = and_find_docs(query, start_doc, end_doc, allDocs)

print(docs)

1:Persian 2:English
