## Group 5
##### Ehsan Hosseini 9912223190
##### Pegah Motahari 9812223078

In [1]:
pip install hazm



In [2]:
from hazm import Normalizer, WordTokenizer, Stemmer
import nltk
import pandas as pd
import re
from string import punctuation as punctuation_str
import csv
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import math
import numpy as np
import glob
import os


In [3]:
!wget https://github.com/mohamad-dehghani/persian-pdf-books-dataset/raw/master/final_books.xlsx

--2023-12-01 12:59:16--  https://github.com/mohamad-dehghani/persian-pdf-books-dataset/raw/master/final_books.xlsx
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/mohamad-dehghani/persian-pdf-books-dataset/master/final_books.xlsx [following]
--2023-12-01 12:59:17--  https://raw.githubusercontent.com/mohamad-dehghani/persian-pdf-books-dataset/master/final_books.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1380625 (1.3M) [application/octet-stream]
Saving to: ‘final_books.xlsx’


2023-12-01 12:59:18 (6.54 MB/s) - ‘final_books.xlsx’ saved [1380625/1380625]



# Persian Dataset

In [4]:
df = pd.read_excel('final_books.xlsx')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2824 entries, 0 to 2823
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     2824 non-null   object
 1   date      2824 non-null   object
 2   content   2441 non-null   object
 3   category  2824 non-null   object
 4   author    2824 non-null   object
 5   comments  2824 non-null   object
dtypes: object(6)
memory usage: 132.5+ KB


In [5]:
df.head()

Unnamed: 0,title,date,content,category,author,comments
0,بهترین درسهای زندگی‎,۲ دی ۱۳۹۸,تی دی جیکس یک کشیش، نویسنده، سخنران آمریکایی ...,روانشناسی,سارا رزولت,بدون دیدگاه
1,از سلاجقه تا صفویه,۱ دی ۱۳۹۸,کتاب حاضر که اینک در دسترس خوانندگان محترم قر...,تاریخ ایران,نصرت الله مشکوتی,بدون دیدگاه
2,سرگذشت رضاشاه,۲۹ آذر ۱۳۹۸,دکتر فتح الله بینا تحصیلات خود را در رشته پزش...,تاریخ ایران,فتح الله بینا,بدون دیدگاه
3,سیری نوین در فرگشت و اسرار مغز,۲۸ آذر ۱۳۹۸,مجموعه پیش رو مطالبیست که در گروه Evolution ت...,علم و دانش,مانی منوچهری,بدون دیدگاه
4,آسمان پرستاره,۲۶ آذر ۱۳۹۸,کتاب آسمان پرستاره نوشته‌ی کتی هایدن، به زبان...,کودکان و نوجوانان,کتی هایدن,بدون دیدگاه


### persian dataset pre-process

In [6]:
def remove_symbols_and_numbers(content):
  content = re.sub(f'[{punctuation_str}؟!،,?،٪×÷»«><:–]', '', content)
  return content

In [7]:
def normalizer(text):
  normalizer = Normalizer()
  removed_symbol_text = remove_symbols_and_numbers(text)
  normalized_text = normalizer.normalize(removed_symbol_text)
  normalized_text = normalized_text.replace("\u200c", " ")
  normalized_text = normalized_text.replace("\u200e", " ")
  return normalized_text

In [8]:
def tokenizer(normalized_text):
  word_tokenizer = WordTokenizer()
  tokens = word_tokenizer.tokenize(normalized_text)
  return tokens

In [9]:
def stemmer(tokens):
  stemmer = Stemmer()
  stems = [stemmer.stem(token) for token in tokens]
  stems = list(filter(lambda t: len(t) >= 3, stems))
  return stems

In [10]:
def persian_preprocess(text):
  normalized_text = normalizer(text)
  tokens = tokenizer(normalized_text)
  stems = stemmer(tokens)
  return stems

In [11]:
def create_posting_list(turn, tokens, postingList):
  for j, word in enumerate(tokens):
    postingList["total"] += 1
    if word in postingList["words"].keys():
        if turn not in postingList["words"][word].keys():
            postingList["words"][word]["docs_id"][turn] = 1
            postingList["words"][word]["count"] += 1
        else:
            postingList["words"][word]["docs_id"][turn] += 1
            postingList["words"][word]["count"] += 1
    else:
        postingList["words"][word] = {"docs_id": {}, "count": 1}
        postingList["words"][word]["docs_id"][turn] = 1
  return postingList

In [12]:
persian_start_doc=0
#the process must be done for the first 1000 documents
persian_end_doc=1000

In [13]:
columnsName = df.columns
columnsName

Index(['title', 'date', 'content', 'category', 'author', 'comments'], dtype='object')

In [14]:
#Preprocess
df['Preprocessed']=list
for index, row in tqdm (df.iloc[persian_start_doc:persian_end_doc].iterrows()):
  df.at[index, 'Preprocessed'] = []
  for column in columnsName:
    if pd.isna(row[column]):
      continue
    df.at[index, 'Preprocessed'].append(persian_preprocess(row[column]))
  df.at[index, 'Preprocessed'] = [item for sublist in df.at[index, 'Preprocessed'] for item in sublist]

1000it [1:32:14,  5.53s/it]


In [15]:
#Check the result of preprocess
#create an "id" column which is equal to index
df.insert(0, 'id', range(len(df)))
df.iloc[persian_start_doc:persian_end_doc]

Unnamed: 0,id,title,date,content,category,author,comments,Preprocessed
0,0,بهترین درسهای زندگی‎,۲ دی ۱۳۹۸,تی دی جیکس یک کشیش، نویسنده، سخنران آمریکایی ...,روانشناسی,سارا رزولت,بدون دیدگاه,"[درس, زندگ, ۱۳۹۸, جیکس, نویسنده, سخنر, آمریکا,..."
1,1,از سلاجقه تا صفویه,۱ دی ۱۳۹۸,کتاب حاضر که اینک در دسترس خوانندگان محترم قر...,تاریخ ایران,نصرت الله مشکوتی,بدون دیدگاه,"[سلاجقه, صفویه, ۱۳۹۸, کتاب, حاضر, اینک, دسترس,..."
2,2,سرگذشت رضاشاه,۲۹ آذر ۱۳۹۸,دکتر فتح الله بینا تحصیلات خود را در رشته پزش...,تاریخ ایران,فتح الله بینا,بدون دیدگاه,"[سرگذ, رضاشاه, آذر, ۱۳۹۸, فتح, الله, بینا, تحص..."
3,3,سیری نوین در فرگشت و اسرار مغز,۲۸ آذر ۱۳۹۸,مجموعه پیش رو مطالبیست که در گروه Evolution ت...,علم و دانش,مانی منوچهری,بدون دیدگاه,"[سیر, نوین, فرگ, اسرار, مغز, آذر, ۱۳۹۸, مجموعه..."
4,4,آسمان پرستاره,۲۶ آذر ۱۳۹۸,کتاب آسمان پرستاره نوشته‌ی کتی هایدن، به زبان...,کودکان و نوجوانان,کتی هایدن,بدون دیدگاه,"[پرستاره, آذر, ۱۳۹۸, کتاب, پرستاره, نوشته, های..."
...,...,...,...,...,...,...,...,...
995,995,اثرات مشروبات الکلی، چای و قهوه از دیدگاه دان...,۱ آذر ۱۳۹۴,در این کتاب، تاثیرات مشروبات الکلی، چای و قهو...,پزشکی و سلامت,علی اصغر اکبری,۴ دیدگاه,"[اثر, مشروب, الکل, قهوه, دیدگاه, دان, پزشک, آذ..."
996,996,ایران زمین,۳۰ آبان ۱۳۹۴,این کتاب درباره تاریخ ایران و چگونگی پیدایش ت...,تاریخ ایران,جمشید نغماچی کازرونی,۱ دیدگاه,"[ایر, زمین, ۱۳۹۴, این, کتاب, درباره, تاریخ, ای..."
997,997,معلول,۳۰ آبان ۱۳۹۴,جامعه ما شناخت زیادی درباره معلولان ندارند و گ...,جامعه شناسی,وحید نجفی,۱ دیدگاه,"[معلول, ۱۳۹۴, جامعه, شناخ, زیاد, درباره, معلول..."
998,998,شایع ترین اختلالات جنسی در مردان,۲۸ آبان ۱۳۹۴,در این کتاب اختلالات شایع دستگاه تناسلی مردان ...,پزشکی و سلامت,رضا پوردستگردان,۵ دیدگاه,"[شایع, اختلال, جنس, مرد, ۱۳۹۴, این, کتاب, اختل..."


### persian dataset posting-list

In [16]:
#Create Posting List
persianPostingList = {"words": {}, "total": 0}
for index, row in tqdm (df.iloc[persian_start_doc:persian_end_doc].iterrows()):
      persianPostingList = create_posting_list(index, row["Preprocessed"], persianPostingList)

1000it [00:00, 2778.90it/s]


In [17]:
#Sorting Posting List by count

sorted_words = sorted(persianPostingList['words'].items(), key=lambda x: x[1]['count'], reverse=True)
sorted_persianPostingList = {'words': dict(sorted_words), 'total': persianPostingList['total']}
print(sorted_persianPostingList)

{'words': {'این': {'docs_id': {0: 1, 1: 1, 3: 1, 4: 1, 5: 1, 6: 1, 8: 1, 9: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 29: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 51: 1, 52: 1, 53: 1, 54: 1, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 74: 1, 77: 1, 78: 1, 79: 1, 81: 1, 82: 1, 83: 1, 84: 1, 87: 1, 88: 1, 89: 1, 91: 1, 92: 1, 93: 1, 94: 1, 95: 1, 97: 1, 98: 1, 99: 1, 101: 1, 102: 1, 103: 1, 104: 1, 105: 1, 107: 1, 108: 1, 109: 1, 110: 1, 111: 1, 112: 1, 113: 1, 114: 1, 116: 1, 117: 1, 118: 1, 119: 1, 121: 1, 122: 1, 123: 1, 124: 1, 125: 1, 126: 1, 127: 1, 128: 1, 129: 1, 130: 1, 132: 1, 133: 1, 134: 1, 136: 1, 137: 1, 138: 1, 140: 1, 141: 1, 142: 1, 143: 1, 145: 1, 146: 1, 148: 1, 150: 1, 151: 1, 152: 1, 154: 1, 155: 1, 157: 1, 159: 1, 160: 1, 161: 1, 162: 1, 163

In [18]:
print("\nPosting List:")
print(sorted_persianPostingList)


Posting List:
{'words': {'این': {'docs_id': {0: 1, 1: 1, 3: 1, 4: 1, 5: 1, 6: 1, 8: 1, 9: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 29: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 51: 1, 52: 1, 53: 1, 54: 1, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 74: 1, 77: 1, 78: 1, 79: 1, 81: 1, 82: 1, 83: 1, 84: 1, 87: 1, 88: 1, 89: 1, 91: 1, 92: 1, 93: 1, 94: 1, 95: 1, 97: 1, 98: 1, 99: 1, 101: 1, 102: 1, 103: 1, 104: 1, 105: 1, 107: 1, 108: 1, 109: 1, 110: 1, 111: 1, 112: 1, 113: 1, 114: 1, 116: 1, 117: 1, 118: 1, 119: 1, 121: 1, 122: 1, 123: 1, 124: 1, 125: 1, 126: 1, 127: 1, 128: 1, 129: 1, 130: 1, 132: 1, 133: 1, 134: 1, 136: 1, 137: 1, 138: 1, 140: 1, 141: 1, 142: 1, 143: 1, 145: 1, 146: 1, 148: 1, 150: 1, 151: 1, 152: 1, 154: 1, 155: 1, 157: 1, 159: 1, 160: 1, 161:

In [19]:
for token, info in sorted_persianPostingList['words'].items():
  print(f"{token} -> freq: {info['count']}, posting list: {info['docs_id']}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
بودیمعاخه -> freq: 1, posting list: {276: 1}
قراردادن -> freq: 1, posting list: {277: 1}
لال -> freq: 1, posting list: {278: 1}
اشکالات -> freq: 1, posting list: {278: 1}
بلع -> freq: 1, posting list: {278: 1}
ناشنوا -> freq: 1, posting list: {278: 1}
فلج -> freq: 1, posting list: {278: 1}
هشتجین -> freq: 1, posting list: {278: 1}
قرارم -> freq: 1, posting list: {279: 1}
خوانیمتحول -> freq: 1, posting list: {279: 1}
سیرجان -> freq: 1, posting list: {280: 1}
قاران -> freq: 1, posting list: {280: 1}
۱۲۳۰ -> freq: 1, posting list: {280: 1}
سیرج -> freq: 1, posting list: {280: 1}
معاریف -> freq: 1, posting list: {280: 1}
مطایبه -> freq: 1, posting list: {280: 1}
فرام -> freq: 1, posting list: {280: 1}
خواندمقصود -> freq: 1, posting list: {280: 1}
طبعانه -> freq: 1, posting list: {280: 1}
گردنه -> freq: 1, posting list: {280: 1}
صحراگرد -> freq: 1, posting list: {280: 1}
لفافه -> freq: 1, posting list: {280: 1}
غارتگر -> freq:

# English Dataset

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
file_path = '/content/drive/MyDrive/Colab Notebooks/Novels/'
file_list = glob.glob(file_path + "*")
titles = []
texts = []
def read_document(file_name):
  with open(file_name) as file:
    titles.append(os.path.basename(file_name))
    texts.append(file.read())

for i in range(len(file_list)):
  read_document(file_list[i])

data = {'id': range(len(titles)), 'title': titles, 'text': texts}
df2 = pd.DataFrame(data)


In [22]:
#create the df2 data frame with "title" as the title of txt file and "text" as the content of the file
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      28 non-null     int64 
 1   title   28 non-null     object
 2   text    28 non-null     object
dtypes: int64(1), object(2)
memory usage: 800.0+ bytes


In [23]:
df2.head()

Unnamed: 0,id,title,text
0,0,HowardsEnd.txt,HOWARDS END\n\nBy E. M. Forster\n\n\n\n\nCHAPT...
1,1,TheSignOfTheFour.txt,Chapter I\nThe Science of Deduction\n\nSherloc...
2,2,ARoomWithAView.txt,"PART ONE\n\n\n\nChapter I: The Bertolini\n\n""T..."
3,3,TheMysteriousAffairAtStyles.txt,The Mysterious Affair at Styles\n\nby Agatha C...
4,4,TheManWhoWasThursday.txt,THE MAN WHO WAS THURSDAY\nA NIGHTMARE\n\n\nTo ...


In [25]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

### English dataset pre-process

In [26]:
def convertToLowerCase(row):
    return row.lower()

In [28]:
def deletePunctuation(row):
    return re.sub(r'[^\w\s]', '', row)

In [29]:
def deleteWhiteSpace(row):
    return row.strip()

In [30]:
def english_normalizer(text):
  to_lower_case_text = convertToLowerCase(text)
  deleted_punct_words = deletePunctuation(to_lower_case_text)
  deleted_white_space = deleteWhiteSpace(deleted_punct_words)
  return deleted_white_space

In [31]:
def english_tokenizer(normalized_text):
  tokens = word_tokenize(normalized_text)
  return tokens


In [32]:
def english_stemmer(tokens):
  stems = [PorterStemmer().stem(token) for token in tokens]
  stems = list(filter(lambda t: len(t) >= 3, stems))

  return stems

In [33]:
def english_preprocess(text):
  normalized_text = english_normalizer(text)
  tokens = english_tokenizer(normalized_text)
  stems = english_stemmer(tokens)
  return stems

In [34]:
english_start_doc=0
#the process must be done for the all documents
english_end_doc=len(df2)

In [35]:
#Preprocess
df2['Preprocessed'] = list
for index, row in tqdm(df2.iloc[english_start_doc:english_end_doc].iterrows()):
  df2.at[index, 'Preprocessed'] = []
  df2.at[index, 'Preprocessed'] = english_preprocess(row["text"])

28it [01:06,  2.36s/it]


In [36]:
#create an "id" column which is equal to index
df2.iloc[english_start_doc:english_end_doc]

Unnamed: 0,id,title,text,Preprocessed
0,0,HowardsEnd.txt,HOWARDS END\n\nBy E. M. Forster\n\n\n\n\nCHAPT...,"[howard, end, forster, chapter, one, may, well..."
1,1,TheSignOfTheFour.txt,Chapter I\nThe Science of Deduction\n\nSherloc...,"[chapter, the, scienc, deduct, sherlock, holm,..."
2,2,ARoomWithAView.txt,"PART ONE\n\n\n\nChapter I: The Bertolini\n\n""T...","[part, one, chapter, the, bertolini, the, sign..."
3,3,TheMysteriousAffairAtStyles.txt,The Mysterious Affair at Styles\n\nby Agatha C...,"[the, mysteri, affair, style, agatha, christi,..."
4,4,TheManWhoWasThursday.txt,THE MAN WHO WAS THURSDAY\nA NIGHTMARE\n\n\nTo ...,"[the, man, who, thursday, nightmar, edmund, cl..."
5,5,AnneOfGreenGables.txt,ANNE OF GREEN GABLES\n\n\n\n\nCHAPTER I. Mrs. ...,"[ann, green, gabl, chapter, rachel, lynd, surp..."
6,6,ThePictureOfDorianGray.txt,The Picture of Dorian Gray\n\nby Oscar Wilde\n...,"[the, pictur, dorian, gray, oscar, wild, the, ..."
7,7,APassageToIndia.txt,The Project Gutenberg EBook of A Passage to In...,"[the, project, gutenberg, ebook, passag, india..."
8,8,WinnieThePooh.txt,WINNIE-THE-POOH\n\n ...,"[winniethepooh, _bi, milne_, chapter, which, a..."
9,9,TheSunAlsoRises.txt,THE SUN ALSO RISES\n\nby Ernest Hemingway\n\n\...,"[the, sun, also, rise, ernest, hemingway, _thi..."


### English dataset posting-list

In [37]:
#Create Posting List

englishPostingList = {"words": {}, "total": 0}
for index, row in tqdm (df2.iloc[english_start_doc:english_end_doc].iterrows()):
      englishPostingList = create_posting_list(index, row["Preprocessed"], englishPostingList)

28it [00:01, 14.40it/s]


In [38]:
#Sorting Posting List by count
en_sorted_words = sorted(englishPostingList['words'].items(), key=lambda x: x[1]['count'], reverse=True)
sorted_englishPostingList = {'words': dict(en_sorted_words), 'total': englishPostingList['total']}
print(sorted_englishPostingList)

{'words': {'the': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1}, 'count': 107472}, 'and': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1}, 'count': 71092}, 'her': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1}, 'count': 30315}, 'that': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1}, 'count': 29273}, 'she': {'docs_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 

In [39]:
for token, info in sorted_englishPostingList['words'].items():
  print(f"{token} -> freq: {info['count']}, posting list: {info['docs_id']}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
eyesemma -> freq: 1, posting list: {20: 1}
riddlebook -> freq: 1, posting list: {20: 1}
daysharriet -> freq: 1, posting list: {20: 1}
househow -> freq: 1, posting list: {20: 1}
beautifulther -> freq: 1, posting list: {20: 1}
inno -> freq: 1, posting list: {20: 1}
housekeeperno -> freq: 1, posting list: {20: 1}
charmingon -> freq: 1, posting list: {20: 1}
sillyso -> freq: 1, posting list: {20: 1}
satisfiedso -> freq: 1, posting list: {20: 1}
smilingso -> freq: 1, posting list: {20: 1}
prosingso -> freq: 1, posting list: {20: 1}
unfastidiousand -> freq: 1, posting list: {20: 1}
niecesi -> freq: 1, posting list: {20: 1}
appeari -> freq: 1, posting list: {20: 1}
illequip -> freq: 1, posting list: {20: 1}
ribband -> freq: 1, posting list: {20: 1}
expectedsh -> freq: 1, posting list: {20: 1}
expectthat -> freq: 1, posting list: {20: 1}
illbestow -> freq: 1, posting list: {20: 1}
taylorit -> freq: 1, posting list: {20: 1}
toowha

# Jaccard

In [40]:
def jaccard(list1, list2):
  intersection = len(list(set(list1).intersection(list2)))
  union = (len(list1) + len(list2)) - intersection
  return float(intersection) / union

In [41]:
def query_with_jaccard(query, language, docs):
  #convert query to stems(normalize the query) based on the language of the query and save it in "query_stems"
  if language == 'persian':
    query_stems = persian_preprocess(query)
  elif language == 'english':
    query_stems = english_preprocess(query)

  jaccard_results = []
  title = []
  for doc in range(len(docs)):
    jaccard_results.append((doc, jaccard(query_stems, docs.loc[doc, 'Preprocessed'])))
    title.append(docs.loc[doc, 'title'])

  return sorted(jaccard_results, key=lambda x: -x[1]), title

In [42]:
#test a query_with_jaccard and print the top 10
query_with_jaccard("ایران در عهد باستان",'persian',df[persian_start_doc:persian_end_doc])[0:10]

([(216, 0.06666666666666667),
  (986, 0.038461538461538464),
  (616, 0.03773584905660377),
  (100, 0.03571428571428571),
  (444, 0.030612244897959183),
  (860, 0.027777777777777776),
  (926, 0.025),
  (409, 0.024691358024691357),
  (165, 0.023809523809523808),
  (996, 0.022988505747126436),
  (817, 0.022727272727272728),
  (453, 0.021052631578947368),
  (58, 0.020833333333333332),
  (625, 0.020833333333333332),
  (416, 0.020202020202020204),
  (969, 0.02),
  (742, 0.0196078431372549),
  (987, 0.01904761904761905),
  (465, 0.018867924528301886),
  (935, 0.018867924528301886),
  (868, 0.018518518518518517),
  (667, 0.01818181818181818),
  (655, 0.017699115044247787),
  (4, 0.017543859649122806),
  (727, 0.017543859649122806),
  (840, 0.017543859649122806),
  (332, 0.017241379310344827),
  (350, 0.017241379310344827),
  (548, 0.01694915254237288),
  (420, 0.016666666666666666),
  (110, 0.016129032258064516),
  (539, 0.016129032258064516),
  (159, 0.015873015873015872),
  (538, 0.015625),


In [43]:
# query_with_jaccard("Mr. Henry Dashwood had one son", 'english', df2[english_start_doc:english_end_doc])[0:10]

# Normalized TF

In [44]:
def normalized_tf(word, doc_stems, doc_id,  posting_list):
  if word in doc_stems:
    for id, freq in posting_list['words'][word]['docs_id'].items():
      if id == doc_id:
        return (1 + math.log(freq))/len(doc_stems)

  return 0

In [45]:
def query_with_tf(query, docs, language ,posting_list):
  #convert query to stems(normalize the query) based on the language of the query and save it in "query_stems"

  if language == 'persian':
    query_stems = persian_preprocess(query)
  elif language == 'english':
    query_stems = english_preprocess(query)

  results = []

  for doc in range(len(docs)):
    weight = 0
    for token in query_stems:
      weight +=  normalized_tf(token, docs.loc[doc, 'Preprocessed'], doc, posting_list)

    results.append((doc, weight))



  return sorted(results, key=lambda x: -x[1])

In [46]:
#test a query_with_tf and print the top 10
query_with_tf("ایران در عهد باستان", df[persian_start_doc:persian_end_doc],"persian",persianPostingList)[0:10]

[(216, 0.07692307692307693),
 (986, 0.0392156862745098),
 (616, 0.038461538461538464),
 (100, 0.03636363636363636),
 (444, 0.030612244897959183),
 (860, 0.028169014084507043),
 (926, 0.02631578947368421),
 (165, 0.025),
 (409, 0.025),
 (817, 0.023809523809523808)]

In [47]:
# query_with_tf("Mr. Henry Dashwood had one son", df2[english_start_doc:english_end_doc], 'english', englishPostingList)[0:10]

# TF-IDF

In [48]:
def cosaine_normilizer(query_tf_idf, sum_of_squares):
  normilized_query_tf_idf={}
  cosaine_norm = 1 / math.sqrt(sum_of_squares)
  for token in query_tf_idf.keys():
    normilized_query_tf_idf[token] = query_tf_idf[token] * cosaine_norm

  return normilized_query_tf_idf

In [49]:
def vectorize_query(query, language ,posting_list, all_docs):
  #convert query to stems(normalize the query) based on the language of the query and save it in "query_stems"
  if language == 'persian':
    query_stems = persian_preprocess(query)
  elif language == 'english':
    query_stems = english_preprocess(query)

  query_tf_idf = {}
  sum_of_squares = 0
  for token in query_stems:
    if token in posting_list['words'].keys():
      freq = posting_list['words'][token]['count']
      docs_count = len(posting_list['words'][token]['docs_id'])
      query_tf_idf[token] = (1 + math.log(freq)) * (math.log(all_docs/docs_count))
      sum_of_squares += query_tf_idf[token]**2
    else:
      query_tf_idf[token] = 0
  return cosaine_normilizer(query_tf_idf,sum_of_squares)

In [50]:
def compute_tf_idf_score(query_tf_idf, doc_stems):
  doc_token_tf_idf = {}
  sum_of_squares = 0
  for token in doc_stems:
    if token in query_tf_idf.keys() and token not in doc_token_tf_idf.keys():
      freq = doc_stems.count(token)
      tf = 1 + math.log(freq)
      sum_of_squares += tf ** 2
      doc_token_tf_idf[token] = tf

  cosine_score = 0 if sum_of_squares == 0 else 1 / math.sqrt(sum_of_squares)

  for token in doc_token_tf_idf.keys():
    doc_token_tf_idf[token] = doc_token_tf_idf[token] * cosine_score

  score = 0

  for token in doc_token_tf_idf.keys():
    score += doc_token_tf_idf[token] * query_tf_idf[token]

  return score

In [51]:
def query_with_tf_idf(query, language, docs, posting_list, n_docs):
  query_vector = vectorize_query(query, language, posting_list, n_docs)
  scores = []
  title = []
  for doc in range(len(docs)):
    scores.append((doc, compute_tf_idf_score(query_vector, docs.loc[doc, 'Preprocessed'])))
    title.append(docs.loc[doc, 'title'])

  return sorted(scores, key=lambda x: -x[1]), title


In [52]:
#test a query_with_tf and print the top 10
query_with_tf_idf("ایران در عهد باستان","persian",df[persian_start_doc:persian_end_doc], persianPostingList, persian_end_doc)[0:10]

([(963, 0.9225734616766298),
  (444, 0.8727161140197124),
  (986, 0.7604239945635034),
  (172, 0.7522494983457706),
  (841, 0.7522269091130837),
  (416, 0.7439310037703228),
  (390, 0.7361611001788431),
  (625, 0.7361611001788431),
  (71, 0.736161100178843),
  (357, 0.736161100178843),
  (616, 0.736161100178843),
  (868, 0.7320148103386274),
  (16, 0.7320148103386273),
  (475, 0.7224530649582797),
  (987, 0.7224530649582797),
  (665, 0.7122501255227068),
  (316, 0.7117264185017631),
  (455, 0.7117264185017631),
  (651, 0.7117264185017631),
  (655, 0.7117264185017631),
  (621, 0.7021342059388856),
  (996, 0.6725806838969146),
  (409, 0.6654299826733868),
  (860, 0.6654299826733868),
  (727, 0.6610227778664051),
  (4, 0.6552898205570739),
  (31, 0.6552898205570739),
  (300, 0.6552898205570739),
  (317, 0.6552898205570739),
  (348, 0.6552898205570739),
  (522, 0.6552898205570739),
  (568, 0.6552898205570739),
  (598, 0.6552898205570739),
  (646, 0.6552898205570739),
  (710, 0.655289820557

In [53]:
# query_with_tf_idf("Mr. Henry Dashwood had one son","english",df2[english_start_doc:english_end_doc], englishPostingList, english_end_doc)[0:10]

# Queries

In [54]:
#enter persian queries
persian_queries = {
'Q1': "ایران در عهد باستان",
'Q2': "موضوع این کتاب",
'Q3': "دریا گسترهای بس زیبا، فریبنده و شگفت انگیز است",
'Q4': "روانشناسی کودک",
'Q5': "فیزیکدان معاصر استیون هاوکینگ",
'Q6': "مهارتهای مطالعه برای دانش آموزان و دانشجویان",
'Q7': "اشکانیان از سویی دیرپاترین دودمان فرمانروای ایران و طولانی ترین دوران تاریخ ما"
}

In [55]:
#enter english queries
english_queries = {
'Q1': "Mr. Henry Dashwood had one son",
'Q2': "no money for gambling",
'Q3': "All through the day Miss Abbott had seemed to Philip like a goddess",
'Q4': "I d like a shillin",
'Q5': "Are bears any good at discovering it?",
'Q6': "On a January evening of the early seventies"
}

In [56]:
#fix the outputs of functions shape like the mentioned in pdf of the project

## Persian Queries

In [57]:
print('-' * 80)
print('Jaccard(Top 10):')
print('-' * 80)

for q_id, q_items in persian_queries.items():
  jaccard_score, jaccard_title = query_with_jaccard(q_items,'persian',df[persian_start_doc:persian_end_doc])[0:10]
  print(f"{q_id} : {q_items}")
  for i in range(10):
    print(f"Number{i+1}: {jaccard_title[i]}, Score: {jaccard_score[i]}")
  print('-' * 50)


--------------------------------------------------------------------------------
Jaccard(Top 10):
--------------------------------------------------------------------------------
Q1 : ایران در عهد باستان
Number1:  بهترین درسهای زندگی‎, Score: (216, 0.06666666666666667)
Number2:  از سلاجقه تا صفویه, Score: (986, 0.038461538461538464)
Number3:  سرگذشت رضاشاه, Score: (616, 0.03773584905660377)
Number4:  سیری نوین در فرگشت و اسرار مغز, Score: (100, 0.03571428571428571)
Number5:  آسمان پرستاره, Score: (444, 0.030612244897959183)
Number6:  تاریخ مبارک غازانی, Score: (860, 0.027777777777777776)
Number7:  وبلاگ نویسی شیرین با وردپرس, Score: (926, 0.025)
Number8:  نقش روابط جنسی و راز کامیابی در زندگی, Score: (409, 0.024691358024691357)
Number9: معرفی کتاب الکترونیکی, Score: (165, 0.023809523809523808)
Number10:  فرهنگ ریشه واژگان فارسی, Score: (996, 0.022988505747126436)
--------------------------------------------------
Q2 : موضوع این کتاب
Number1:  بهترین درسهای زندگی‎, Score: (28, 0.0833333

In [58]:
print('-' * 80)
print('TF_IDF(Top 10):')
print('-' * 80)

for q_id, q_items in persian_queries.items():
  tf_idf_score, tf_idf_title = query_with_tf_idf(q_items, 'persian', df[persian_start_doc:persian_end_doc], persianPostingList, persian_end_doc)[0:10]
  print(f"{q_id} : {q_items}")
  for i in range(10):
    print(f"Number{i+1}: {tf_idf_title[i]}, Score: {tf_idf_score[i]}")
  print('-' * 50)

--------------------------------------------------------------------------------
TF_IDF(Top 10):
--------------------------------------------------------------------------------
Q1 : ایران در عهد باستان
Number1:  بهترین درسهای زندگی‎, Score: (963, 0.9225734616766298)
Number2:  از سلاجقه تا صفویه, Score: (444, 0.8727161140197124)
Number3:  سرگذشت رضاشاه, Score: (986, 0.7604239945635034)
Number4:  سیری نوین در فرگشت و اسرار مغز, Score: (172, 0.7522494983457706)
Number5:  آسمان پرستاره, Score: (841, 0.7522269091130837)
Number6:  تاریخ مبارک غازانی, Score: (416, 0.7439310037703228)
Number7:  وبلاگ نویسی شیرین با وردپرس, Score: (390, 0.7361611001788431)
Number8:  نقش روابط جنسی و راز کامیابی در زندگی, Score: (625, 0.7361611001788431)
Number9: معرفی کتاب الکترونیکی, Score: (71, 0.736161100178843)
Number10:  فرهنگ ریشه واژگان فارسی, Score: (357, 0.736161100178843)
--------------------------------------------------
Q2 : موضوع این کتاب
Number1:  بهترین درسهای زندگی‎, Score: (701, 0.885073737773

## English Queries


In [59]:
print('-' * 80)
print('Jaccard(Top 10):')
print('-' * 80)

for q_id, q_items in english_queries.items():
  jaccard_score, jaccard_title = query_with_jaccard(q_items,'english',df2[english_start_doc:english_end_doc])[0:10]
  print(f"{q_id} : {q_items}")
  for i in range(10):
    print(f"Number{i+1}: {jaccard_title[i]}, Score: {jaccard_score[i]}")
  print('-' * 50)

--------------------------------------------------------------------------------
Jaccard(Top 10):
--------------------------------------------------------------------------------
Q1 : Mr. Henry Dashwood had one son
Number1: HowardsEnd.txt, Score: (13, 0.00018772292096865028)
Number2: TheSignOfTheFour.txt, Score: (8, 0.00018541409147095178)
Number3: ARoomWithAView.txt, Score: (19, 0.0001462629808395495)
Number4: TheMysteriousAffairAtStyles.txt, Score: (10, 9.973404255319148e-05)
Number5: TheManWhoWasThursday.txt, Score: (1, 9.576098059244126e-05)
Number6: AnneOfGreenGables.txt, Score: (26, 8.195153932308028e-05)
Number7: ThePictureOfDorianGray.txt, Score: (27, 7.991688643810437e-05)
Number8: APassageToIndia.txt, Score: (9, 7.839909057054938e-05)
Number9: WinnieThePooh.txt, Score: (18, 7.694872649857645e-05)
Number10: TheSunAlsoRises.txt, Score: (3, 7.24427702115329e-05)
--------------------------------------------------
Q2 : no money for gambling
Number1: HowardsEnd.txt, Score: (10, 9.9

In [60]:
print('-' * 80)
print('TF_IDF(Top 10):')
print('-' * 80)

for q_id, q_items in english_queries.items():
  tf_idf_score, tf_idf_title = query_with_tf_idf(q_items, 'english', df2[english_start_doc:english_end_doc], englishPostingList, english_end_doc)[0:10]
  print(f"{q_id} : {q_items}")
  for i in range(10):
    print(f"Number{i+1}: {tf_idf_title[i]}, Score: {tf_idf_score[i]}")
  print('-' * 50)

--------------------------------------------------------------------------------
TF_IDF(Top 10):
--------------------------------------------------------------------------------
Q1 : Mr. Henry Dashwood had one son
Number1: HowardsEnd.txt, Score: (21, 0.5274126846900873)
Number2: TheSignOfTheFour.txt, Score: (6, 0.15700600690024621)
Number3: ARoomWithAView.txt, Score: (11, 0.15047943541351644)
Number4: TheMysteriousAffairAtStyles.txt, Score: (0, 0.14783079672094065)
Number5: TheManWhoWasThursday.txt, Score: (16, 0.13867069267745114)
Number6: AnneOfGreenGables.txt, Score: (15, 0.13004119053127766)
Number7: ThePictureOfDorianGray.txt, Score: (20, 0.10398415985821147)
Number8: APassageToIndia.txt, Score: (14, 0.10215085272864848)
Number9: WinnieThePooh.txt, Score: (9, 0.08527563883818963)
Number10: TheSunAlsoRises.txt, Score: (8, 0.07823090550532487)
--------------------------------------------------
Q2 : no money for gambling
Number1: HowardsEnd.txt, Score: (23, 0.466932713113514)
Number2