# Preprocessing

Output:
    `books_train_csv` and `books_test_csv`, each a list of records of the form
    
        [
            {
                "title": title of the book (normalized),
                "description": description of the book (normalized, then tokenized; tokens that contain no Persian letters are removed),
                "categories": category of the book (normalized)
            },
            .
            .
            .,
            {...}
        ]

In [None]:
import csv
from typing import List

from hazm import Normalizer, sent_tokenize, word_tokenize

### importing csv

In [None]:
# Read the training split into a list of row dicts (one dict per CSV row).
# encoding="utf-8": do not depend on the platform default codec for non-ASCII text
#   (assumes the file is stored as UTF-8 -- confirm against the data source).
# newline="": recommended by the csv module so quoted fields that contain
#   line breaks are parsed correctly.
with open("books_train.csv", encoding="utf-8", newline="") as file:
    books_train_csv = list(csv.DictReader(file))

In [None]:
# Read the test split into a list of row dicts (one dict per CSV row).
# encoding="utf-8": do not depend on the platform default codec for non-ASCII text
#   (assumes the file is stored as UTF-8 -- confirm against the data source).
# newline="": recommended by the csv module so quoted fields that contain
#   line breaks are parsed correctly.
with open("books_test.csv", encoding="utf-8", newline="") as file:
    books_test_csv = list(csv.DictReader(file))

In [None]:
# Only the last expression of a cell is rendered, so both previews are paired
# in one tuple; slicing limits the output to the first few records.
books_train_csv[:3], books_test_csv[:3]

### Normalizing

In [None]:
# Run every field of every record through the hazm normalizer, then replace
# each zero-width non-joiner (U+200C) with a regular space.
normalizer = Normalizer()
books_train_csv = [
    {field: normalizer.normalize(value).replace("\u200c", " ") for field, value in book.items()}
    for book in books_train_csv
]
books_test_csv = [
    {field: normalizer.normalize(value).replace("\u200c", " ") for field, value in book.items()}
    for book in books_test_csv
]

In [None]:
# Only the last expression of a cell is rendered, so both previews are paired
# in one tuple; slicing limits the output to the first few records.
books_train_csv[:3], books_test_csv[:3]

### tokenize

In [None]:
def has_farsi_letters(text : str) -> bool:
    """Tell whether ``text`` contains a letter from the U+0600-U+06FF range.

    A character counts only if ``str.isalpha()`` is true for it *and* its code
    point lies between 0x0600 and 0x06FF inclusive, so digits and punctuation
    from that range do not qualify.

    Args:
        text: The string to inspect.

    Returns:
        True if at least one qualifying character is found, otherwise False
        (including for the empty string).
    """
    return any(ch.isalpha() and 0x0600 <= ord(ch) <= 0x06FF for ch in text)

In [None]:
def is_valid_word(word : str) -> bool:
    """Decide whether a token should be kept.

    A word is valid exactly when ``has_farsi_letters(word)`` is true.

    Args:
        word: A single token.

    Returns:
        True if the token is kept, False if it is discarded.
    """
    return has_farsi_letters(word)

In [None]:
def tokenize_str(txt : str) -> List[str]:
    """Tokenize ``txt`` and keep only the valid words.

    The text is split into sentences with ``sent_tokenize``, each sentence is
    split into words with ``word_tokenize``, and every word that fails
    ``is_valid_word`` is dropped.

    Args:
        txt: The text to tokenize.

    Returns:
        The surviving words as a flat list, in their order of appearance.
    """
    return [
        word
        for sent in sent_tokenize(txt)
        for word in word_tokenize(sent)
        if is_valid_word(word)
    ]

In [None]:
# Replace each book's "description" string with its list of valid tokens, in place.
# The isinstance guard keeps the cell safe to re-run: a description that is
# already a token list is left untouched instead of being tokenized again.
for books in (books_train_csv, books_test_csv):
    for book in books:
        for field in ("description",):
            if isinstance(book[field], str):
                book[field] = tokenize_str(book[field])

In [None]:
# Only the last expression of a cell is rendered, so both previews are paired
# in one tuple; slicing limits the output to the first few records.
books_train_csv[:3], books_test_csv[:3]

# processing

### count words per category

In [None]:
# Flatten all training descriptions into one word list. Duplicates are kept,
# and the order follows the books and then the tokens within each book.
all_words = [word for book in books_train_csv for word in book["description"]]

In [210]:
# Word-frequency table: categories_data[category][word] is how often `word`
# occurs in the training descriptions of `category`. The extra "total" category
# aggregates all books, and the "total" key in each row is that row's token count.
# The all-zero row is built once and copied per category, because all_words
# contains duplicates and is expensive to scan repeatedly.
zero_counts = {"total": 0, **dict.fromkeys(all_words, 0)}
categories_data = {"total": dict(zero_counts)}
for book in books_train_csv:
    category = book["categories"]
    if category not in categories_data:
        categories_data[category] = dict(zero_counts)
    category_counts = categories_data[category]
    overall_counts = categories_data["total"]
    for word in book["description"]:
        category_counts[word] += 1
        category_counts["total"] += 1
        overall_counts[word] += 1
        overall_counts["total"] += 1

In [211]:
# Inspect the per-category word counts (this renders the whole nested dict).
categories_data

{'total': {'total': 496563,
  'ساختار': 50,
  'نظریه': 172,
  'های': 6139,
  'جامعه': 648,
  'شناسی': 378,
  'ایران': 359,
  'نوشته': 936,
  'ابوالفضل': 8,
  'رمضانی': 7,
  'دربردارنده': 49,
  'درباره': 837,
  'از': 12430,
  'منظر': 76,
  'اندیشمندانی': 2,
  'همچون': 140,
  'جواد': 8,
  'طباطبایی': 14,
  'همایون': 13,
  'کاتوزیان': 1,
  'احمد': 80,
  'اشرف': 12,
  'پرویز': 7,
  'پیران': 3,
  'حسین': 138,
  'بشیریه': 1,
  'و': 26459,
  'است': 7181,
  'در': 14150,
  'بخشی': 549,
  'کتاب': 4068,
  'می': 13917,
  'خوانیم': 477,
  'تحلیل': 188,
  'تاکید': 36,
  'بیشتر': 363,
  'بر': 1489,
  'خلاصه': 66,
  'سازی': 127,
  'دقیق': 76,
  'ها': 3776,
  'با': 5571,
  'شکافتن': 1,
  'اجزا': 4,
  'عناصر': 38,
  'آنها': 725,
  'تلاش': 214,
  'شود': 1554,
  'تا': 1784,
  'جایی': 135,
  'که': 11073,
  'امکان': 89,
  'دارد': 1226,
  'کمترین': 26,
  'چیزی': 321,
  'قلم': 306,
  'نیفتد': 2,
  'هدف': 205,
  'این': 7594,
  'برای': 2931,
  'تجزیه': 29,
  'شناختی': 103,
  'موجود': 111,
  'به': 14071,
  'اصلی

### Create bag of words

In [212]:
import pandas as pd

In [218]:
# The nested dict yields one column per category, so transpose it to get
# one row per category and one column per word.
df = pd.DataFrame(categories_data).T

In [219]:
# Show the table: one row per category, plus the aggregate "total" row.
df

Unnamed: 0,total,ساختار,نظریه,های,جامعه,شناسی,ایران,نوشته,ابوالفضل,رمضانی,...,بزک,جودانه,کتابفروش,کماندوهای,قدرش,بهروان,ستمگر,خواهی_هود,بنوشند,اهلش
total,496563,50,172,6139,648,378,359,936,8,7,...,1,1,1,1,1,1,1,1,1,1
جامعه شناسی,93900,33,130,1517,453,292,190,156,3,1,...,0,0,0,0,0,0,0,0,0,0
کلیات اسلام,65265,2,17,612,72,44,34,110,2,0,...,0,0,0,0,0,0,0,0,1,1
داستان کودک و نوجوانان,40765,0,1,595,4,0,21,155,2,6,...,0,0,0,0,0,1,1,1,0,0
داستان کوتاه,91437,3,1,1072,33,5,51,171,1,0,...,1,1,1,1,1,0,0,0,0,0
مدیریت و کسب و کار,82417,7,20,1192,35,31,23,174,0,0,...,0,0,0,0,0,0,0,0,0,0
رمان,122779,5,3,1151,51,6,40,170,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculate p(c): the fraction of training books in each category

In [None]:
# Count the training books per category; the "total" key counts all books.
sum_book = {"total": 0}
for book in books_train_csv:
    category = book["categories"]
    sum_book[category] = sum_book.get(category, 0) + 1
    sum_book["total"] += 1

In [None]:
# p_c[category] = books in the category / all books.
# The "total" entry is carried along too and equals 1.0.
p_c = {category: count / sum_book["total"] for category, count in sum_book.items()}

### Calculate p(x): the relative frequency of each word over all training descriptions

In [215]:
# p_x[word] = occurrences of the word in all descriptions / total token count.
# The "total" key is carried along too and equals 1.0.
overall_counts = categories_data["total"]
p_x = {word: count / overall_counts["total"] for word, count in overall_counts.items()}

### Calculate p(x|c): the relative frequency of each word within each category

In [231]:
# p_x_c[category][word] = occurrences of the word in the category / tokens in the category.
# Every row of categories_data is converted, including the aggregate "total" row.
p_x_c = {
    category: {word: count / counts["total"] for word, count in counts.items()}
    for category, counts in categories_data.items()
}