In [146]:
import csv
import re

TRAIN_LIMIT = 16712  # 80% of 20890
HAFEZ = "hafez"
SADI = "saadi"

data_set = list()
test_set = list()
sadi_verse_count = 0
hafez_verse_count = 0
words_set = set()
with open("train_test.csv", encoding="utf8") as csvfile:
    csvreader = csv.reader(csvfile, delimiter=",")
    i = 0
    for text, label in csvreader:
        if i <= TRAIN_LIMIT:
            if label == SADI:
                sadi_verse_count += 1
            elif label == HAFEZ:
                hafez_verse_count += 1
            text = re.split(' |\u200c', text)
            data_set.append([text, label])
        else:
            test_set.append([text, label])
        i += 1
#         print(text, label)

print("done")

done


Bayes' theorem
--------------
$$ P(C_k \mid X) = \frac{P(X \mid C_k) \, P(C_k)}{P(X)} $$
$$ C_k = \text{Sadi / Hafez} $$
$$ X = WORDS $$ <br>

we need to calculate following equation to determine probaility of being the new verse in Sadi class or Hafez class.

$$ P(C_k \mid x_1,...,x_n) =  \frac{P(x_1,...,x_n \mid C_k) \, P(C_k)}{P(x_1,...,x_n)}$$

In practice, there is interest only in the numerator of that fraction, because the denominator does not depend on $C$ and the values of the features $x_{i}$ are given, so that the denominator is effectively constant. The numerator is equivalent to the joint probability model <br>
$$ p(C_k, x_1,...,x_n) $$
Now the "naive" conditional independence assumptions come into play: assume that all features in $x$ are mutually independent, conditional on the category $C_k$. Under this assumption, 

$$ p(x_i \mid x_{i+1},...,x_n,C_k) = p(x_i\mid C_k) $$

Thus, the joint model can be expressed as <br>
Naïve Bayes:

$$ P(C_k \mid x_1,...,x_n) \propto p(C_k, x_1,...,x_n) $$<br>
$$ = p(C_k)p(x_1 \mid C_k)p(x_2 \mid C_k)  $$<br>
$$ = p(C_k) \prod_{i=1}^{n} p(x_i \mid C_k) $$

Bayes Classifier<br>
$$ \hat y = \underset{k \in 1,...,K}{argmax} \text{  } p(C_k) \prod_{i=1}^{n} p(x_i \mid C_k) $$

$x_i$ are words seen in input which we make the probability table of it beforehand.

In [147]:
import numpy as np

hafez_words_count = dict()
sadi_words_count = dict()
for data in data_set:
    text = data[0]
    label = data[1]
    if label == SADI:
        for word in text:
            words_set.add(word)
            if word in sadi_words_count:
                sadi_words_count[word] += 1
            else:
                sadi_words_count[word] = 1
    if label == HAFEZ:
        for word in text:
            words_set.add(word)
            if word in hafez_words_count:
                hafez_words_count[word] += 1
            else:
                hafez_words_count[word] = 1

# print(sadi_words_count)
print("done")

done


Let c refer to a class (such as Positive or Negative), and let w refer to a token or word.

The maximum likelihood estimator for $P(w|c)$ is 
$$ \frac{count(w,c)}{count(c)}= \frac{\text{counts w in class c}}{\text{counts of words in class c.}} $$

This estimation of $P(w|c)$ could be problematic since it would give us probability 0 for documents with unknown words.

A common way of solving this problem is to use Laplace smoothing.

Let $V$ be the set of words in the training set, add a new element $UNK$ (for unknown) to the set of words.

Define 

$$ P(w|c)=\frac{count(w,c)+1}{count(c)+|V|+1} $$

where $V$ refers to the vocabulary (the words in the training set).

In particular, any unknown word will have probability
$ \frac{1}{count(c)+|V|+1}$

In [156]:
def calculate_hafez_p(text):
    p = hafez_verse_count/(hafez_verse_count+sadi_verse_count)  #p(C_k)
#     p = np.log(hafez_verse_count/(hafez_verse_count+sadi_verse_count))  #p(C_k)
    for word in text:
        if word in hafez_words_count:
            p *= (hafez_words_count[word]/len(hafez_words_count))
#             p += np.log(hafez_words_count[word]/len(hafez_words_count))
        else:
            p = 0
    return p
            
def calculate_sadi_p(text):
    p = sadi_verse_count/(hafez_verse_count+sadi_verse_count)  #p(C_k)
#     p = np.log(sadi_verse_count/(hafez_verse_count+sadi_verse_count))  #p(C_k)
    for word in text:
        if word in sadi_words_count:
            p *= (sadi_words_count[word]/len(sadi_words_count))
#             p += np.log(sadi_words_count[word]/len(sadi_words_count))
        else:
            p = 0
    return p

In [157]:
correct = 0
for data in test_set:
    text = data[0]
    label = data[1]
    text = re.split(' |\u200c', text)
    if calculate_hafez_p(text) > calculate_sadi_p(text):
        if label == HAFEZ:
            correct += 1
#         print("H", label, text)
    else:
        if label == SADI:
            correct += 1
#         print("S", label, text)
        
print(correct/len(test_set))

0.7198946612401245


## Additive smoothing (Laplace smoothing)

In [158]:
def calculate_hafez_p(text):
    p = hafez_verse_count/(hafez_verse_count+sadi_verse_count)  #p(C_k)
    for word in text:
        if word in hafez_words_count:
            p *= ((hafez_words_count[word]+1)/(len(words_set) + len(hafez_words_count) + 1 ))
        else:
            p = 0
    return p
            
def calculate_sadi_p(text):
    p = sadi_verse_count/(hafez_verse_count+sadi_verse_count)  #p(C_k)
    for word in text:
        if word in sadi_words_count:
            p *= ((sadi_words_count[word]+1)/(len(words_set) + len(sadi_words_count) + 1 ))
        else:
            p = 0
    return p

In [159]:
correct = 0
for data in test_set:
    text = data[0]
    label = data[1]
    text = re.split(' |\u200c', text)
    if calculate_hafez_p(text) > calculate_sadi_p(text):
        if label == HAFEZ:
            correct += 1
#         print("H", label, text)
    else:
        if label == SADI:
            correct += 1
#         print("S", label, text)
        
print(correct/len(test_set))

0.7019391908067991
