<h1 align=center style="line-height:200%;font-family:vazir;color:#0099cc">
<font face="vazirmatn" color="#0099cc">
Classification nlp naivebayes</font>
</h1>

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from hazm import stopwords_list, Normalizer, word_tokenize, Lemmatizer
import re

<center>
<div dir=rtl style="direction: rtl;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazirmatn" size=3>
    
|ستون|توضیحات|
|:------:|:---:|
|<code>comment</code>|متن کامنت|
|<code>price_value</code>|آیا درباره‌ی قیمت در آن صحبت شده (<code>1</code>) یا خیر (<code>0</code>)||
    
</font>
</div>
</center>

In [2]:
train_data = pd.read_csv('../data/train.csv')
train_data.head(1)

Unnamed: 0,comment,price_value
0,قیمت مناسب وکیفیت خوب پیشنهادمیکنم حتما خرید کنید,1


In [3]:
test_data = pd.read_csv('../data/test.csv')
test_data.head(1)

Unnamed: 0,comment
0,با این قیمت بیرون جنس خوبی پیدا نمیشه، واقعا ر...


In [4]:
# !pip install hazm

In [5]:
# stopwords_list()

In [6]:
normalizer = Normalizer()
lemmatizer = Lemmatizer()
stopwords = set(stopwords_list())
pattern_html = re.compile(r"<.*?>")
pattern_num = re.compile(r"[0-9۰-۹]+")
pattern_punc = re.compile(r"[^\w\s]")

def preprocessing(text):
    text = pattern_html.sub("", text)
    text = pattern_num.sub("", text)
    text = pattern_punc.sub("", text)

    text = normalizer.normalize(text)
    tokens = word_tokenize(text)

    filtered = []
    for w in tokens:
        if w and w not in stopwords:
            w2 = lemmatizer.lemmatize(w)
            if "#" in w2:
                w2 = w2.split("#")[0]
            filtered.append(w2)
    return filtered



x = preprocessing('از همه نظر عالی بود و باز هم حتما ازشون خرید می کنم 40 شدم')
x

['حتما', 'ازشون', 'خرید', 'شد']

In [7]:
train_data["tokens"] = train_data["comment"].apply(preprocessing)

neg_tokens = train_data[train_data["price_value"]==0]["tokens"]
pos_tokens = train_data[train_data["price_value"]==1]["tokens"]

In [8]:
n0 = (train_data["price_value"] == 0).sum()
n1 = (train_data["price_value"] == 1).sum()
n_total = len(train_data)

prior_probability = {
    0: n0 / n_total,
    1: n1 / n_total,
}

In [9]:
def token_counter(texts):
    count_dict = {}

    for text in texts:
        filtered = preprocessing(str(text))
        for word in filtered:
            count_dict[word] = count_dict.get(word, 0) +1

    return count_dict

x = token_counter([['سلام سلام من پریا هستم و خیلی خوشحالم که اینجام'],['پریا پریا پریا']])
x

{'سلام': 2, 'پریا': 4, '': 1, 'خوشحال': 1, 'اینجا': 1}

In [10]:
negative_class_count = token_counter(neg_tokens)

In [11]:
positive_class_count = token_counter(pos_tokens)

In [12]:
vocab = set(negative_class_count.keys()) | set(positive_class_count.keys())

vocab_size = len(vocab)

classes_total_count = {
    0: sum(negative_class_count.values()),
    1: sum(positive_class_count.values()),
}

In [13]:
def compute_probability(text, cls):
    class_dict = {0: negative_class_count, 1: positive_class_count}
    token_counts = class_dict[cls]

    tokens = preprocessing(str(text))

    probab = prior_probability[cls]

    total_tokens = classes_total_count[cls]

    for w in tokens:
        count_w = token_counts.get(w, 0) 
        pw_given_class = (count_w + 1) / (total_tokens + vocab_size)
        probab *= pw_given_class

    return probab


In [14]:
def predict(test):
    predictions = []

    for text in test:
        posetive = compute_probability(text, 1)
        negetive = compute_probability(text, 0)
        
        if posetive > negetive:
            predictions.append(1)
        else:
            predictions.append(0)
            
    return np.array(predictions)

In [15]:
train_predictions = predict(train_data['comment'])
accuracy_score(train_predictions, train_data['price_value']) 

0.888825

In [16]:
pred_test =  predict(test_data['comment'])
submission = pd.DataFrame(pred_test, columns=['price_value'])
submission

Unnamed: 0,price_value
0,1
1,1
2,1
3,0
4,1
...,...
7995,0
7996,1
7997,1
7998,0
