In [None]:
!pip install hazm

In [1]:
from hazm import *
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from gensim.models import FastText, Word2Vec
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

## Loading Data

In [2]:
data = pd.read_csv('/kaggle/input/snappfood-persian-sentiment-analysis/Snappfood - Sentiment Analysis.csv' , on_bad_lines='skip' , delimiter='\t')
data.head()

Unnamed: 0.1,Unnamed: 0,comment,label,label_id
0,,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0
1,,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0
2,,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0
3,,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0
4,,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0


## Preprocessing 

In [3]:
data = data[['comment' , 'label' , 'label_id']]
data.dropna(inplace=True)
data.drop_duplicates(keep='first', inplace=True)
data['label_id'] = data['label_id'].astype(int)
data.head()

Unnamed: 0,comment,label,label_id
0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0
2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1
3,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0
4,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0


In [4]:
X = (data['comment'])
y = (data['label_id'])
hazm_normalizer = Normalizer()
X = X.apply(lambda v: hazm_normalizer.normalize(v))

In [5]:
hazm_tokenizer = WordTokenizer(replace_numbers=True, replace_ids=True)
X = X.apply(lambda v: hazm_tokenizer.tokenize(v))
X.head()

0    [واقعا, حیف, وقت, که, بنویسم, سرویس, دهیتون, ش...
1    [قرار, بود, NUM, 1, ساعته, برسه, ولی, نیم, ساع...
2    [قیمت, این, مدل, اصلا, با, کیفیتش, سازگاری, ند...
3    [عالی, بود, همه, چه, درست, و, به, اندازه, و, ک...
4               [شیرینی, وانیلی, فقط, یک, مدل, بود, .]
Name: comment, dtype: object

In [6]:
hazm_stemmer = Stemmer()
X = X.apply(lambda v: [hazm_stemmer.stem(w) for w in v])
X.head()

0    [واقعا, حیف, وق, که, بنویس, سرویس, دهیتون, شده...
1    [قرار, بود, NUM, 1, ساعته, برسه, ول, ن, ساع, ز...
2    [قیم, این, مدل, اصلا, با, کیفیت, سازگار, نداره...
3    [عال, بود, همه, چه, درس, و, به, اندازه, و, کیف...
4                 [شیرین, وانیل, فقط, یک, مدل, بود, .]
Name: comment, dtype: object

In [7]:
hazm_lemmatizer = Lemmatizer()
X = X.apply(lambda v: [hazm_lemmatizer.lemmatize(w) for w in v]) 
X.head()

0    [واقعا, حیف, وق, که, بنویس, سرویس, دهیتون, شده...
1    [قرار, بود#باش, NUM, 1, ساعته, برسه, ول, #هست,...
2    [قیم, این, مدل, اصلا, با, کیفیت, سازگار, نداره...
3    [عال, بود#باش, همه, چه, درس, و, به, اندازه, و,...
4             [شیرین, وانیل, فقط, یک, مدل, بود#باش, .]
Name: comment, dtype: object

In [8]:
stopwords = stopwords_list()
X = X.apply(lambda v: [w for w in v if w not in stopwords]) 
X.head()

0       [واقعا, حیف, وق, بنویس, سرویس, دهیتون, افتضاح]
1    [قرار, بود#باش, NUM, 1, ساعته, برسه, ول, #هست,...
2    [قیم, مدل, اصلا, کیفیت, سازگار, نداره, ،, ظاهر...
3    [عال, بود#باش, درس, اندازه, کیف, ،, امیداور, ک...
4                      [شیرین, وانیل, مدل, بود#باش, .]
Name: comment, dtype: object

In [9]:
X = X.apply(lambda v: ' '.join(v)) 
X = list(X)
X[:5]

['واقعا حیف وق بنویس سرویس دهیتون افتضاح',
 'قرار بود#باش NUM 1 ساعته برسه ول #هست ساع زود موقع ، دید#بین چقدر پلاک خفنهه ، سالهاس مشتریشون سالهاس مزه میده غذاشون',
 'قیم مدل اصلا کیفیت سازگار نداره ، ظاهر فریبنده داره ، میکنن کالباس قارچ',
 'عال بود#باش درس اندازه کیف ، امیداور کیفیتتون باشه مشتر همیشگ بش',
 'شیرین وانیل مدل بود#باش .']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

## Creating different embeddings

#### Using CountVectorizer

In [11]:
cvectorizer = CountVectorizer()
cvectorized_X = cvectorizer.fit_transform(X_train)
cvectorized_X_test = cvectorizer.transform(X_test)

vec_df = pd.DataFrame(cvectorized_X.toarray(), columns=cvectorizer.get_feature_names_out())
print(vec_df.shape)
vec_df

(59058, 18811)


Unnamed: 0,11,aa,aaaallliii,aali,ab,ablimo,about,acting,adam,adasi,...,یکیلو,یکیه,یکیو,یگ,یگانه,یی,ییب,ییسکو,ییسکوئ,ییه
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59054,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59056,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Using TfidfVectorizer

In [12]:
tvectorizer = TfidfVectorizer()
tvectorized_X = tvectorizer.fit_transform(X_train)
tvectorized_X_test = tvectorizer.transform(X_test)

vec_df = pd.DataFrame(tvectorized_X.toarray(), columns=tvectorizer.get_feature_names_out())
print(vec_df.shape)
vec_df

(59058, 18811)


Unnamed: 0,11,aa,aaaallliii,aali,ab,ablimo,about,acting,adam,adasi,...,یکیلو,یکیه,یکیو,یگ,یگانه,یی,ییب,ییسکو,ییسکوئ,ییه
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Using word2vec gensim

In [13]:
def transform(X, model):
    singleDataItemEmbedding = np.zeros(100)
    embedded=[]
    for row in X:
        wordCount=0
        for word in row:
            if word in model.wv:
                singleDataItemEmbedding=singleDataItemEmbedding+model.wv[word]
                wordCount=wordCount+1
        if wordCount == 0:
            embedded.append([0.0] * 100)
        else:    
            singleDataItemEmbedding=singleDataItemEmbedding/wordCount  
            embedded.append(singleDataItemEmbedding)
    return embedded

In [14]:
word2vec_model=Word2Vec(X_train, min_count=1, workers=4)
wvectorized_X = transform(X_train, word2vec_model)
wvectorized_X_test = transform(X_test, word2vec_model)
wvectorized_X[0]

array([-0.23486617,  0.24166886, -0.22290097, -0.18840912, -0.02902828,
        0.00189095,  0.17254937,  0.19451919,  0.19121531,  0.18694273,
        0.00768116, -0.21328798, -0.25352433, -0.22513642, -0.36463285,
        0.11925242, -0.02470215,  0.07918755,  0.07545689,  0.07908999,
        0.01438279, -0.25701881, -0.12135038,  0.11792886, -0.39722959,
       -0.13912457,  0.03897722, -0.36257576, -0.02514064, -0.19543859,
       -0.13835933,  0.16194705,  0.08300001,  0.33877909, -0.15040748,
        0.20900746,  0.46220217,  0.27117889, -0.06023325, -0.027349  ,
        0.06553629,  0.27725639,  0.11152325, -0.36858903, -0.19897812,
       -0.16754104,  0.13963492,  0.15623683,  0.13753263,  0.21379005,
       -0.07636567,  0.03723719,  0.26753343,  0.11208913,  0.01781907,
       -0.06461927,  0.07680541, -0.06470375,  0.12298266,  0.11185242,
        0.17763689, -0.14600179, -0.01997962, -0.22582391,  0.06521953,
       -0.17600484,  0.06733819,  0.07063088, -0.19278821, -0.01

#### Using FastText gensim

In [15]:
fasttext_model=FastText(X_train, min_count=1, workers=4)
fvectorized_X = transform(X_train, fasttext_model)
fvectorized_X_test = transform(X_test, fasttext_model)
fvectorized_X[0]

array([-0.05565026, -0.07095725,  0.1395434 , -0.00094651,  0.03300567,
       -0.08787721,  0.0058505 ,  0.02027289,  0.1969049 ,  0.20711192,
        0.03216778, -0.6451599 , -0.01071723,  0.13157988, -0.07564722,
        0.04511601,  0.19749844,  0.3215181 , -0.02090897, -0.07903458,
       -0.03862619, -0.13167357, -0.30612626, -0.08158436, -0.33704233,
        0.15827095, -0.07805118,  0.20415802, -0.31913335, -0.01624858,
       -0.2791194 , -0.190673  ,  0.11376268,  0.28649324, -0.08497957,
        0.44113879,  0.22094694,  0.31564415, -0.03789346, -0.10897261,
       -0.04266175,  0.11378855,  0.28764662, -0.59972144, -0.17624136,
       -0.15096083, -0.05330103, -0.06418658,  0.10438083,  0.18349163,
       -0.0900192 , -0.05855512,  0.14169531, -0.03337781, -0.18505434,
       -0.26106485,  0.15137053,  0.02700835,  0.03152488,  0.12251208,
       -0.04068808, -0.07078365, -0.13309288, -0.13614426,  0.24853554,
       -0.22610103,  0.02738104,  0.09957232, -0.14501147,  0.12

## Classifiying with different classifiers

#### Using LogisticRegression

In [16]:
logreg = LogisticRegression(solver='newton-cg')

logreg.fit(cvectorized_X, y_train)
y_pred = logreg.predict(cvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83      5275
           1       0.82      0.85      0.83      5147

    accuracy                           0.83     10422
   macro avg       0.83      0.83      0.83     10422
weighted avg       0.83      0.83      0.83     10422



In [17]:
logreg.fit(tvectorized_X, y_train)
y_pred = logreg.predict(tvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.80      0.84      5275
           1       0.81      0.88      0.84      5147

    accuracy                           0.84     10422
   macro avg       0.84      0.84      0.84     10422
weighted avg       0.84      0.84      0.84     10422



In [18]:
logreg.fit(wvectorized_X, y_train)
y_pred = logreg.predict(wvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.65      0.68      5275
           1       0.67      0.75      0.71      5147

    accuracy                           0.70     10422
   macro avg       0.70      0.70      0.70     10422
weighted avg       0.70      0.70      0.70     10422



In [19]:
logreg.fit(fvectorized_X, y_train)
y_pred = logreg.predict(fvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.64      0.68      5275
           1       0.67      0.75      0.71      5147

    accuracy                           0.70     10422
   macro avg       0.70      0.70      0.69     10422
weighted avg       0.70      0.70      0.69     10422



#### Using MultinomialNB

In [20]:
multinomialNB = MultinomialNB()

multinomialNB.fit(cvectorized_X, y_train)
y_pred = multinomialNB.predict(cvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.78      0.82      5275
           1       0.80      0.88      0.84      5147

    accuracy                           0.83     10422
   macro avg       0.83      0.83      0.83     10422
weighted avg       0.83      0.83      0.83     10422



In [21]:
multinomialNB.fit(tvectorized_X, y_train)
y_pred = multinomialNB.predict(tvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.75      0.81      5275
           1       0.78      0.89      0.83      5147

    accuracy                           0.82     10422
   macro avg       0.83      0.82      0.82     10422
weighted avg       0.83      0.82      0.82     10422



In [22]:
scaler = MinMaxScaler()
scaled_X= scaler.fit_transform(wvectorized_X)
scaled_X_test = scaler.transform(wvectorized_X_test)
multinomialNB.fit(scaled_X, y_train)
y_pred = multinomialNB.predict(scaled_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.69      0.65      5275
           1       0.63      0.54      0.59      5147

    accuracy                           0.62     10422
   macro avg       0.62      0.62      0.62     10422
weighted avg       0.62      0.62      0.62     10422



In [23]:
scaled_X= scaler.fit_transform(fvectorized_X)
scaled_X_test = scaler.transform(fvectorized_X_test)
multinomialNB.fit(scaled_X, y_train)
y_pred = multinomialNB.predict(scaled_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.68      0.64      5275
           1       0.62      0.53      0.58      5147

    accuracy                           0.61     10422
   macro avg       0.61      0.61      0.61     10422
weighted avg       0.61      0.61      0.61     10422



#### Using RandomForestClassifier

In [24]:
randomFC = RandomForestClassifier(criterion='entropy')

randomFC.fit(cvectorized_X, y_train)
y_pred = randomFC.predict(cvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83      5275
           1       0.81      0.86      0.84      5147

    accuracy                           0.83     10422
   macro avg       0.83      0.83      0.83     10422
weighted avg       0.83      0.83      0.83     10422



In [25]:
randomFC.fit(tvectorized_X, y_train)
y_pred = randomFC.predict(tvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.79      0.83      5275
           1       0.80      0.88      0.84      5147

    accuracy                           0.83     10422
   macro avg       0.83      0.83      0.83     10422
weighted avg       0.83      0.83      0.83     10422



In [26]:
randomFC.fit(wvectorized_X, y_train)
y_pred = randomFC.predict(wvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.67      0.70      5275
           1       0.69      0.74      0.72      5147

    accuracy                           0.71     10422
   macro avg       0.71      0.71      0.71     10422
weighted avg       0.71      0.71      0.71     10422



In [27]:
randomFC.fit(fvectorized_X, y_train)
y_pred = randomFC.predict(fvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.67      0.70      5275
           1       0.69      0.75      0.72      5147

    accuracy                           0.71     10422
   macro avg       0.71      0.71      0.71     10422
weighted avg       0.71      0.71      0.71     10422



#### Using XGBClassifier

In [28]:
xgb = XGBClassifier()

xgb.fit(cvectorized_X, y_train)
y_pred = xgb.predict(cvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83      5275
           1       0.81      0.86      0.83      5147

    accuracy                           0.83     10422
   macro avg       0.83      0.83      0.83     10422
weighted avg       0.83      0.83      0.83     10422



In [29]:
xgb.fit(tvectorized_X, y_train)
y_pred = xgb.predict(tvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.78      0.82      5275
           1       0.80      0.88      0.84      5147

    accuracy                           0.83     10422
   macro avg       0.83      0.83      0.83     10422
weighted avg       0.83      0.83      0.83     10422



In [30]:
xgb.fit(wvectorized_X, y_train)
y_pred = xgb.predict(wvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.68      0.71      5275
           1       0.70      0.76      0.73      5147

    accuracy                           0.72     10422
   macro avg       0.72      0.72      0.72     10422
weighted avg       0.72      0.72      0.72     10422



In [31]:
xgb.fit(fvectorized_X, y_train)
y_pred = xgb.predict(fvectorized_X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.67      0.70      5275
           1       0.69      0.75      0.72      5147

    accuracy                           0.71     10422
   macro avg       0.71      0.71      0.71     10422
weighted avg       0.71      0.71      0.71     10422

