In [1]:
from __future__ import unicode_literals
from hazm import *
import tensorflow as tf
from keras.models import Sequential
import pandas as pd
from keras.layers import Dense
import numpy as np
import re
from urlextract import URLExtract
import emojis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing, metrics
from keras.layers import Dense, Dropout,Activation
from keras.optimizers import Adadelta,Adam,RMSprop
from keras import utils as np_utils
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Load and Read Data

In [None]:
corpus = pd.read_csv('digikala-comments-raw.csv', on_bad_lines= 'skip')

In [88]:
corpus.head()

Unnamed: 0,id,title,body,created_at,rate,recommendation_status,is_buyer,product_id,advantages,disadvantages,likes,dislikes,seller_title,seller_code,true_to_size_rate
0,53672599,پیشنهاد نمیشود,به درد نمیخوره,23 شهریور 1402,1.0,not_recommended,True,252058,,,0,0,دیجی‌کالا,5A52N,
1,9897229,بسته بندی بد,می‌تونست به عنوان یه کالای فرهنگی بهتر بسته بن...,16 تیر 1399,0.0,recommended,True,252058,['تجربه جالبی بود برام '],['بسته بندی جالبی نداشت'],1,0,دیجی‌کالا,5A52N,
2,38074516,برس ریمل,بسته بندیش خوب بود\r\n کاربرد و کیفیتشم خیلی خ...,26 مرداد 1401,0.0,recommended,True,3331597,,,0,0,آرالیا بیوتی,ADM47,
3,18628562,خوبه و خوشرنگ,به نظرم خوبه فقط یکم ظریفه. از رنگش خوشم اومد ...,28 اسفند 1399,0.0,recommended,True,3331329,,,0,0,اینجاست آ,9ZMCZ,
4,53301258,برس رنگ مو,معمولیه اگه واسه خونه رنگ کردن شخصی میخواین او...,12 شهریور 1402,3.0,recommended,True,3255700,,,0,0,گالری آرایشی به سیما,CDWHA,


In [106]:
corpus['recommendation_status'].values
corpus = corpus.dropna(subset=['recommendation_status'])
corpus = corpus.drop(corpus[(corpus.recommendation_status == 'no_idea')].index)
print(corpus.shape)

cols= ['body','recommendation_status']
cps = corpus[cols].values
new_corpus = corpus[cols]

(76909, 15)


In [107]:
le_status = preprocessing.LabelEncoder()
le_status.fit(['not_recommended','recommended'])
cps[:,1] = le_status.transform(cps[:,1])

In [108]:
list1 = cps[:,1].tolist()
new_corpus['status_label'] = list1
new_corpus.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_corpus['status_label'] = list1


Unnamed: 0,body,recommendation_status,status_label
0,به درد نمیخوره,not_recommended,0
1,می‌تونست به عنوان یه کالای فرهنگی بهتر بسته بن...,recommended,1
2,بسته بندیش خوب بود\r\n کاربرد و کیفیتشم خیلی خ...,recommended,1
3,به نظرم خوبه فقط یکم ظریفه. از رنگش خوشم اومد ...,recommended,1
4,معمولیه اگه واسه خونه رنگ کردن شخصی میخواین او...,recommended,1


# Preprocess and Cleaning Data

In [10]:
def _multiple_replace(mapping, text):
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))

def convert_fa_numbers(input_str):
    mapping ={
        '١' :'۱',
        '٢' :'۲',
        '٣' :'۳',
        '٤' :'۴',
        '٥' :'۵',
        '٦' :'۶',
        '٧' :'۷',
        '٨' :'۸',
        '٩' :'۹',
        '٠' :'۰',
        '۰' :'0',
        '۱' :'1',
        '۲' :'2',
        '۳' :'3',
        '۴' :'4',
        '۵' :'5',
        '۶' :'6',
        '۷' :'7',
        '۸' :'8',
        '۹' :'9',
        '.' :'.',
    }
    return _multiple_replace(mapping, input_str)

def convert_ar_characters(input_str):
    #Converts Arabic chars to related Persian unicode char
    
    mapping ={
        'ك' :'ک',
        'دِ': 'د',
        'بِ': 'ب',
        'زِ': 'ز',
        'ذِ': 'ذ',
        'شِ': 'ش',
        'سِ': 'س',
        'ى' :'ی',
        'ي' :'ی',
        
    }
    return _multiple_replace(mapping, input_str)

def preprocess(text):
    extractor = URLExtract()
    for url in extractor.gen_urls(text):
        text = text.replace(url,'<URL>')
    emj = emojis.get(text)
    for i in emj:
        if i in text:
            text = text.replace(i,'<emoji>')
    text = convert_fa_numbers(text)
    text = convert_ar_characters(text)
    # regex to detect and replace all smilies in the text with <smiley>
    text = re.sub(r"(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:\s?D|8-\)|:s?\||;\s?\)|:-\*|:-\||:-\(|:\s?P|:-P|:-p|:-O|:-o|:-0|:-\@|:\$|:-\^|:-&|:-\*|:-\+|:-\~|:-\'|:-\>|:-\<|:-\}|:-\{|\[:\s?\]|\[:\s?\]|:\s?\]|:\s?\}|:s?\{)",'<smiley>',text) 
    text = text.strip()
    text = re.sub(r'[<>#.:()"\'!?؟،,@$%^&*_+\[\]/]','',text)
    text = re.sub(r'[\s]{2,}',' ',text)
    text = re.sub(r'(\w)\1{2,}', r'\1',text)
    if re.search(r'[\u0600-\u06FF]', text):
        return (text)
    else:
        return 'None'
    
    
def modelEvaluation(y_test, predictions):    
    #Print model evaluation to predicted result 
    print ("\nAccuracy on validation set: {:.4f}".format(metrics.accuracy_score(y_test, predictions)))
    #print("\nAUC score : {:.4f}".format(roc_auc_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))

In [110]:
new_corpus['body'] = new_corpus['body'].astype("string")
new_corpus['body'] = new_corpus['body'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_corpus['body'] = new_corpus['body'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_corpus['body'] = new_corpus['body'].apply(str)


In [112]:
new_corpus['Cleaned'] = new_corpus['body'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_corpus['Cleaned'] = new_corpus['body'].apply(preprocess)


In [113]:
new_corpus.head()

Unnamed: 0,body,recommendation_status,status_label,Cleaned
0,به درد نمیخوره,not_recommended,0,به درد نمیخوره
1,می‌تونست به عنوان یه کالای فرهنگی بهتر بسته بن...,recommended,1,می‌تونست به عنوان یه کالای فرهنگی بهتر بسته بن...
2,بسته بندیش خوب بود\r\n کاربرد و کیفیتشم خیلی خ...,recommended,1,بسته بندیش خوب بود کاربرد و کیفیتشم خیلی خوبه ...
3,به نظرم خوبه فقط یکم ظریفه. از رنگش خوشم اومد ...,recommended,1,به نظرم خوبه فقط یکم ظریفه از رنگش خوشم اومد م...
4,معمولیه اگه واسه خونه رنگ کردن شخصی میخواین او...,recommended,1,معمولیه اگه واسه خونه رنگ کردن شخصی میخواین او...


In [114]:
new_corpus = new_corpus.dropna()
new_corpus.shape

(76909, 4)

In [115]:
new_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76909 entries, 0 to 99998
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   body                   76909 non-null  object
 1   recommendation_status  76909 non-null  object
 2   status_label           76909 non-null  int64 
 3   Cleaned                76909 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.9+ MB


# Save Clean Data for next Runs

In [119]:
new_corpus.to_csv('digikala_comments_preprocessed.csv')

In [128]:
new_corpus = pd.read_csv('digikala_comments_preprocessed.csv', on_bad_lines= 'skip')

In [65]:
with open('stopwords.txt', 'r', encoding ='utf-8') as f :
        stopwords = f.read().splitlines()
def rmv_sw (text):
    return ' '.join([word for word in text.split() if word not in stopwords])

new_corpus['Cleaned_sw_rmvd'] = new_corpus['Cleaned'].apply(rmv_sw)   

In [131]:
new_corpus.recommendation_status.value_counts()

recommended        70559
not_recommended     6350
Name: recommendation_status, dtype: int64

In [132]:
nrec = new_corpus[new_corpus['recommendation_status'].isin(['not_recommended'])]
rec = new_corpus[new_corpus['recommendation_status'].isin(['recommended'])]
rec = rec.head(6350)
new_corpus = pd.concat([rec, nrec])
new_corpus.recommendation_status.value_counts()

recommended        6350
not_recommended    6350
Name: recommendation_status, dtype: int64

# Count Vectorizing & TFIDF Vectorizing

In [133]:
count_vectorizer = CountVectorizer()
X_count_vectorized = count_vectorizer.fit_transform(new_corpus.Cleaned).todense()

In [134]:
vectorizer = TfidfVectorizer(min_df = 2, max_features = 10000 )
X_tfidf_vectorized =  vectorizer.fit_transform(new_corpus.Cleaned).todense()

In [135]:
labels = new_corpus['status_label'].values

In [136]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_count_vectorized, labels, test_size = 0.3 , random_state = 42) 

In [137]:
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(X_tfidf_vectorized, labels, test_size = 0.3 , random_state = 42) 

In [138]:
input_dim = X_tfidf_train.shape
print(input_dim)

(8890, 5677)


# Logistic Regression for Count Vectorization

In [139]:
xtr = np.asarray(X_train)
ytr = np.asarray(y_train)
xts = np.asarray(X_test)
yts = np.asarray(y_test)
classifier = LogisticRegression()
classifier.fit(xtr, ytr)
c_pred = classifier.predict(xts)
modelEvaluation(yts, c_pred)


Accuracy on validation set: 0.8874

Classification report : 
               precision    recall  f1-score   support

           0       0.89      0.88      0.89      1867
           1       0.89      0.89      0.89      1943

    accuracy                           0.89      3810
   macro avg       0.89      0.89      0.89      3810
weighted avg       0.89      0.89      0.89      3810


Confusion Matrix : 
 [[1652  215]
 [ 214 1729]]


# Logistic Regression for TF-IDF Vectorization

In [140]:
x_tfidf_tr = np.asarray(X_tfidf_train)
y_tfidf_tr = np.asarray(y_tfidf_train)
x_tfidf_ts = np.asarray(X_tfidf_test)
y_tfidf_ts = np.asarray(y_tfidf_test)

clf = LogisticRegression()
clf.fit(x_tfidf_tr, y_tfidf_tr)
tf_pred = clf.predict(x_tfidf_ts)
modelEvaluation(y_tfidf_ts, tf_pred)


Accuracy on validation set: 0.8885

Classification report : 
               precision    recall  f1-score   support

           0       0.87      0.91      0.89      1867
           1       0.91      0.87      0.89      1943

    accuracy                           0.89      3810
   macro avg       0.89      0.89      0.89      3810
weighted avg       0.89      0.89      0.89      3810


Confusion Matrix : 
 [[1693  174]
 [ 251 1692]]


# Neural Network on TF-IDF

In [141]:
nb_classes = 2
batch_size = 32
nb_epochs = 10

In [142]:
y_tfidf_train_cat = np_utils.to_categorical(y_tfidf_train,)

In [143]:
model = Sequential()

model.add(Dense(1000,input_shape= (input_dim[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(500))

model.add(Activation('relu'))

model.add(Dropout(0.5))

model.add(Dense(50))

model.add(Activation('relu'))

model.add(Dropout(0.5))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [144]:
tf.config.run_functions_eagerly(True)

In [145]:
model.fit(X_tfidf_train, y_tfidf_train_cat, batch_size = batch_size, epochs= nb_epochs, verbose=2)



Epoch 1/10
278/278 - 49s - 175ms/step - loss: 0.3934
Epoch 2/10
278/278 - 48s - 173ms/step - loss: 0.2108
Epoch 3/10
278/278 - 47s - 169ms/step - loss: 0.1315
Epoch 4/10
278/278 - 47s - 170ms/step - loss: 0.0795
Epoch 5/10
278/278 - 83s - 299ms/step - loss: 0.0634
Epoch 6/10
278/278 - 48s - 172ms/step - loss: 0.0497
Epoch 7/10
278/278 - 48s - 172ms/step - loss: 0.0418
Epoch 8/10
278/278 - 48s - 173ms/step - loss: 0.0376
Epoch 9/10
278/278 - 82s - 294ms/step - loss: 0.0379
Epoch 10/10
278/278 - 82s - 295ms/step - loss: 0.0370


<keras.src.callbacks.history.History at 0x1cf67fe9f60>

In [150]:
y_test_pred = model.predict(X_tfidf_test)
y_test_predclass = np.argmax(y_test_pred, axis = 1)
y_train_pred= model.predict(X_tfidf_train)
y_train_predclass = np.argmax(y_train_pred, axis = 1)

[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step


In [151]:
from sklearn.metrics import accuracy_score,classification_report
print("nDeep Neural Network - Test Accuracy:", (round(accuracy_score(y_tfidf_test, y_test_predclass),4))*100)
print("nDeep Neural Network - Train Accuracy:", (round(accuracy_score(y_tfidf_train, y_train_predclass),4))*100)

nDeep Neural Network - Test Accuracy: 88.14
nDeep Neural Network - Train Accuracy: 99.15


In [152]:
modelEvaluation(y_tfidf_test, y_test_predclass)


Accuracy on validation set: 0.8814

Classification report : 
               precision    recall  f1-score   support

           0       0.89      0.87      0.88      1867
           1       0.87      0.90      0.89      1943

    accuracy                           0.88      3810
   macro avg       0.88      0.88      0.88      3810
weighted avg       0.88      0.88      0.88      3810


Confusion Matrix : 
 [[1618  249]
 [ 203 1740]]


In [154]:
X_pred = vectorizer.transform([preprocess('بد بود ')]).todense()
print(np.argmax(model.predict(X_pred)))
print(model.predict(X_pred))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[[9.9999976e-01 2.6290570e-07]]


In [155]:
ncrps = new_corpus
ncrps.head()

Unnamed: 0.1,Unnamed: 0,body,recommendation_status,status_label,Cleaned
1,1,می‌تونست به عنوان یه کالای فرهنگی بهتر بسته بن...,recommended,1,می‌تونست به عنوان یه کالای فرهنگی بهتر بسته بن...
2,2,بسته بندیش خوب بود\r\n کاربرد و کیفیتشم خیلی خ...,recommended,1,بسته بندیش خوب بود کاربرد و کیفیتشم خیلی خوبه ...
3,3,به نظرم خوبه فقط یکم ظریفه. از رنگش خوشم اومد ...,recommended,1,به نظرم خوبه فقط یکم ظریفه از رنگش خوشم اومد م...
4,4,معمولیه اگه واسه خونه رنگ کردن شخصی میخواین او...,recommended,1,معمولیه اگه واسه خونه رنگ کردن شخصی میخواین او...
5,5,قبلا هم استفاده کردم اگه بلد باشین کار کردن با...,recommended,1,قبلا هم استفاده کردم اگه بلد باشین کار کردن با...


# LSTM with word embedding

In [156]:
Xx = ncrps['Cleaned']
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(Xx)
Xx = tokenizer.texts_to_sequences(Xx)
Xx = pad_sequences(Xx , maxlen=100)

In [157]:
Xx.shape

(12700, 100)

In [158]:
from sklearn.model_selection import train_test_split
Xx_train, Xx_test, yy_train, yy_test = train_test_split(Xx, labels, test_size = 0.2 , random_state = 42)

In [159]:
yy_train_cat = np_utils.to_categorical(yy_train, 2)
yy_test_cat = np_utils.to_categorical(yy_test, 2)

In [160]:
from tensorflow.keras import layers

In [161]:
rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=20000, output_dim=32, input_shape=(Xx_train.shape[1],)),
    tf.keras.layers.Bidirectional(layers.LSTM(128,return_sequences=True)),
    tf.keras.layers.Bidirectional(layers.LSTM(128)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

  super().__init__(**kwargs)


In [162]:
rnn_model.compile(loss=tf.keras.losses.categorical_crossentropy,
                 optimizer= tf.keras.optimizers.Adam(1e-4),
                 metrics=['accuracy'])

In [163]:
rnn_model.fit(Xx_train,yy_train_cat, epochs=3,
             validation_data =(Xx_test, yy_test_cat),
             validation_steps = 30)

Epoch 1/3




[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1056s[0m 3s/step - accuracy: 0.6079 - loss: 0.6530 - val_accuracy: 0.8591 - val_loss: 0.3451
Epoch 2/3
[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1394s[0m 4s/step - accuracy: 0.8805 - loss: 0.3097
Epoch 3/3


  self.gen.throw(typ, value, traceback)


[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1074s[0m 3s/step - accuracy: 0.9120 - loss: 0.2505 - val_accuracy: 0.8787 - val_loss: 0.3039


<keras.src.callbacks.history.History at 0x1cf67fc8fd0>

In [164]:
yy_test_pred = rnn_model.predict(Xx_test)
yy_test_predclass = np.argmax(yy_test_pred, axis=1)
yy_train_pred = rnn_model.predict(Xx_train)
yy_train_predclass = np.argmax(yy_train_pred, axis=1)

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 2s/step
[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m501s[0m 2s/step


In [165]:
from sklearn.metrics import accuracy_score,classification_report
print("nDeep Neural Network - Test Accuracy:", (round(accuracy_score(yy_test, yy_test_predclass),4))*100)
print("nDeep Neural Network - Train Accuracy:", (round(accuracy_score(yy_train, yy_train_predclass),4))*100)

nDeep Neural Network - Test Accuracy: 87.87
nDeep Neural Network - Train Accuracy: 93.26


In [166]:
modelEvaluation(yy_test, yy_test_predclass)


Accuracy on validation set: 0.8787

Classification report : 
               precision    recall  f1-score   support

           0       0.85      0.92      0.88      1258
           1       0.91      0.84      0.88      1282

    accuracy                           0.88      2540
   macro avg       0.88      0.88      0.88      2540
weighted avg       0.88      0.88      0.88      2540


Confusion Matrix : 
 [[1154  104]
 [ 204 1078]]
