<h4>Loading the normalized Comments</h4>

In [1]:
import pandas as pd

comments = pd.read_csv("../../data/comments.normalized.csv")
comments['comment'].fillna(' ', inplace=True)
comments.head()

Unnamed: 0,rate,comment
0,1,در مقایسه آی‌پد نسل و مینی نسل هر...
1,1,بسیار زیبا و مناسب برای طراحی و کارای روز مره ...
2,1,عالیه حرف نداره بازی ها هم خیلی خوب اجرا میکنه
3,1,جنس بدنه با کیفیت نرم افزار روان و بی نقص در ...
4,1,کالا اصل بود


<h4>Count Vectorizer</h4>


In [2]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=3, max_features= 10000)
X_vectorized = vectorizer.fit_transform(comments['comment']).todense()

print('Shape',X_vectorized.shape)
words = vectorizer.get_feature_names_out()
print(len(words),words)

# Save vectorizer for further usages
with open('vectorizer.pkl','wb') as f:
    pickle.dump(vectorizer,f)

Shape (144717, 10000)
10000 ['آب' 'آبان' 'آبدیت' ... 'یکیو' 'یگ' 'یی']


<h4>Split dataset for training</h4>

In [3]:
from sklearn.model_selection import train_test_split

labels = comments['rate'].values
unique_labels = sorted(list(set(labels)))
print('Unique Labels',unique_labels)

X_train, X_test, y_train, y_test = train_test_split(X_vectorized, labels, test_size=0.2)

print(f'X_test: {X_test.shape}')
print(f'X_train: {X_train.shape}')

Unique Labels [-1, 0, 1]
X_test: (28944, 10000)
X_train: (115773, 10000)


<h4>Train an ANN model</h4>

In [4]:
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Activation

# Params
nb_epochs = 3
nb_classes = 3
batch_size = 64

input_dim = X_train.shape
y_train_cat = to_categorical(y_train,nb_classes)

model = Sequential()

model.add(Dense(512,input_shape= (input_dim[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='Adam',metrics=['accuracy'])

model.fit(X_train, y_train_cat, batch_size=batch_size, epochs=nb_epochs, verbose=1, validation_split=0.1)

model.save('saved.model')




Epoch 1/3


Epoch 2/3
Epoch 3/3
INFO:tensorflow:Assets written to: saved.model\assets


INFO:tensorflow:Assets written to: saved.model\assets


In [5]:
import numpy as np

y_test_pred = model.predict(X_test)
y_test_pred_class = np.argmax(y_test_pred, axis=1)
y_train_pred = model.predict(X_train)
y_train_pred_class = np.argmax(y_train_pred, axis=1)



In [6]:
from sklearn.metrics import accuracy_score
print ("Test accuracy:",(round(accuracy_score(y_test, y_test_pred_class),4)*100))
print ("Train accuracy:",(round(accuracy_score(y_train, y_train_pred_class),4)*100))

Test accuracy: 76.95
Train accuracy: 80.24
