In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding

In [2]:
prefix_url='https://graphemy.ir/datasets'
def get_file(name):
    return f'{prefix_url}/{name}'

In [3]:
df = pd.read_csv(get_file('dg_data.csv'))
df.head()

Unnamed: 0,Text,Score,Suggestion
0,این اولین تجربه من برای خرید ایفون هست امروز...,100,1
1,خرید این محصول رو توصیه میکنم,84,1
2,1 ساله این گوشی رو دارم هیچ نقطه ضعفی ازش ند...,60,1
3,سلام خدمت دوستان این گوشی از همه نظر عالی کیف...,96,1
4,سلام دوستانی که نگران شکستن صفحه نمایش هستند ا...,92,1


In [4]:
df.Suggestion.value_counts()

Suggestion
1    2382
3     460
2     419
Name: count, dtype: int64

In [5]:
df['Suggestion'] = df['Suggestion'].replace(2,1)
df['Suggestion'] = df['Suggestion'].replace(3,0)
df.Suggestion.value_counts()

Suggestion
1    2801
0     460
Name: count, dtype: int64

In [7]:
minority = df[df['Suggestion']==0]
majority = df[df['Suggestion']==1]
df_upsampled = resample(minority,replace=True,n_samples=len(majority),random_state=43)
df = pd.concat([majority,df_upsampled])
df.Suggestion.value_counts()

Suggestion
1    2801
0    2801
Name: count, dtype: int64

In [8]:
embedding_vector_length = 300
sentences_max_length = 70
lstm_units = 64
oov_token= "<OOV>"

In [10]:
texts = df.Text.values
labels = df.Suggestion.values
tokenizer = Tokenizer(num_words = 3000,oov_token = oov_token)
tokenizer.fit_on_texts(texts)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

12126


In [11]:
tokenizer.texts_to_sequences(['من خیلی حالم خوبه'])

[[11, 12, 2141, 63]]

In [12]:
tokenizer.texts_to_sequences(['من حالم خوبه'])

[[11, 2141, 63]]

In [13]:
encoded_texts = tokenizer.texts_to_sequences(['من حالم خوبه'])
encoded_texts

[[11, 2141, 63]]

In [14]:
encoded_texts = tokenizer.texts_to_sequences(texts)
len(encoded_texts)

5602

In [15]:
padded_sequence = pad_sequences(encoded_texts,maxlen=sentences_max_length,padding='post')
padded_sequence.shape

(5602, 70)

In [17]:
X_train,X_test,y_train,y_test = train_test_split(padded_sequence,labels,test_size=0.3,shuffle=True,random_state=1403)
X_train.shape,X_test.shape

((3921, 70), (1681, 70))

In [19]:
model = Sequential()
model.add(Embedding(input_dim = vocab_size,output_dim = embedding_vector_length,input_length = sentences_max_length))
model.add(LSTM(64))
model.add(Dense(64,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

2025-02-16 17:06:42.173474: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2025-02-16 17:06:42.173538: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2025-02-16 17:06:42.173570: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (milad-HP): /proc/driver/nvidia/version does not exist
2025-02-16 17:06:42.174004: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 70, 300)           3637800   
                                                                 
 lstm (LSTM)                 (None, 64)                93440     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3,735,465
Trainable params: 3,735,465
Non-trainable params: 0
_________________________________________________________________


In [20]:
n_epochs = 20
history = model.fit(X_train,y_train,epochs = n_epochs, validation_data=[X_test,y_test],verbose =1 )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
model.evaluate(X_test,y_test)



[0.30675095319747925, 0.9000594615936279]

In [22]:
text1 = 'این محصول اصلا به درد نمی‌خوره'
text2 = 'خیلی کار من رو راه انداخت'

In [23]:
seq1 = tokenizer.texts_to_sequences([text1])
seq2 = tokenizer.texts_to_sequences([text2])
pad1 = pad_sequences(seq1,maxlen = sentences_max_length,padding='post')
pad2 = pad_sequences(seq2,maxlen = sentences_max_length,padding='post')

In [24]:
model.predict(pad1)



array([[0.24846216]], dtype=float32)

In [25]:
model.predict(pad2)



array([[0.9537939]], dtype=float32)