In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
prefix_url='https://graphemy.ir/datasets'
def get_file(name):
    return f'{prefix_url}/{name}'

In [2]:
df=pd.read_csv(get_file('dg_data.csv'))
df.head()

Unnamed: 0,Text,Score,Suggestion
0,این اولین تجربه من برای خرید ایفون هست امروز...,100,1
1,خرید این محصول رو توصیه میکنم,84,1
2,1 ساله این گوشی رو دارم هیچ نقطه ضعفی ازش ند...,60,1
3,سلام خدمت دوستان این گوشی از همه نظر عالی کیف...,96,1
4,سلام دوستانی که نگران شکستن صفحه نمایش هستند ا...,92,1


In [3]:
df.Suggestion.value_counts() #imbalanced

Suggestion
1    2382
3     460
2     419
Name: count, dtype: int64

In [4]:
df['Suggestion'] = df['Suggestion'].replace(2,1)
df['Suggestion'] = df['Suggestion'].replace(3,0)
df.Suggestion.value_counts() #imbalanced

Suggestion
1    2801
0     460
Name: count, dtype: int64

In [5]:
from sklearn.utils import resample

In [6]:
minority = df[df['Suggestion']==0]
majority = df[df['Suggestion']==1]

df_upsampled = resample(minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(majority),    # match number in majority class
                                 random_state=27
)

df=pd.concat([majority,df_upsampled])
df.Suggestion.value_counts()

Suggestion
1    2801
0    2801
Name: count, dtype: int64

In [7]:
embedding_vector_length = 300
sentences_max_length = 70
lstm_units = 64
oov_token= "<OOV>"

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

2024-07-28 19:32:09.322851: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-28 19:32:09.329957: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-28 19:32:09.404996: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
texts=df.Text.values
labels=df.Suggestion.values
tokenizer=Tokenizer(num_words=3000,oov_token=oov_token)
tokenizer.fit_on_texts(texts)

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

12120


In [10]:
tokenizer.texts_to_sequences(['من خیلی حالم خوبه'])

[[11, 12, 2988, 62]]

In [11]:
tokenizer.texts_to_sequences(['من حالم خوبه'])

[[11, 2988, 62]]

In [12]:
encoded_texts=tokenizer.texts_to_sequences(['من حالم خوبه'])
encoded_texts

[[11, 2988, 62]]

In [13]:
encoded_texts=tokenizer.texts_to_sequences(texts)
len(encoded_texts)

5602

In [14]:
padded_sequence = pad_sequences(encoded_texts,maxlen=sentences_max_length,padding='post')
padded_sequence.shape

(5602, 70)

In [15]:
X_train,X_test,y_train,y_test = train_test_split(padded_sequence,labels,test_size=0.3,shuffle=True,random_state=1403)

In [16]:
X_train.shape,X_test.shape

((3921, 70), (1681, 70))

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding
model = Sequential()

model.add(Embedding(input_dim=vocab_size,output_dim=embedding_vector_length,input_length=sentences_max_length)) # randomization
model.add(LSTM(64))
model.add(Dense(64,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

In [None]:
n_epochs = 20
history = model.fit(X_train,y_train,epochs = n_epochs, validation_data=[X_test,y_test],verbose =1 )

In [None]:
model.evaluate(X_test,y_test)

In [None]:
text1='این محصول اصلا به درد نمی‌خوره'
text2='خیلی کار من رو راه انداخت'

In [None]:
seq1=tokenizer.texts_to_sequences([text1])
seq2=tokenizer.texts_to_sequences([text2])
pad1=pad_sequences(seq1,maxlen=sentences_max_length,padding='post')
pad2=pad_sequences(seq2,maxlen=sentences_max_length,padding='post')

In [None]:
model.predict(pad1)

In [None]:
model.predict(pad2)