In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv("amazon_cells_labelled.csv",  names=["sentences","label"])
df2= pd.read_csv("yelp_labelled.csv",  names=["sentences","label"])

In [3]:
df= pd.concat([df1, df2], ignore_index=True)

In [4]:
df.head()

Unnamed: 0,sentences,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [5]:
df.isnull().sum()

sentences    0
label        0
dtype: int64

In [7]:
df.shape

(2000, 2)

In [8]:
#getting independent features
X = df.drop('label', axis=1)

In [9]:
X

Unnamed: 0,sentences
0,So there is no way for me to plug it in here i...
1,"Good case, Excellent value."
2,Great for the jawbone.
3,Tied to charger for conversations lasting more...
4,The mic is great.
...,...
1995,I think food should have flavor and texture an...
1996,Appetite instantly gone.
1997,Overall I was not impressed and would not go b...
1998,"The whole experience was underwhelming, and I ..."


In [10]:
#getting dependent features
y = df['label']

In [11]:
import nltk
import re
from nltk.corpus import stopwords

In [12]:
sentences = X.copy()

In [13]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [14]:
corpus = []

for i in range(0, len(sentences)):
    review = re.sub('[^a-zA-Z]',' ' ,sentences['sentences'][i])
    review = review.lower()
    review = review.split()
    #review= [ps.stem(word) for word in review if  not word in stopwords.words('english')]
    review= [lemmatizer.lemmatize(word,pos='v') for word in review if word not in set (stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)



In [15]:
corpus

['way plug us unless go converter',
 'good case excellent value',
 'great jawbone',
 'tie charger conversations last minutes major problems',
 'mic great',
 'jiggle plug get line right get decent volume',
 'several dozen several hundred contact imagine fun send one one',
 'razr owner must',
 'needless say waste money',
 'waste money time',
 'sound quality great',
 'impress go original battery extend battery',
 'two seperated mere ft start notice excessive static garble sound headset',
 'good quality though',
 'design odd ear clip comfortable',
 'highly recommend one blue tooth phone',
 'advise everyone fool',
 'far good',
 'work great',
 'click place way make wonder long mechanism would last',
 'go motorola website follow directions could get pair',
 'buy use kindle fire absolutely love',
 'commercials mislead',
 'yet run new battery two bar three days without charge',
 'buy mother problem battery',
 'great pocket pc phone combination',
 'own phone months say best mobile phone',
 'thin

In [16]:
import tensorflow as tf

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

In [18]:
voc_size = 1000

In [19]:
onehot_repr = [one_hot(words,voc_size) for words in corpus]

In [20]:
onehot_repr

[[254, 333, 112, 737, 52, 487],
 [763, 842, 939, 839],
 [899, 239],
 [101, 717, 781, 663, 783, 527, 240],
 [49, 899],
 [687, 333, 314, 336, 666, 314, 231, 780],
 [865, 768, 865, 662, 848, 752, 927, 2, 339, 339],
 [374, 615, 388],
 [561, 219, 40, 773],
 [40, 773, 958],
 [618, 680, 899],
 [768, 52, 122, 842, 361, 842],
 [56, 493, 470, 309, 773, 37, 37, 39, 857, 618, 65],
 [763, 680, 822],
 [134, 288, 856, 951, 26],
 [939, 847, 339, 356, 480, 406],
 [726, 268, 642],
 [500, 763],
 [687, 899],
 [257, 165, 254, 690, 2, 104, 322, 79, 663],
 [52, 458, 873, 134, 21, 665, 314, 569],
 [930, 901, 879, 867, 633, 333],
 [29, 281],
 [76, 532, 414, 842, 56, 534, 961, 962, 285, 324],
 [930, 188, 16, 842],
 [899, 373, 208, 406, 471],
 [296, 406, 459, 219, 488, 811, 406],
 [161, 105, 186, 235],
 [682, 343, 560, 489, 557, 410, 489, 406],
 [817, 324],
 [281, 793, 406, 901, 383, 657],
 [201, 575, 682, 949, 628, 629, 903],
 [550, 468, 836, 387, 117],
 [56, 234, 304, 390, 141, 406],
 [836, 717, 88, 928, 717, 

In [21]:
sent_length = 25

In [22]:
embedded_docs = pad_sequences(onehot_repr, padding="post", maxlen=sent_length)

In [23]:
embedded_docs[1]

array([763, 842, 939, 839,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [30]:
#from tensorflow.keras.layers import Dropout
embedding_vector_features = 20

model = Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length = sent_length))
#model.add(Dropout(0.3))
model.add(LSTM(100))
#model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

None


In [31]:
len(embedded_docs),y.shape

(2000, (2000,))

In [32]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [33]:
X_final.shape,y_final.shape

((2000, 25), (2000,))

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.3, random_state=42)

In [35]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.5055 - loss: 0.6934 - val_accuracy: 0.4767 - val_loss: 0.6949
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.4984 - loss: 0.6933 - val_accuracy: 0.5350 - val_loss: 0.6901
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.6261 - loss: 0.6373 - val_accuracy: 0.7183 - val_loss: 0.5444
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.8270 - loss: 0.4196 - val_accuracy: 0.7550 - val_loss: 0.5349
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8645 - loss: 0.3321 - val_accuracy: 0.7400 - val_loss: 0.6110
Epoch 6/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.8934 - loss: 0.2910 - val_accuracy: 0.7183 - val_loss: 0.6621
Epoch 7/10
[1m22/22[0m [32m━━━━

<keras.src.callbacks.history.History at 0x2595ac94c20>

In [36]:
y_pred = model.predict(X_test)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


In [37]:
y_pred = np.where(y_pred> 0.6,1,0)

In [40]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [39]:
accuracy_score(y_pred, y_test)

0.7116666666666667

In [42]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.76      0.71      0.73       335
           1       0.66      0.71      0.69       265

    accuracy                           0.71       600
   macro avg       0.71      0.71      0.71       600
weighted avg       0.72      0.71      0.71       600

