In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data cleaning and Feature engineering

In [None]:
# Mount Google Drive in the Colab runtime, then read the reviews CSV stored on it.
from google.colab import drive

drive.mount('/content/drive')

df = pd.read_csv(
    "/content/drive/MyDrive/DeGatto Project/EDA + Models/Womens Clothing E-Commerce Reviews.csv"
)

Mounted at /content/drive


In [None]:
# Drop the columns this notebook does not use, then discard every row that
# still contains a missing value.
df = (
    df.drop(
        columns=[
            "Unnamed: 0",
            "Clothing ID",
            "Age",
            "Title",
            "Positive Feedback Count",
            "Division Name",
            "Department Name",
            "Class Name",
        ]
    )
    .dropna()
)

In [None]:
def categorise(row):
    """Map one review row to a sentiment label.

    Parameters
    ----------
    row : mapping
        Must provide 'Rating'; 'Recommended IND' is read only when the
        rating is >= 4 or <= 2.

    Returns
    -------
    str or None
        'Positive' for a rating >= 4 that is recommended (flag == 1),
        'Negative' for a rating <= 2 that is not recommended (flag == 0),
        'Neutral' for a rating of exactly 3 or when rating and flag disagree.
        None when no rule matches (e.g. a flag other than 0/1).
    """
    rating = row['Rating']

    if rating >= 4:
        recommended = row['Recommended IND']
        if recommended == 1:
            return 'Positive'
        if recommended == 0:
            return 'Neutral'
        return None

    if rating == 3:
        return 'Neutral'

    if rating <= 2:
        recommended = row['Recommended IND']
        if recommended == 1:
            return 'Neutral'
        if recommended == 0:
            return 'Negative'

    return None

# Label every row; categorise already takes a row, so no lambda wrapper is needed.
df['Sentiment'] = df.apply(categorise, axis=1)

In [None]:
## Rating and Recommended IND have served their purpose (they produced Sentiment),
## so both columns are removed from the dataset.
df = df.drop(columns=['Rating', 'Recommended IND'])

# Data Preprocessing

In [None]:
# Rows were removed by dropna above, so renumber the index from 0 and discard the old one.
df = df.reset_index(drop=True)

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# NOTE(review): `stopwords` and `ps` are not used in this cell; they are kept
# because cells outside this view may rely on them — confirm before removing.
ps = PorterStemmer()

# Normalise each review: every character that is not an ASCII letter becomes a
# space, the text is lower-cased, and runs of whitespace collapse to one space.
# Iterating over the column values (rather than df.loc[i, ...] with a counter)
# keeps this cell correct whatever the current index labels are.
corpus = [
    ' '.join(re.sub('[^a-zA-Z]', ' ', text).lower().split())
    for text in df['Review Text']
]

In [None]:
# Overwrite the raw review text with the cleaned strings collected in `corpus`.
df['Review Text']=corpus

In [None]:
# Text column for the vectoriser, sentiment column as the target.
words, y = df['Review Text'], df['Sentiment']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams to trigrams, limited to 3000 features.
tfidf_v = TfidfVectorizer(max_features=3000, ngram_range=(1, 3))

# Fit on the text column itself rather than on `words`: this cell rebinds
# `words` to the dense matrix, so reading `words` as input would make a second
# run of the cell feed the matrix back into the vectoriser.
words = tfidf_v.fit_transform(df['Review Text']).toarray()

# ML Algorithms

## Prediction Using Bidirectional LSTM

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [None]:
# Upper bound passed to one_hot() and used as the Embedding layer's input dimension.
voc_size=5000

In [None]:
# Encode every cleaned review as a list of integer word ids.
# NOTE(review): per the Keras docs one_hot relies on Python's built-in hash,
# which may differ between interpreter sessions — confirm before persisting
# or reusing these encodings across runs.
onehot_repr = [
    one_hot(document, voc_size)
    for document in corpus
]

In [None]:
# Bring every encoded review to a fixed length, padding at the front.
sent_length = 70
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0 ...  496 1986 2898]
 [   0    0    0 ... 2978 4633 4269]
 [3988 4472 3437 ... 2928 3276 3537]
 ...
 [   0    0    0 ... 3969 4379 2626]
 [2021 1376 4098 ... 3243  223 4480]
 [   0    0    0 ... 4589 3594 2104]]


In [None]:
## Creating model
# Embedding -> two stacked bidirectional LSTMs -> 3-way softmax, with a
# Dropout layer after each stage.
embedding_vector_features = 50

model = Sequential()
# Use sent_length (the padding length) instead of repeating the literal 70, so
# the model input and the padded sequences cannot drift apart.
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Dropout(0.3))
# return_sequences=True so the second recurrent layer receives the full sequence.
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.33))
model.add(Dense(3, activation='softmax'))

# The sparse loss expects integer class ids as targets.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 70, 50)            250000    
                                                                 
 dropout_3 (Dropout)         (None, 70, 50)            0         
                                                                 
 bidirectional_2 (Bidirectio  (None, 70, 200)          120800    
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, 70, 200)           0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 200)              240800    
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 200)              

In [None]:
import numpy as np

# Padded integer sequences are the model input.
X_final = np.array(embedded_docs)

# `y` holds the string labels produced by categorise(), but the model is
# compiled with sparse_categorical_crossentropy, which needs integer class ids
# in [0, 3). Passing the strings through makes model.fit fail, so encode them.
label_to_id = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
encoded_labels = pd.Series(y).map(label_to_id)
if encoded_labels.isna().any():
    # Fail loudly instead of training on silently dropped or NaN targets.
    raise ValueError(
        "Sentiment contains values outside {}".format(sorted(label_to_id))
    )
y_final = encoded_labels.to_numpy(dtype='int64')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: shape of the training labels produced by the split.
y_train.shape

(18112,)

In [None]:
# Train for 10 epochs in batches of 64, reporting metrics on the held-out split after each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10


UnimplementedError: ignored

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Sequential.predict_classes() was removed from TensorFlow; for a softmax
# output the equivalent is the argmax over the class-probability axis.
y_pred = np.argmax(model.predict(X_test), axis=1)

# NOTE(review): assumes y_test holds integer class ids comparable to y_pred — confirm.
accuracy_score(y_test, y_pred)

In [None]:
# classification_report is not imported by any earlier visible cell, so import it here.
from sklearn.metrics import classification_report

# Sequential.predict_classes() was removed from TensorFlow; take the argmax of
# the softmax probabilities to get the predicted class ids instead.
predictions = np.argmax(model.predict(X_test), axis=1)
print(classification_report(y_test, predictions))

#### Here we can see that the F1 scores of classes 0 and 1 have improved.