In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data cleaning and Feature engineering

In [None]:
from google.colab import drive

# Make Google Drive visible to the Colab runtime; the dataset is read from it below.
drive.mount('/content/drive')

# Read the raw review dataset into a DataFrame.
reviews_csv_path = "/content/drive/MyDrive/DeGatto Project/EDA + Models/Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(reviews_csv_path)

Mounted at /content/drive


In [None]:
# Remove the columns that are not needed for sentiment labelling, then discard
# every row that still contains a missing value.
unused_columns = [
    "Unnamed: 0", "Clothing ID", "Age", "Title", "Positive Feedback Count",
    "Division Name", "Department Name", "Class Name",
]
df = df.drop(columns=unused_columns).dropna()

In [None]:
# Check the remaining columns, dtypes and non-null counts after the clean-up.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22641 entries, 0 to 23485
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Review Text      22641 non-null  object
 1   Rating           22641 non-null  int64 
 2   Recommended IND  22641 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 707.5+ KB


In [None]:
#df[df['Division Name'].isna()]

In [None]:
def categorise(row):
    """Map one review row to a sentiment label.

    Reads ``row['Rating']`` and, when needed, ``row['Recommended IND']``.

    Returns:
        'Positive' for a rating >= 4 that is recommended (flag == 1),
        'Negative' for a rating <= 2 that is not recommended (flag == 0),
        'Neutral' for a rating of exactly 3 and for the two mixed cases
        (high rating / not recommended, low rating / recommended),
        ``None`` when no rule matches.
    """
    rating = row['Rating']

    # Pick the pair of labels (recommended, not recommended) for the rating band.
    if rating >= 4:
        label_if_recommended, label_if_not = 'Positive', 'Neutral'
    elif rating == 3:
        # A middle rating is neutral regardless of the recommendation flag.
        return 'Neutral'
    elif rating <= 2:
        label_if_recommended, label_if_not = 'Neutral', 'Negative'
    else:
        return None

    recommended = row['Recommended IND']
    if recommended == 1:
        return label_if_recommended
    if recommended == 0:
        return label_if_not
    return None

# Label every review by passing each row through categorise.
df['Sentiment'] = df.apply(categorise, axis=1)

In [None]:
# The two columns the label was derived from are no longer needed.
label_source_columns = ['Rating', 'Recommended IND']
df.drop(columns=label_source_columns, inplace=True)

In [None]:
# Preview the first rows of the labelled data.
df.head()

Unnamed: 0,Review Text,Sentiment
0,Absolutely wonderful - silky and sexy and comf...,Positive
1,Love this dress! it's sooo pretty. i happene...,Positive
2,I had such high hopes for this dress and reall...,Neutral
3,"I love, love, love this jumpsuit. it's fun, fl...",Positive
4,This shirt is very flattering to all due to th...,Positive


# Data Preprocessing

In [None]:
# Renumber the rows 0..n-1; dropping rows earlier left gaps in the index labels.
df.reset_index(drop=True,inplace=True)

In [None]:
# Inspect the frame after the index reset.
df

Unnamed: 0,Review Text,Sentiment
0,Absolutely wonderful - silky and sexy and comf...,Positive
1,Love this dress! it's sooo pretty. i happene...,Positive
2,I had such high hopes for this dress and reall...,Neutral
3,"I love, love, love this jumpsuit. it's fun, fl...",Positive
4,This shirt is very flattering to all due to th...,Positive
...,...,...
22636,I was very happy to snag this dress at such a ...,Positive
22637,"It reminds me of maternity clothes. soft, stre...",Neutral
22638,"This fit well, but the top was very see throug...",Neutral
22639,I bought this dress for a wedding i have this ...,Neutral


In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# NOTE(review): `stopwords` and `ps` are not used by the cleaning step below;
# they are left in place in case other cells rely on them.
ps = PorterStemmer()

# Every character that is not an ASCII letter is replaced by a space.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def _clean_review(text):
    """Return `text` with non-letters removed, lower-cased, and whitespace collapsed."""
    letters_only = _NON_LETTERS.sub(' ', text)
    # split() + join() collapses runs of spaces and trims both ends.
    return ' '.join(letters_only.lower().split())


# Iterate over the column values directly instead of df.loc[i, ...] with
# i in range(len(df)): the positional loop only works when the index is exactly
# 0..n-1 and pays for a label lookup on every row.
corpus = [_clean_review(text) for text in df['Review Text']]

In [None]:
# Overwrite the raw review text with the cleaned strings collected in `corpus`.
df['Review Text']=corpus

In [None]:
# Inspect the frame now that 'Review Text' holds the cleaned strings.
df

Unnamed: 0,Review Text,Sentiment
0,absolutely wonderful silky and sexy and comfor...,Positive
1,love this dress it s sooo pretty i happened to...,Positive
2,i had such high hopes for this dress and reall...,Neutral
3,i love love love this jumpsuit it s fun flirty...,Positive
4,this shirt is very flattering to all due to th...,Positive
...,...,...
22636,i was very happy to snag this dress at such a ...,Positive
22637,it reminds me of maternity clothes soft stretc...,Neutral
22638,this fit well but the top was very see through...,Neutral
22639,i bought this dress for a wedding i have this ...,Neutral


In [None]:
# List the remaining column names.
df.columns

Index(['Review Text', 'Sentiment'], dtype='object')

In [None]:
#df=df.sort_values("Clothing ID")
#df.reset_index(drop=True,inplace=True)

In [None]:
# Model inputs: `words` is the review text column, `y` the sentiment label column.
words=df['Review Text']
y=df['Sentiment']

In [None]:
from sklearn import preprocessing

# Convert the string sentiment labels into integer class ids.
# presumably Negative=0, Neutral=1, Positive=2 — confirm via encoding.classes_
encoding = preprocessing.LabelEncoder()
y=encoding.fit_transform(y)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-grams, capped at 5000 features.
tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

# Fit on the review text, then convert the sparse result to a dense array.
tfidf_sparse = tfidf_v.fit_transform(words)
words = tfidf_sparse.toarray()

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Resample the feature matrix and labels with a fixed seed; sampling_strategy='all'
# asks the sampler to resample every class.
# NOTE(review): this runs before the train/test split further down, so copies of the
# same review can land on both sides of the split and inflate the test metrics.
# Consider splitting first and resampling only the training portion.
over = RandomOverSampler(sampling_strategy = 'all', random_state = 45)
words, y = over.fit_resample(words, y)

In [None]:
# Shape of the feature matrix after oversampling: (rows, features).
words.shape

(51783, 5000)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split of the feature matrix and labels with a fixed seed.
# NOTE(review): the lower-case x_train/x_test are not referenced by the later cells
# visible here, which use X_train/X_test produced by a second split.
x_train, x_test, y_train, y_test = train_test_split(words, y, test_size = 0.2, stratify = y, random_state = 42)

In [None]:
#pd.DataFrame(words).to_csv("words.csv")

# ML Algorithms

## Prediction Using Bidirectional LSTM

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [None]:
# Vocabulary size passed to the Embedding layer when the model is built.
voc_size=5000

In [None]:
# Re-check the feature matrix shape before building the network.
words.shape

(51783, 5000)

In [None]:
#onehot_repr=[one_hot(words,voc_size)for words in corpus]
#onehot_repr

In [None]:
#sent_length=70
#embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
#print(embedded_docs.shape)

In [None]:
# Feature matrix shape once more; the second value is the width of each input row.
words.shape

(51783, 5000)

In [None]:
## Creating model
embedding_vector_features=50
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=3000))
model.add(Dropout(0.3))
model.add((LSTM(100,return_sequences=True)))
model.add(Dropout(0.3))
model.add((LSTM(100)))
model.add(Dropout(0.33))
model.add(Dense(3,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3000, 50)          250000    
                                                                 
 dropout (Dropout)           (None, 3000, 50)          0         
                                                                 
 lstm (LSTM)                 (None, 3000, 100)         60400     
                                                                 
 dropout_1 (Dropout)         (None, 3000, 100)         0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 3)                 3

In [None]:
import numpy as np
# NumPy array versions of the feature matrix and the labels.
# NOTE(review): the later cells visible here keep using `words` / `y`;
# X_final / y_final appear to be unused.
X_final=np.array(words)
y_final=np.array(y)

In [None]:
# Number of labels in y_final.
y_final.shape

(51783,)

In [None]:
from sklearn.model_selection import train_test_split

# Split used to train and evaluate the network.
# stratify=y keeps the class proportions equal in both parts. It also makes this call
# identical (same data, test_size and random_state) to the earlier split, so the
# y_train / y_test overwritten here stay aligned with the earlier x_train / x_test
# instead of silently coming from a different partition.
X_train, X_test, y_train, y_test = train_test_split(
    words, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Train for 10 epochs in batches of 64, reporting metrics on (X_test, y_test) after each epoch.
# NOTE(review): the recorded run of this cell stopped with a ValueError during epoch 1.
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10


ValueError: ignored

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Sequential.predict_classes is no longer available in current tf.keras; the
# predicted class is the index of the largest softmax probability.
y_pred = np.argmax(model.predict(X_test), axis=-1)
accuracy_score(y_test,y_pred)

In [None]:
# classification_report was never imported in this notebook, so import it here.
from sklearn.metrics import classification_report

# Sequential.predict_classes is no longer available in current tf.keras; the
# predicted class is the index of the largest softmax probability.
predictions = np.argmax(model.predict(X_test), axis=-1)
print(classification_report(y_test, predictions))

#### Here we can see that the F1 scores of classes 0 and 1 have improved