In [66]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')



In [67]:
# Load the data
df = pd.read_csv("../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv", index_col=0)

# 1. **About the data:**

In [68]:
# The first 5 rows of the data
df.head()

In [5]:
# The dimension f the data
df.shape

In [6]:
# The name of columns in the data
df.columns

In [7]:
# Data types of each column
df.dtypes

In [8]:
# The number of unique Clothing IDs
df['Clothing ID'].nunique()

In [9]:
# List of Division Names
df['Division Name'].unique()

In [10]:
# List of Department Names
df['Department Name'].unique()

In [11]:
# List of Class Names
df['Class Name'].unique()

In [12]:
# Number of null in each column
df.isnull().sum()

# 2.1 **Exploratory data analysis - Univariate Analysis**

In [13]:
# Distribution of Age column
sns.histplot(df.Age, bins = 15)

The majority of reviewers age between 30 and 50 yeare old.

In [14]:
sns.countplot(df["Rating"])

The majority of reviewers rated 5, and the minority rated 1.

In [15]:
(df['Rating'].value_counts()).plot(kind = "pie", autopct='%1.1f')

The majority of reviewers (55.9%) rated "5".
The minority of reviewers (3.6%) rated "1".
77.5% of reviewers rated "4" and "5" (satisfied).
10.3% of reviewers rated "1" and "2" (not satisfied).
12.2% of reviewers rated "3" (neutral).

In [16]:
sns.countplot(df["Recommended IND"])

The majority of reviewers recommended the products.

In [17]:
(df["Recommended IND"].value_counts()).plot(kind = "pie", autopct='%1.1f')

The majority of reviewers (82.2%) recommended the products.
The minority of reviewers (17.8%) did not recommend the products.

In [18]:
# Distribution of Division Names
df['Division Name'].value_counts().plot(kind = "bar")

The majority of Division Names are General, and the minority are Intimates. 

In [19]:
df['Division Name'].value_counts().plot(kind = "pie", autopct='%1.1f')

The majority of Division Names (59%) are General, and the minority (6.4%) are Intimates. 

In [20]:
# Distribution of Department Names
df['Department Name'].value_counts().plot(kind = "bar")

The majority of Department Names are Tops, and the minority are Trend. 

In [21]:
df['Department Name'].value_counts().plot(kind = "pie", autopct='%1.1f')

The majority of Department Names (44.6%) are Tops, and the minority (0.5%) are Trend. 

# 2.2. **Exploratory data analysis - Bivariate Analysis**

In [22]:
g = sns.FacetGrid(df, col="Division Name", col_wrap=3)
g.map(sns.countplot, "Rating")

The shape of the Rating distribution is almost the same for different Division Names. The majority of reviewers rated 5 and the minority rated 1 for each Division Name.

In [23]:
g = sns.FacetGrid(df, col="Department Name", col_wrap=3)
g.map(sns.countplot, "Rating")

The shape of the Rating distribution is almost the same for different Department Names. The majority of reviewers rated 5 and the minority rated 1 for each Department Name.

In [24]:
# Age distribution for both recommended and not-recommended groups 
sns.boxplot(y="Age", x="Recommended IND", data=df)

For both recommended and not-recommended groups, the median age is between 40 and 45 years old.    

In [25]:
# Age distribution for each Rating group 
sns.boxplot(y="Age", x="Rating", data=df)

For all Rating groups, the median age is between 40 and 45 years old.    

In [26]:
# Age distribution for each Devision Name 
sns.boxplot(y="Age", x="Division Name", data=df)

For all Division groups, the median age is almost between 40 and 45 years old.    

In [27]:
# Relation between Rating and recommendation
sns.stripplot(y="Age", x="Rating", hue="Recommended IND", data=df)

Almost all reviewers who rate 1 and 2 did not recommend the product and almost all reviewers who rate 4 and 5 did recommend.  

In [28]:
# Number of recommoned products for each Division
sns.stripplot(y="Age", x="Division Name", hue="Recommended IND", data=df)

The majority of reviewers recommended the product for each Division Name.

In [29]:
# Number of recommoned products for each Department
sns.stripplot(y="Age", x="Department Name", hue="Recommended IND", data=df)

The majority of reviewers recommended the product for each Department Name.

# **3. Data Preprocessing**

**3.1 Removing duplicates**

In [69]:
# Checking for duplicated entries
duplicates = df.duplicated().sum() 
if  duplicates == 0:
    print("There are no duplicted rows in this data")
else:
    print(f"There are: {duplicates} duplicated rows")

In [70]:
# Drop the duplicated rows
df=df.drop_duplicates()

**3.2 Handling missing values**

In [71]:
# Number of null in each column
df.isnull().sum()

In [72]:
# Drop rows with null values in "Review Text" column
df = df.dropna(subset=["Review Text"])

In [73]:
# Number of null in each column
df.isnull().sum()

In [74]:
# selecting only required columns for ML models
df_text = df[["Review Text", "Recommended IND"]]

**3.3 Preprocess text with tokenization, removing stopwords and punctuations, and Lemmatisation**  

In [75]:
# Preprocess the text with function processtext()
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")

stop_words = stopwords.words('english')
# Remove Negative verbs and "not" from stop_words
stop_words_new = stop_words[:-36]
stop_words_new.remove("not")

punctuations = list(string.punctuation)
lemma = WordNetLemmatizer() # for Lemmatization

def processtext(text):
    text=re.sub("[^a-zA-Z]"," ",text) # Filter to allow only alphabets in text
    text=text.lower() # Convert the text to lowercase to maintain consistency
    tokens=word_tokenize(text) # Tokenize the text
    # Remove stopwords and punctuations
    tokens=[token for token in tokens if token not in stop_words_new and token not in punctuations] 
    tokens=[lemma.lemmatize(token) for token in tokens] # Lemmatization of tokens
    text=" ".join(tokens)
    return text

In [77]:
# Preprocess the "Review Text" column with function processtext()
df_text["processed_Review"] = df_text["Review Text"].apply(lambda x: processtext(x))

**3.4 Over-samplig**

Since the classification data (recommended and not-recommended) is unbalanced, we will do over-sampling which involves adding more examples from the minority class (not-recommended).

In [78]:
# number of reviews that recommeneded the product
df_text[(df_text['Recommended IND']==1)].shape[0]

In [79]:
from sklearn.utils import resample

# Create two different dataframes of majority and minority class 
df_majority = df_text[(df_text['Recommended IND']==1)] 
df_minority = df_text[(df_text['Recommended IND']==0)] 

# Oversample the minority class
df_minority_oversampled = resample(df_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples=18539 , # to match majority class with 18539 rows
                                 random_state=42)  # reproducible results
                                 
# Combine majority class with oversampled minority class
df_oversampled = pd.concat([df_minority_oversampled, df_majority])

In [80]:
# Distribution of the new oversampled dataframe
df_oversampled['Recommended IND'].value_counts().plot(kind="pie", autopct='%1.1f')

The classification data (recommended and not-recommended) is now balanced,

In [81]:
# Define X and y for ML models
X = df_oversampled["processed_Review"]
y = df_oversampled["Recommended IND"]

**3.5 Vectorization of processed text**

In [47]:
# Convert a collection of text features to a matrix of word counts
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X = (tfidf.fit_transform(X)).toarray()
# pd.DataFrame(X, columns = tfidf.get_feature_names())

# split the data (X, y) to train-data and test-data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, stratify=y, random_state=101)

from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix

# **4. Machine Learning models**

In [48]:
def ML_models_performance(model, X_train, y_train ,X_test ,y_test, y_pred, model_name):
 
    performance_df=pd.DataFrame({'Train_accuracy':model.score(X_train,y_train),"Test_accuracy":model.score(X_test,y_test),
                       "Precision":precision_score(y_pred,y_test),"Recall":recall_score(y_pred,y_test),
                       "F1_Score":f1_score(y_pred,y_test)}, index=[model_name])
    return performance_df

In [49]:
from sklearn.linear_model import LogisticRegression

# Create the LogisticRegression model
lr = LogisticRegression( )
# Fit the model
lr.fit(X_train, y_train)
# Use the trained model to predict
y_pred = lr.predict(X_test)
# model accuracy
print(f'Model train accuracy: {lr.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {lr.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [50]:
lr_performance = ML_models_performance(lr, X_train, y_train ,X_test ,y_test, y_pred, "Logisitc Regression")
lr_performance

In [51]:
from sklearn.svm import LinearSVC

# Create the LinearSVC model with some regularization
LSVC = LinearSVC(random_state=1, dual=False, C=0.5)
# Fit the model
LSVC.fit(X_train, y_train)
# Use the trained model to predict
y_pred = LSVC.predict(X_test)

# model accuracy
print(f'Model train accuracy: {LSVC.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {LSVC.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [52]:
LSVC_performance = ML_models_performance(LSVC, X_train, y_train ,X_test ,y_test, y_pred, "LinearSVC")
LSVC_performance

In [53]:
from sklearn.ensemble import RandomForestClassifier

# Create the RandomForestClassifier model
rfc = RandomForestClassifier(random_state=1)
# Fit the model
rfc.fit(X_train, y_train)
# Use the trained model to predict
y_pred = rfc.predict(X_test)
# model accuracy
print(f'Model train accuracy: {rfc.score(X_train, y_train)*100:.3f}%')
print(f'Model test accuracy: {rfc.score(X_test, y_test)*100:.3f}%')
print(f'Model test precision: {precision_score(y_pred,y_test):.3f}')
print(f'Model test recall: {recall_score(y_pred,y_test):.3f}')
print(f'Model test f1_score: {f1_score(y_pred,y_test):.3f}')

In [54]:
rfc_performance = ML_models_performance(rfc, X_train, y_train ,X_test ,y_test, y_pred, "Random Forest")
rfc_performance

In [55]:
comparison_df = pd.concat([lr_performance, LSVC_performance, rfc_performance])
comparison_df

From the comparison, Random Forest model works quite well with test accuracy 0.97, precision 0.95, recall 0.99, and f1_score 0.97.

# **5. Deep Learning Models**

In [82]:
x = df_oversampled['Review Text'].values
y = df_oversampled['Recommended IND'].values

I now vectorize 'Review Text' in Keras, by turning each review into a sequence of integers. 

In [83]:
# Turning each review into a sequence of integers 
from keras.preprocessing.text import Tokenizer
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x)
xtokens= tokenizer.texts_to_sequences(x)

In [84]:
len_seqs = []
for i in range(len(xtokens)):
    len_seqs.append(len(xtokens[i]))
    
print(f'maximum seuence length is {max(len_seqs)}.') 
#print(max(set(len_seqs), key = len_seqs.count))

len_seqs_array = np.array(len_seqs)
print(f'The number of sequences with lengh more than 50 is  {(len_seqs_array>50).sum()}.')

In [85]:
# Pad sequences to the same length (maxlen)
from keras.preprocessing.sequence import pad_sequences

maxlen=50
xpad=pad_sequences(xtokens,padding='post', maxlen=maxlen)

In [86]:
xpad.shape

In [87]:
# train_test splot
x_train, x_test, y_train, y_test = train_test_split(xpad, y, test_size=0.2, stratify=y,random_state=42)

In [88]:
x_train.shape

In [89]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout,GRU,Bidirectional
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2

model=Sequential()

model.add(Embedding(input_dim=num_words,output_dim=32,input_length=maxlen))

model.add(LSTM(units=64,dropout=0.4,return_sequences=True))
model.add(LSTM(units=32,dropout=0.4,return_sequences=True))
model.add(LSTM(units=16))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
model.summary()


#model.compile('rmsprop', 'mse')
#model.summary()
#output = model.predict(xpad)
#print(output.shape)
#print(output)

In [90]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor="val_loss", mode="auto", 
                           verbose=1, patience = 10, restore_best_weights=True)

In [91]:
model.fit(x_train,y_train, epochs=20, batch_size=32, validation_data=(x_test , y_test), callbacks= [early_stop])

In [92]:
model_loss = pd.DataFrame(model.history.history)
model_loss.head()

In [93]:
model_loss.plot()

In [94]:
y_pred = (model.predict(x_test)>=0.5).astype("int32")

In [95]:
LSTM_performance=pd.DataFrame({'Train_accuracy':(model.evaluate(x_train,y_train))[1],"Test_accuracy":(model.evaluate(x_test,y_test))[1],
                       "Precision":precision_score(y_pred,y_test),"Recall":recall_score(y_pred,y_test),
                       "F1_Score":f1_score(y_pred,y_test)}, index=["LSTM"])
LSTM_performance

In [97]:
comparison_df = pd.concat([lr_performance, LSVC_performance, rfc_performance, LSTM_performance])
comparison_df

In [99]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout,GRU,Bidirectional
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2

model2 = Sequential()

model2.add(Embedding(input_dim=num_words,output_dim=32,input_length=maxlen)) 

model2.add(GRU(units=64, return_sequences=True))  
model2.add(GRU(units=32, return_sequences=True))
model2.add(GRU(units=16)) 
model2.add(Dropout(0.4))
model2.add(Dense(1, activation='sigmoid')) 
 
model2.compile(loss="binary_crossentropy", optimizer='Adam', metrics=["Accuracy"])
model2.summary()

In [100]:
model2.fit(x_train,y_train, epochs=20, batch_size=32, validation_data=(x_test , y_test), callbacks= [early_stop])

In [101]:
model2_loss = pd.DataFrame(model2.history.history)
model2_loss.head()

In [102]:
model2_loss.plot()

In [103]:
y_pred = (model2.predict(x_test)>=0.5).astype("int32")

In [104]:
GRU_performance=pd.DataFrame({'Train_accuracy':(model2.evaluate(x_train,y_train))[1],"Test_accuracy":(model2.evaluate(x_test,y_test))[1],
                       "Precision":precision_score(y_pred,y_test),"Recall":recall_score(y_pred,y_test),
                       "F1_Score":f1_score(y_pred,y_test)}, index=["GRU"])
GRU_performance

In [105]:
comparison_df = pd.concat([lr_performance, LSVC_performance, rfc_performance, LSTM_performance, GRU_performance])
comparison_df

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout,GRU,Bidirectional
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2

model3 = Sequential()

model3.add(Embedding(input_dim = num_words, output_dim = 32,input_length = maxlen))

model3.add(Bidirectional(LSTM(units = 64,return_sequences = True )))
model3.add(Bidirectional(LSTM(units = 32, return_sequences = True)))
model3.add(Bidirectional(LSTM(units = 16)))
model3.add(Dropout(0.5))
model3.add(Dense(1, activation = 'sigmoid'))

model3.compile(loss = 'binary_crossentropy', optimizer = 'Adam', metrics = ["Accuracy"])
model3.summary()

In [None]:
model3.fit(x_train, y_train, epochs = 20, batch_size = 32,
         validation_data = (x_test, y_test), callbacks = [early_stop])

In [None]:
model3_loss = pd.DataFrame(model3.history.history)
model3_loss.head()

In [None]:
model3_loss.plot()

In [None]:
y_pred = (model3.predict(x_test)>=0.5).astype("int32")

In [None]:
Bidirectional_LSTM_performance=pd.DataFrame({'Train_accuracy':(model3.evaluate(x_train,y_train))[1],"Test_accuracy":(model3.evaluate(x_test,y_test))[1],
                       "Precision":precision_score(y_pred,y_test),"Recall":recall_score(y_pred,y_test),
                       "F1_Score":f1_score(y_pred,y_test)}, index=["Bidirectional LSTM"])
Bidirectional_LSTM_performance

In [None]:
comparison_df = pd.concat([lr_performance, LSVC_performance, rfc_performance, LSTM_performance, GRU_performance, Bidirectional_LSTM_performance])
comparison_df

All 3 DL models we used give quite well performance, very close to the Random Forest performance.  