In [126]:
import pandas as pd

# Read the reviews CSV (the path is resolved relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Cleaned_Womens_Clothing_Reviews.csv")
# Preview the first five rows as a quick sanity check of the load.
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
1,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
2,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
3,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
4,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses


In [128]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [130]:
# Summarise each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23332 entries, 0 to 23331
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23332 non-null  int64 
 1   Clothing ID              23332 non-null  int64 
 2   Age                      23332 non-null  int64 
 3   Title                    19556 non-null  object
 4   Review Text              22494 non-null  object
 5   Rating                   23332 non-null  int64 
 6   Recommended IND          23332 non-null  int64 
 7   Positive Feedback Count  23332 non-null  int64 
 8   Division Name            23318 non-null  object
 9   Department Name          23318 non-null  object
 10  Class Name               23318 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [132]:
# Count the missing values in every column (isna is the canonical name of isnull).
df.isna().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3776
Review Text                 838
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [134]:
#dropping the features that are not required
# One call removes all four columns from df in place.
df.drop(
    columns=['Unnamed: 0', 'Clothing ID', 'Title', 'Division Name'],
    inplace=True,
)

In [136]:
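#object-dtype (text/categorical) columns in the raw file=['Title','Review Text','Division Name','Department Name','Class Name']
#numerical (int64) columns in the raw file=['Unnamed: 0','Clothing ID','Age','Rating','Recommended IND','Positive Feedback Count']
#note: 'Unnamed: 0','Clothing ID','Title' and 'Division Name' have already been dropped from df in the previous cell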

In [138]:
#handling missing data in categorical columns
#there are no missing data in numerical columns
from sklearn.impute import SimpleImputer
# Every remaining object-dtype column is treated as categorical here.
cat_cols=df.select_dtypes(include=['object']).columns
# Fill each missing entry with the most frequent value of its own column.
# NOTE(review): 'Review Text' is object-dtype as well, so its missing reviews
# are replaced by the single most common review string - confirm this is intended.
handled_cat=SimpleImputer(strategy='most_frequent')
# Fit and apply the imputer on the whole frame, overwriting the columns in df.
df[cat_cols]=handled_cat.fit_transform(df[cat_cols])

In [140]:
# Re-check the per-column missing counts after imputation.
df.isnull().sum()

Age                        0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Department Name            0
Class Name                 0
dtype: int64

In [142]:
#converting all categorical data into numerical data
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, kept in a dict keyed by column name
# so the fitted encoders stay available after this cell.
cat_to_num = {col: LabelEncoder() for col in cat_cols}

# Replace each column's values with the integer codes learned by its encoder.
for col, encoder in cat_to_num.items():
    df[col] = encoder.fit_transform(df[col])

In [234]:
# Preview the first three rows of the encoded frame.
# NOTE(review): the saved output lacks 'Age' and 'Review Text', and this cell's
# execution count (234) is higher than the later drop cells - it appears to have
# been run out of order; re-run the notebook top to bottom to refresh it.
df.head(3)

Unnamed: 0,Rating,Recommended IND,Positive Feedback Count,Department Name,Class Name
0,5,1,4,1,3
1,3,0,0,1,3
2,5,1,0,0,12


In [146]:
#checking if the target feature is balanced
# value_counts reports how many rows fall into each 'Recommended IND' class.
df['Recommended IND'].value_counts()


Recommended IND
1    19182
0     4150
Name: count, dtype: int64

In [148]:
#applying LOGISTIC REGRESSION because we are performing classification task where the target feature has two possible outcomes
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [212]:
# 'Age' is not one of the model inputs listed in feature_cols, so it is removed from df.
# NOTE: this is an in-place drop - re-running the cell on the same kernel raises KeyError.
df.drop(columns=['Age'],inplace=True)

In [190]:
# 'Review Text' is not one of the model inputs listed in feature_cols, so it is removed from df.
# NOTE: this is an in-place drop - re-running the cell on the same kernel raises KeyError.
df.drop(columns=['Review Text'],inplace=True)

In [216]:
# Columns fed to the model; the order defined here is the order the model is trained on.
feature_cols = [
    'Rating',
    'Positive Feedback Count',
    'Department Name',
    'Class Name',
]
# Input features.
x = df[feature_cols]
# Binary target column.
y = df['Recommended IND']

In [218]:
# Hold out 25% of the rows for testing. stratify=y keeps the class proportions
# of the target in both splits, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
    stratify=y,
)

In [220]:
# Report the shape of every split produced by train_test_split.
for label, part in (
    ("X_train", x_train),
    ("Y_train", y_train),
    ("X_test", x_test),
    ("Y_test", y_test),
):
    print(label + ":", part.shape)

X_train: (17499, 4)
Y_train: (17499,)
X_test: (5833, 4)
Y_test: (5833,)


In [222]:
# class_weight='balanced' reweights the two classes to offset the imbalance in
# the target; max_iter is raised to give the solver more iterations to converge.
log_reg = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    penalty='l2',
    class_weight='balanced',
)
# Train on the training split only.
log_reg.fit(x_train, y_train)

In [224]:
# Mean accuracy of the fitted classifier on the held-out test split.
log_reg.score(x_test,y_test)

0.9273101320075433

In [226]:
# 5-fold cross-validation to get a more stable estimate of accuracy.
# Cross-validation only measures performance: it does not improve the model or
# prevent overfitting by itself. cross_val_score refits clones of log_reg on
# folds of the full x/y, so the estimator fitted above is left unchanged.
from sklearn.model_selection import cross_val_score
cv_scores=cross_val_score(log_reg,x,y,cv=5)
# Mean score across the folds, followed by the individual fold scores.
print("Cross-validation Accuracy:",np.mean(cv_scores))
print(cv_scores)

Cross-validation Accuracy: 0.9364391582708883
[0.93679023 0.93764731 0.93656237 0.93484784 0.93634805]


In [186]:
# Predicted class labels for every row of the test split.
y_pred=log_reg.predict(x_test)
y_pred

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [228]:
#new prediction
# The model was fitted on a DataFrame, so the new sample is passed as a one-row
# DataFrame carrying the same column names. This ties each value to its feature
# explicitly and avoids scikit-learn's "X does not have valid feature names" warning.
# Values follow the order of feature_cols; the two category columns take the
# integer codes produced by the label encoders.
new_sample = pd.DataFrame([[2, 7, 4, 12]], columns=feature_cols)
new_pred = log_reg.predict(new_sample)
new_pred



array([0], dtype=int64)

In [230]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns are the predicted classes. The arguments were previously swapped,
# which transposed the matrix (false positives and false negatives exchanged).
con_mtx=metrics.confusion_matrix(y_test,y_pred)
con_mtx

array([[ 988,   50],
       [ 374, 4421]], dtype=int64)

In [232]:
#evaluation metrics
# Each scorer is called as scorer(y_true, y_pred); precision and recall are
# reported for the positive class (label 1), which is the scorers' default.
for metric_name, metric_fn in (
    ("Accuracy", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
):
    print(metric_name + ":", metric_fn(y_test, y_pred))

Accuracy: 0.9273101320075433
Precision: 0.9888168195034668
Recall: 0.9220020855057351
