# Predicting the rating of a dress from online reviews

In [30]:
import seaborn as sns
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Data set

In [2]:
df = pd.read_csv("clothingreviews.csv")
df.head(30) 

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
6,6,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits
7,7,858,39,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...",4,1,4,General Petite,Tops,Knits
8,8,1077,24,Flattering,I love this dress. i usually get an xs but it ...,5,1,0,General,Dresses,Dresses
9,9,1077,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,1,0,General,Dresses,Dresses


To make sure I have only dresses in my dataframe I'm going to subset the dataframe.

In [7]:
df_subset = df[df["Department Name"] == "Dresses"]
df_subset.head(30)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
8,8,1077,24,Flattering,I love this dress. i usually get an xs but it ...,5,1,0,General,Dresses,Dresses
9,9,1077,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,1,0,General,Dresses,Dresses
10,10,1077,53,Dress looks like it's made of cheap material,Dress runs small esp where the zipper area run...,3,0,14,General,Dresses,Dresses
11,11,1095,39,,This dress is perfection! so pretty and flatte...,5,1,2,General Petite,Dresses,Dresses
12,12,1095,53,Perfect!!!,More and more i find myself reliant on the rev...,5,1,2,General Petite,Dresses,Dresses
14,14,1077,50,Pretty party dress with some issues,This is a nice choice for holiday gatherings. ...,3,1,1,General,Dresses,Dresses
19,19,1077,47,Stylish and comfortable,I love the look and feel of this tulle dress. ...,5,1,0,General,Dresses,Dresses


## Text pre-processing steps resulting in a document-feature matrix

In [9]:
text = df["Review Text"].values.astype("U") #Taking the text from the df. We need to convert it to Unicode
vect = CountVectorizer(stop_words= "english") #Create the CV object, with English stop words
vect = vect.fit(text) #We fit the model with the words from the review text
feature_names = vect.get_feature_names() #Get the words from the vocabulary

In [10]:
docu_feat = vect.transform(text) # make a matrix

In [11]:
print(docu_feat[0:500,0:500])

  (5, 10)	1
  (9, 98)	1
  (9, 354)	1
  (13, 289)	1
  (13, 297)	1
  (13, 352)	1
  (13, 371)	1
  (16, 16)	1
  (18, 133)	1
  (20, 122)	1
  (25, 310)	1
  (25, 377)	1
  (31, 150)	1
  (33, 398)	2
  (37, 118)	1
  (37, 133)	1
  (37, 310)	1
  (37, 474)	1
  (43, 98)	1
  (51, 146)	1
  (55, 377)	1
  (70, 424)	1
  (70, 496)	1
  (72, 355)	1
  (75, 49)	1
  :	:
  (422, 73)	1
  (422, 348)	1
  (423, 63)	1
  (423, 277)	1
  (423, 316)	1
  (428, 49)	1
  (429, 15)	1
  (433, 15)	1
  (434, 28)	1
  (434, 353)	1
  (436, 59)	1
  (438, 63)	1
  (440, 114)	1
  (446, 360)	1
  (450, 15)	1
  (457, 103)	1
  (461, 371)	1
  (461, 496)	1
  (462, 378)	1
  (475, 150)	1
  (475, 354)	1
  (477, 95)	1
  (481, 63)	1
  (491, 118)	1
  (496, 15)	2


## Building the model

In [13]:
X = docu_feat #the document-feature matrix is the X matrix
y = df["Rating"] #creating the y vector
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) #split the data and store it

In [17]:
nb = MultinomialNB() #create the model
nb = nb.fit(X_train, y_train) #fit the model X=features, y=character

## Model evaluation

In [18]:
nb_score = nb.score(X_test, y_test)
print(f"The accuracy for Naive Bayes: {nb_score}")

The accuracy for Naive Bayes: 0.6234743116661936


To make sure the accuracy of the model is better than if it would be random. I want to calculate the value counts of the rating.

In [21]:
df_subset["Rating"].value_counts() #Let's have a look at the 'genre' variable

5    3397
4    1395
3     838
2     461
1     228
Name: Rating, dtype: int64

In [24]:
228+461+838+1395+3397

6319

In [25]:
1395+3397

4792

In [27]:
(4792/9319)

0.5142182637622063

The accuracy of the Naive Bayes is better than if it would be random. 
NB is 0.62 whereas random is 0.51

I am a little bit confused how to do the test cases. It would be nice to have it explained one more time. 