# Modeling

In [155]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix

In [152]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np
import re

In [165]:
# Read in final dataset
df = pd.read_csv('final_dataset.csv')

In [166]:
df.columns

Index(['Unnamed: 0', 'brand_name', 'category', 'imageAltText',
       'isLimitedEdition', 'isNew', 'isOnlineOnly', 'isSephoraExclusive',
       'listPrice', 'salePrice', 'skuId', 'skuType', 'valuePrice', 'target_x',
       '1 star', '2 stars', '3 stars', '4 stars', '5 stars', 'avg_rating',
       'review_count_y', 'total_ratings', 'compound_score', 'compound'],
      dtype='object')

In [167]:
df.drop(columns = ['Unnamed: 0', 'imageAltText', 'salePrice', 'valuePrice'], inplace = True)

In [34]:
df.head()

Unnamed: 0,brand_name,category,isLimitedEdition,isNew,isOnlineOnly,isSephoraExclusive,listPrice,skuId,skuType,target_x,1 star,2 stars,3 stars,4 stars,5 stars,avg_rating,review_count_y,total_ratings,compound_score,compound
0,FENTY BEAUTY by Rihanna,foundation-makeup,False,True,True,True,35.0,2268274,Standard,0,0.0,0.0,0.0,5.0,11.0,4.7,16.0,16.0,,0.860733
1,PAT McGRATH LABS,foundation-makeup,False,True,False,True,68.0,2257111,Standard,0,28.0,35.0,57.0,212.0,548.0,4.4,880.0,880.0,,0.767496
2,FENTY BEAUTY by Rihanna,foundation-makeup,False,False,False,True,35.0,2164671,Standard,0,1139.0,1279.0,1324.0,2149.0,8379.0,4.1,14270.0,14270.0,,0.61684
3,MILK MAKEUP,foundation-makeup,False,True,False,True,36.0,2242105,Standard,0,1.0,10.0,23.0,115.0,214.0,4.5,363.0,363.0,,0.775412
4,Estée Lauder,foundation-makeup,False,False,False,False,43.0,2112167,Standard,0,213.0,258.0,299.0,837.0,3758.0,4.4,5365.0,5365.0,0.012698,0.782008


In [83]:
df.isnull().sum()

brand_name               0
category                 0
isLimitedEdition         0
isNew                    0
isOnlineOnly             0
isSephoraExclusive       0
listPrice                0
skuId                    0
skuType                  0
target_x                 0
1 star                 900
2 stars                900
3 stars                900
4 stars                900
5 stars                900
avg_rating             900
review_count_y         900
total_ratings          900
compound_score        1687
compound               925
dtype: int64

## Feature Engineering

In [168]:
# Create dummy features for binary columns
df = pd.get_dummies(df, columns = ['isLimitedEdition', 'isNew', 'isOnlineOnly', 'isSephoraExclusive', 'brand_name', 'category'], drop_first = True)



In [132]:
# Create array values for non binary categorical columns
def create_array(column):
    enc = OneHotEncoder(handle_unknown = 'error', drop = 'first')
    encode = enc.fit_transform(df[[column]]).toarray()
    encode_df = pd.DataFrame(index = df['skuId'], data = encode)
    encode_df[column] = encode_df.apply(lambda x: tuple(x), axis =1).apply(np.array)
    encode_df = encode_df[[column]]
    return encode_df

In [134]:
category = create_array('category')

brand_name = create_array('brand_name')

df = pd.merge(left = df, right = category, how = 'inner', left_on = 'skuId', right_index = True)

df = pd.merge(left = df, right = brand_name, how = 'inner', left_on = 'skuId', right_index = True)

df_dummies = df.drop(columns = ['brand_name_x', 'category_x', 'skuType'])

df_dummies.fillna(0, inplace = True)

In [148]:
# df_dummies.drop(columns = 'skuType', inplace = True)

In [170]:
df.drop(columns = 'skuType', inplace = True)

In [177]:
df.fillna(0, inplace= True)

## Build Model

### Logistic Regression

In [178]:
X = df.drop(columns = 'target_x')

In [179]:
y = df_dummies['target_x']

In [180]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y)

In [208]:
lr = LogisticRegression()
gs_lr = GridSearchCV(estimator = lr, 
             param_grid = {
                 'penalty': ['l1','l2'],
                 'C': [.25, .5, .75, 1],
                 'solver': ['liblinear']},
                    cv = 5)

gs_lr.fit(X_train, y_train)
print(f'Best training score for gs: {gs_lr.best_score_}')
print(f'Best parameters: {gs_lr.best_params_}')
lr = gs_lr.best_estimator_
print(f'Test score: {lr.score(X_test, y_test)}')

Best training score for gs: 0.9577754891864058
Best parameters: {'C': 0.75, 'penalty': 'l1', 'solver': 'liblinear'}
Test score: 0.9567901234567902


### Neural Network

In [216]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [219]:
X_train_sc.shape[1]

181

In [230]:
model = Sequential()

model.add(Dense(32, 
                activation='relu', 
                input_shape=(X_train_sc.shape[1],)))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam',
             metrics = ['acc'])

In [232]:
model.fit(X_train_sc, y_train, 
         epochs = 10, 
         batch_size = 128, 
         validation_data = (X_test_sc, y_test))

Train on 1942 samples, validate on 648 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a33aeba58>