# Modeling

In [155]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix

In [152]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np
import re

In [165]:
# Read in final dataset
df = pd.read_csv('final_dataset.csv')

In [167]:
df.drop(columns = ['Unnamed: 0', 'imageAltText', 'salePrice', 'valuePrice', 'skuType'], inplace = True)

In [34]:
df.head()

Unnamed: 0,brand_name,category,isLimitedEdition,isNew,isOnlineOnly,isSephoraExclusive,listPrice,skuId,skuType,target_x,1 star,2 stars,3 stars,4 stars,5 stars,avg_rating,review_count_y,total_ratings,compound_score,compound
0,FENTY BEAUTY by Rihanna,foundation-makeup,False,True,True,True,35.0,2268274,Standard,0,0.0,0.0,0.0,5.0,11.0,4.7,16.0,16.0,,0.860733
1,PAT McGRATH LABS,foundation-makeup,False,True,False,True,68.0,2257111,Standard,0,28.0,35.0,57.0,212.0,548.0,4.4,880.0,880.0,,0.767496
2,FENTY BEAUTY by Rihanna,foundation-makeup,False,False,False,True,35.0,2164671,Standard,0,1139.0,1279.0,1324.0,2149.0,8379.0,4.1,14270.0,14270.0,,0.61684
3,MILK MAKEUP,foundation-makeup,False,True,False,True,36.0,2242105,Standard,0,1.0,10.0,23.0,115.0,214.0,4.5,363.0,363.0,,0.775412
4,Estée Lauder,foundation-makeup,False,False,False,False,43.0,2112167,Standard,0,213.0,258.0,299.0,837.0,3758.0,4.4,5365.0,5365.0,0.012698,0.782008


In [83]:
df.isnull().sum()

brand_name               0
category                 0
isLimitedEdition         0
isNew                    0
isOnlineOnly             0
isSephoraExclusive       0
listPrice                0
skuId                    0
skuType                  0
target_x                 0
1 star                 900
2 stars                900
3 stars                900
4 stars                900
5 stars                900
avg_rating             900
review_count_y         900
total_ratings          900
compound_score        1687
compound               925
dtype: int64

## Feature Engineering

In [168]:
# Create dummy features for binary columns
df = pd.get_dummies(df, columns = ['isLimitedEdition', 'isNew', 'isOnlineOnly', 'isSephoraExclusive', 'brand_name', 'category'], drop_first = True)



In [132]:
# Create array values for non binary categorical columns
def create_array(column):
    enc = OneHotEncoder(handle_unknown = 'error', drop = 'first')
    encode = enc.fit_transform(df[[column]]).toarray()
    encode_df = pd.DataFrame(index = df['skuId'], data = encode)
    encode_df[column] = encode_df.apply(lambda x: tuple(x), axis =1).apply(np.array)
    encode_df = encode_df[[column]]
    return encode_df

In [134]:
category = create_array('category')

brand_name = create_array('brand_name')

df = pd.merge(left = df, right = category, how = 'inner', left_on = 'skuId', right_index = True)

df = pd.merge(left = df, right = brand_name, how = 'inner', left_on = 'skuId', right_index = True)

df_dummies = df.drop(columns = ['brand_name_x', 'category_x', 'skuType'])

df_dummies.fillna(0, inplace = True)

In [148]:
# df_dummies.drop(columns = 'skuType', inplace = True)

In [177]:
df.fillna(0, inplace= True)

## Build Model

### Logistic Regression

In [178]:
X = df.drop(columns = 'target_x')

In [179]:
y = df_dummies['target_x']

In [180]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y)

In [208]:
lr = LogisticRegression()
gs_lr = GridSearchCV(estimator = lr, 
             param_grid = {
                 'penalty': ['l1','l2'],
                 'C': [.25, .5, .75, 1],
                 'solver': ['liblinear']},
                    cv = 5)

gs_lr.fit(X_train, y_train)
print(f'Best training score for gs: {gs_lr.best_score_}')
print(f'Best parameters: {gs_lr.best_params_}')
lr = gs_lr.best_estimator_
print(f'Test score: {lr.score(X_test, y_test)}')

Best training score for gs: 0.9577754891864058
Best parameters: {'C': 0.75, 'penalty': 'l1', 'solver': 'liblinear'}
Test score: 0.9567901234567902


In [277]:
lr_weights = pd.DataFrame(index = X.columns, data = lr.coef_.ravel())

In [281]:
lr_weights[lr_weights[0]>0]

Unnamed: 0,0
1 star,0.008565
4 stars,0.00808
isLimitedEdition_True,0.69661
isSephoraExclusive_True,0.894736
brand_name_BECCA,1.403317
brand_name_Black Up,4.465781
brand_name_Ciaté London,0.469001
brand_name_FENTY BEAUTY by Rihanna,1.527408
brand_name_GLAMGLOW,0.825459
brand_name_Kat Von D,0.899968


In [285]:
lr_weights[lr_weights[0]<0]

Unnamed: 0,0
listPrice,-0.004913
skuId,-1e-06
2 stars,-0.009362
3 stars,-0.004352
5 stars,-0.001843
review_count_y,-0.000718
total_ratings,-0.000246
compound,-0.470886
isNew_True,-1.22712
brand_name_CLINIQUE,-0.414393


### Neural Network

In [216]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [219]:
X_train_sc.shape[1]

181

In [305]:
model = Sequential()

model.add(Dense(32, 
                activation='relu', 
                input_shape=(X_train_sc.shape[1],)))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam',
             metrics = ['acc'])

In [306]:
model.fit(X_train_sc, y_train, 
         epochs = 9, 
         batch_size = 128, 
         validation_data = (X_test_sc, y_test))

Train on 1942 samples, validate on 648 samples
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x1a38319978>

## Removing brand name from model

In [321]:
# Read in final dataset
df = pd.read_csv('final_dataset.csv')

In [322]:
df.drop(columns = ['Unnamed: 0', 'imageAltText', 'salePrice', 'valuePrice', 'brand_name', 'skuType'], inplace = True)

In [323]:
# Create dummy features for binary columns
df = pd.get_dummies(df, columns = ['isLimitedEdition', 'isNew', 'isOnlineOnly', 'isSephoraExclusive', 'category'], drop_first = True)



In [324]:
df.fillna(0, inplace= True)

In [325]:
X = df.drop(columns = 'target_x')

In [326]:
y = df_dummies['target_x']

In [327]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y)

In [328]:
lr = LogisticRegression()
gs_lr = GridSearchCV(estimator = lr, 
             param_grid = {
                 'penalty': ['l1','l2'],
                 'C': [.25, .5, .75, 1],
                 'solver': ['liblinear']},
                    cv = 5)

gs_lr.fit(X_train, y_train)
print(f'Best training score for gs: {gs_lr.best_score_}')
print(f'Best parameters: {gs_lr.best_params_}')
lr = gs_lr.best_estimator_
print(f'Test score: {lr.score(X_test, y_test)}')

Best training score for gs: 0.9546858908341915
Best parameters: {'C': 0.25, 'penalty': 'l2', 'solver': 'liblinear'}
Test score: 0.9552469135802469


In [329]:
lr_weights = pd.DataFrame(index = X.columns, data = lr.coef_.ravel())

In [332]:
np.percentile(lr_weights, .99)

-0.0006335871416861624

In [339]:
lr_weights[lr_weights[0]>0.0000009].round(5)

Unnamed: 0,0
1 star,7e-05
isLimitedEdition_True,1e-05
isSephoraExclusive_True,1e-05
category_lip-gloss,0.0
category_lipstick,0.0
category_luminizer-luminous-makeup,0.0


In [340]:
lr_weights[lr_weights[0]<-0.0003].round(5)

Unnamed: 0,0
listPrice,-0.00034
5 stars,-0.0006
review_count_y,-0.00063
total_ratings,-0.00063
