# Modeling

In [192]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

Using TensorFlow backend.


In [3]:
import pandas as pd
import numpy as np
import re

In [110]:
# Read in final dataset
df = pd.read_csv('/Users/myokim/Desktop/general_assembly/Projects/Capstone/datasets_clean/final_dataset.csv')

In [111]:
df.drop(columns = ['Unnamed: 0', 'imageAltText', 'salePrice', 'valuePrice', 'skuType'], inplace = True)

In [112]:
df.head()

Unnamed: 0,brand_name,category,isLimitedEdition,isNew,isOnlineOnly,isSephoraExclusive,listPrice,skuId,target_x,1 star,2 stars,3 stars,4 stars,5 stars,avg_rating,review_count_y,total_ratings,compound_score,compound
0,FENTY BEAUTY by Rihanna,foundation-makeup,False,True,True,True,35.0,2268274,0,0.0,0.0,0.0,5.0,11.0,4.7,16.0,16.0,,0.860733
1,PAT McGRATH LABS,foundation-makeup,False,True,False,True,68.0,2257111,0,28.0,35.0,57.0,212.0,548.0,4.4,880.0,880.0,,0.767496
2,FENTY BEAUTY by Rihanna,foundation-makeup,False,False,False,True,35.0,2164671,0,1139.0,1279.0,1324.0,2149.0,8379.0,4.1,14270.0,14270.0,,0.61684
3,MILK MAKEUP,foundation-makeup,False,True,False,True,36.0,2242105,0,1.0,10.0,23.0,115.0,214.0,4.5,363.0,363.0,,0.775412
4,Estée Lauder,foundation-makeup,False,False,False,False,43.0,2112167,0,213.0,258.0,299.0,837.0,3758.0,4.4,5365.0,5365.0,0.012698,0.782008


In [113]:
df.isnull().sum()

brand_name               0
category                 0
isLimitedEdition         0
isNew                    0
isOnlineOnly             0
isSephoraExclusive       0
listPrice                0
skuId                    0
target_x                 0
1 star                 900
2 stars                900
3 stars                900
4 stars                900
5 stars                900
avg_rating             900
review_count_y         900
total_ratings          900
compound_score        1687
compound               925
dtype: int64

## Feature Engineering

In [114]:
# Create dummy features for binary columns
df = pd.get_dummies(df, columns = ['isLimitedEdition', 'isNew', 'isOnlineOnly', 'isSephoraExclusive', 'brand_name', 'category'], drop_first = True)



In [9]:
# Create array values for non binary categorical columns
def create_array(column):
    enc = OneHotEncoder(handle_unknown = 'error', drop = 'first')
    encode = enc.fit_transform(df[[column]]).toarray()
    encode_df = pd.DataFrame(index = df['skuId'], data = encode)
    encode_df[column] = encode_df.apply(lambda x: tuple(x), axis =1).apply(np.array)
    encode_df = encode_df[[column]]
    return encode_df

In [10]:
# category = create_array('category')

# brand_name = create_array('brand_name')

# df = pd.merge(left = df, right = category, how = 'inner', left_on = 'skuId', right_index = True)

# df = pd.merge(left = df, right = brand_name, how = 'inner', left_on = 'skuId', right_index = True)

# df_dummies = df.drop(columns = ['brand_name_x', 'category_x', 'skuType'])

# df_dummies.fillna(0, inplace = True)

In [11]:
# df_dummies.drop(columns = 'skuType', inplace = True)

In [115]:
df.fillna(0, inplace= True)

## Build Model

### Logistic Regression

In [116]:
X = df.drop(columns = 'target_x')

In [117]:
y = df['target_x']

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y, random_state = 42)

In [128]:
y_train.value_counts(normalize = True)

0    0.954686
1    0.045314
Name: target_x, dtype: float64

In [129]:
y_test.value_counts(normalize = True)

0    0.955247
1    0.044753
Name: target_x, dtype: float64

In [137]:
metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [179]:
lr = LogisticRegression()
gs_lr = GridSearchCV(
            estimator = lr, 
            param_grid = {
                 'penalty': ['l1','l2'],
                 'C': [.25, .5, .75, 1],
                 'solver': ['liblinear']},
            cv = 5,
            scoring = 'precision')

gs_lr.fit(X_train, y_train)
print(f'Best training score for gs: {gs_lr.best_score_}')
print(f'Best parameters: {gs_lr.best_params_}')
lr = gs_lr.best_estimator_
print(f'Test score: {lr.score(X_test, y_test)}')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best training score for gs: 0.3662032269138345
Best parameters: {'C': 0.75, 'penalty': 'l1', 'solver': 'liblinear'}
Test score: 0.9583333333333334


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [180]:
sum(lr.predict(X_test))

8

In [181]:
sum(lr.predict(X_train))

8

In [150]:
lr_weights = pd.DataFrame(index = X.columns, data = lr.coef_.ravel())

In [151]:
lr_weights[lr_weights[0]>0]

Unnamed: 0,0
1 star,0.013473
2 stars,0.00808
4 stars,0.020555
avg_rating,0.057764
isLimitedEdition_True,1.012037
isSephoraExclusive_True,0.818622
brand_name_BECCA,0.773479
brand_name_Black Up,3.539354
brand_name_FENTY BEAUTY by Rihanna,1.066777
brand_name_GLAMGLOW,3.349824


In [152]:
lr_weights[lr_weights[0]<0]

Unnamed: 0,0
listPrice,-0.008502
skuId,-1e-06
3 stars,-0.04701
5 stars,-0.00403
review_count_y,-0.000191
total_ratings,-0.000667
compound,-0.680398
isNew_True,-1.400692
isOnlineOnly_True,-0.18722
brand_name_CLINIQUE,-0.475456


In [182]:
print(metrics.classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       619
           1       0.62      0.17      0.27        29

    accuracy                           0.96       648
   macro avg       0.79      0.58      0.62       648
weighted avg       0.95      0.96      0.95       648



In [183]:
cm = confusion_matrix(y_test, lr.predict(X_test))

cm_df = pd.DataFrame(cm, columns = ['pred full price', 'pred sale'],
            index = ['actual full price', 'actual sale'])

In [184]:
cm_df

Unnamed: 0,pred full price,pred sale
actual full price,616,3
actual sale,24,5


### Neural Network

In [82]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [83]:
X_train_sc.shape[1]

181

In [84]:
model = Sequential()

model.add(Dense(32, 
                activation='relu', 
                input_shape=(X_train_sc.shape[1],)))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam',
             metrics = ['acc'])

In [85]:
model.fit(X_train_sc, y_train, 
         epochs = 7, 
         batch_size = 128, 
         validation_data = (X_test_sc, y_test))

Train on 1942 samples, validate on 648 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x1a344e8748>

In [60]:
X_train.shape

(1942, 44)

In [61]:
y_test.shape

(648,)

In [92]:
sum(model.predict_classes(X_test))

array([0], dtype=int32)

## Removing brand name from model

In [93]:
# Read in final dataset
df_2 = pd.read_csv('/Users/myokim/Desktop/general_assembly/Projects/Capstone/datasets_clean/final_dataset.csv')

In [94]:
df_2.drop(columns = ['Unnamed: 0', 'imageAltText', 'salePrice', 'valuePrice', 'brand_name', 'skuType'], inplace = True)

In [96]:
# Create dummy features for binary columns
df_2 = pd.get_dummies(df_2, columns = ['isLimitedEdition', 'isNew', 'isOnlineOnly', 'isSephoraExclusive', 'category'], drop_first = True)



In [97]:
df_2.fillna(0, inplace= True)

In [98]:
X = df_2.drop(columns = 'target_x')

In [99]:
y = df_2['target_x']

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y)

In [102]:
lr_2 = LogisticRegression()
gs_lr_2 = GridSearchCV(estimator = lr_2, 
             param_grid = {
                 'penalty': ['l1','l2'],
                 'C': [.25, .5, .75, 1],
                 'solver': ['liblinear']},
                    cv = 5)

gs_lr_2.fit(X_train, y_train)
print(f'Best training score for gs: {gs_lr_2.best_score_}')
print(f'Best parameters: {gs_lr_2.best_params_}')
lr_2 = gs_lr_2.best_estimator_
print(f'Test score: {lr_2.score(X_test, y_test)}')

Best training score for gs: 0.9546858908341915
Best parameters: {'C': 0.25, 'penalty': 'l2', 'solver': 'liblinear'}
Test score: 0.9552469135802469


In [103]:
lr_weights_2 = pd.DataFrame(index = X.columns, data = lr.coef_.ravel())

In [104]:
np.percentile(lr_weights_2, .99)

-0.0006398071045870977

In [105]:
lr_weights_2[lr_weights_2[0]>0.0000009].round(5)

Unnamed: 0,0
isLimitedEdition_True,0.0
isSephoraExclusive_True,0.0


In [106]:
lr_weights_2[lr_weights_2[0]<-0.0003].round(5)

Unnamed: 0,0
5 stars,-0.00043
review_count_y,-0.00064
total_ratings,-0.00064


In [107]:
sum(lr_2.predict(X_train))

0

In [108]:
sum(y_train)

88

In [109]:
len(y_train)

1942

## Random Forest

Because the classes are so unbalanced, try out random forest

In [185]:
# Read in final dataset
df = pd.read_csv('/Users/myokim/Desktop/general_assembly/Projects/Capstone/datasets_clean/final_dataset.csv')

In [186]:
df.drop(columns = ['Unnamed: 0', 'imageAltText', 'salePrice', 'valuePrice', 'skuType'], inplace = True)

In [187]:
# Create dummy features for binary columns
df = pd.get_dummies(df, columns = ['isLimitedEdition', 'isNew', 'isOnlineOnly', 'isSephoraExclusive', 'brand_name', 'category'], drop_first = True)



In [188]:
df.fillna(0, inplace= True)

In [189]:
X = df.drop(columns = 'target_x')

In [190]:
y = df['target_x']

In [191]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify = y, random_state = 42)

In [204]:
rf = RandomForestClassifier()

gs_rf = GridSearchCV(
            estimator = rf, 
            param_grid = {
                'n_estimators': [10,50,100],
                'max_depth': [None, 2],
                'min_samples_split': [2, 5, 8],
                'class_weight':['balanced'],
                'max_features': ['auto', 'log2', None]
            },
            cv = 3)

gs_rf.fit(X_train, y_train)
print(f'Best training score for gs: {gs_rf.best_score_}')
print(f'Best parameters: {gs_rf.best_params_}')
rf = gs_lr.best_estimator_
print(f'Test score: {rf.score(X_test, y_test)}')

Best training score for gs: 0.9536560247167868
Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 10}
Test score: 0.9583333333333334


In [205]:
sum(rf.predict(X_test))

8

In [206]:
sum(rf.predict(X_train))

8

In [207]:
metrics.roc_auc_score(y_test, rf.predict(X_test))

0.5837836332237759

In [208]:
metrics.roc_auc_score(y_test, lr.predict(X_test))

0.5837836332237759

In [None]:
metrics.roc_auc_score(y_test, lr.predict(X_test))