In [95]:
import tensorflow as tf
keras = tf.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
# from tensorflow.keras.embeddings import Embedding
# from tensorflow.keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout

from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
import pandas as pd
import seaborn as sns

from src.LSTM_cleaning import *

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

%matplotlib inline

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
df = pd.read_csv('sephora_review_db.csv.zip')

In [13]:
model_df = df[['review_text', 'skin_type']].copy()
model_df.head()

Unnamed: 0,review_text,skin_type
0,Really good foundation I like it a lot but som...,oily
1,"As someone who is very VERY fair, I have alway...",combination
2,First purchase of a foundation of any kind. Wa...,combination
3,I absolutely love this foundation! I get compl...,combination
4,Love this foundation! I was using Too Faced Bo...,combination


In [14]:
model_df['review_text'] = model_df['review_text'].apply(format_strings)
model_df.head()

Unnamed: 0,review_text,skin_type
0,really good foundation like lot sometimes grab...,oily
1,"someone fair, always hard time finding foundat...",combination
2,first purchase foundation kind. willing give t...,combination
3,absolutely love foundation! get compliments da...,combination
4,love foundation! using faced born way bought b...,combination


In [15]:
model_df.shape

(276072, 2)

In [16]:
missing_zero_values_table(model_df)

Your selected dataframe has 2 columns and 276072 Rows.
There are 1 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
skin_type,0,141630,51.3,141630,51.3,object


It looks like half of the skin types are missing, these rows will be dropped as an NaN skin type does not have much meaning in this context.

In [17]:
model_df.dropna(inplace=True, axis=0)

In [19]:
missing_zero_values_table(model_df)

Your selected dataframe has 2 columns and 134442 Rows.
There are 0 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type


In [25]:
model_df['skin_type'].value_counts()

combination    70782
oily           25003
dry            21082
normal         17575
Name: skin_type, dtype: int64

## LSTM Model with Undersampling

In [166]:
# max number of words to be used
max_nb_words = 20000
# max number of words in each review
max_seq_length = 250
embedding_dim = 100

tokenizer = Tokenizer(num_words=max_nb_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(model_df['review_text'].values)
word_index = tokenizer.word_index
print('Found {} unique tokens'.format(len(word_index)))

Found 33091 unique tokens


In [167]:
X = tokenizer.texts_to_sequences(model_df['review_text'].values)
# X = pad_sequences(X)
X = pad_sequences(X, maxlen=max_seq_length)
print('Shape of data tensor: ', X.shape)

Shape of data tensor:  (134442, 250)


In [168]:
bow = list(word_index.keys())
bow_250 = list(word_index.keys())[:250]

In [169]:
y = pd.get_dummies(model_df['skin_type']).values
print('Shape of label tensor: ', y.shape)

Shape of label tensor:  (134442, 4)


In [170]:
# split data into training and testing set
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.25, random_state=42)

In [171]:
print('Combination: ', X_train1[y_train1[:, 0] == 1].shape)
print('Oily: ', X_train1[y_train1[:, 1] == 1].shape)
print('Dry: ', X_train1[y_train1[:, 2] == 1].shape)
print('Normal: ', X_train1[y_train1[:, 3] == 1].shape)

Combination:  (53128, 250)
Oily:  (15783, 250)
Dry:  (13165, 250)
Normal:  (18755, 250)


In [172]:
X_train1.shape

(100831, 250)

In [173]:
y_train1.shape

(100831, 4)

In [174]:
df_train1 = pd.DataFrame(X_train1, columns=bow_250)
df_train2 = pd.DataFrame(y_train1, columns=['Combination', 'Oily', 'Dry', 'Normal'])
df_train_final = pd.concat([df_train1, df_train2], axis=1)
df_train_final.head()

Unnamed: 0,foundation,skin,coverage,like,love,product,it,day,oily,face,...,everyday,orange,prone,undertones,problem,review,Combination,Oily,Dry,Normal
0,0,0,0,0,0,0,0,0,0,0,...,7,258,394,7,36,12887,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,38,329,60,57,209,375,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1279,225,58,5,5,5,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,28,5,34,156,13259,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,52,62,80,154,4314,209,1,0,0,0


In [175]:
# separate majority and minority class
majority_class = df_train_final[df_train_final['Combination'] == 1]
minority_class = df_train_final[df_train_final['Normal'] == 1]

In [176]:
# downsample majority class
majority_downsampled = resample(majority_class, 
                                replace=False,
                               n_samples=18755,
                               random_state=123)
majority_downsampled.shape

(18755, 254)

In [178]:
majority_downsampled.head()

Unnamed: 0,foundation,skin,coverage,like,love,product,it,day,oily,face,...,everyday,orange,prone,undertones,problem,review,Combination,Oily,Dry,Normal
25069,0,0,0,0,0,0,0,0,0,0,...,6,135,222,1061,714,250,1,0,0,0
78446,0,0,0,0,0,0,0,0,0,0,...,33,62,54,150,38,64,1,0,0,0
68796,0,0,0,0,0,0,0,0,0,0,...,532,277,131,133,21,9,1,0,0,0
35690,0,0,0,0,0,0,0,0,0,0,...,511,176,484,950,630,160,1,0,0,0
51975,0,0,0,0,0,0,0,0,0,0,...,27,6854,2998,2028,468,25,1,0,0,0


In [180]:
# combine downsample majority class with remaining training set
X_train_ds = pd.concat([majority_downsampled,
                       df_train_final[df_train_final['Oily'] == 1]],
                      axis=0)
X_train_ds2 = pd.concat([X_train_ds,
                       df_train_final[df_train_final['Dry'] == 1]],
                      axis=0)
X_train_ds_final = pd.concat([X_train_ds2,
                       df_train_final[df_train_final['Normal'] == 1]],
                      axis=0)
X_train_ds_final.shape

(66458, 254)

In [191]:
# redefine X and y for 2nd iteration of training and testing set split
X = X_train_ds_final.iloc[:, :-4].values
y = X_train_ds_final.iloc[:, -4:].values
print(X.shape)
print(y.shape)

(66458, 250)
(66458, 4)


In [192]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.25, random_state=42)

In [194]:
# LSTM model
model = Sequential()
model.add(Embedding(max_nb_words, embedding_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 100)          2000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 404       
Total params: 2,080,804
Trainable params: 2,080,804
Non-trainable params: 0
_________________________________________________________________
None


In [96]:
# separate majority and minority class
majority_class = X_train1[y_train1[:, 0] == 1]
minority_class = X_train1[y_train1[:, 3] == 1]

In [99]:
# downsample majority class
majority_downsampled = resample(majority_class, 
                                replace=False,
                               n_samples=18755,
                               random_state=123)
majority_downsampled.shape

(18755, 250)

In [105]:
# combine downsample majority class with remaining training set
X_train_ds = np.append(majority_downsampled,
                       X_train1[y_train1[:, 1] == 1],
                      axis=0)
X_train_ds2 = np.append(X_train_ds,
                       X_train1[y_train1[:, 2] == 1],
                      axis=0)
X_train_ds_final = np.append(X_train_ds2,
                       X_train1[y_train1[:, 3] == 1],
                      axis=0)
X_train_ds_final.shape

(66458, 250)

In [106]:
# check total number of rows tie
18755+15783+13165+18755

66458

In [108]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train_ds_final, y, test_size=0.25, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [66458, 134442]

In [89]:
minority_class_len = X_train1[y_train1[:, 3] == 1].shape[0]
minority_class_indices = np.argwhere(y_train1[:, 3] == 1)
minority_class_indices_reshape = minority_class_indices.reshape(1,-1)

majority_class_indices = np.argwhere(y_train1[:, 0] == 1)
majority_class_indices_reshape = majority_class_indices.reshape(1,-1)

# randomly select majority class indices 
random_majority_indices = np.random.choice(majority_class_indices_reshape[0],
                                       minority_class_len,
                                       replace=False)
minority_class_indices_reshape

array([[    12,     13,     14, ..., 100825, 100828, 100830]])

In [90]:
under_sample_indices = np.concatenate([minority_class_indices_reshape[0], random_majority_indices])

In [94]:
majority_class_rus = X_train1[under_sample_indices]
majority_class_rus.

(37510, 250)