In [2]:
import tensorflow as tf
keras = tf.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
# from tensorflow.keras.embeddings import Embedding
# from tensorflow.keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout

from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
import pandas as pd
import seaborn as sns

from src.LSTM_cleaning import *

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

%matplotlib inline

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('sephora_review_db.csv.zip')

In [5]:
model_df = df[['review_text', 'skin_type']].copy()
model_df.head()

Unnamed: 0,review_text,skin_type
0,Really good foundation I like it a lot but som...,oily
1,"As someone who is very VERY fair, I have alway...",combination
2,First purchase of a foundation of any kind. Wa...,combination
3,I absolutely love this foundation! I get compl...,combination
4,Love this foundation! I was using Too Faced Bo...,combination


In [6]:
model_df['review_text'] = model_df['review_text'].apply(format_strings)
model_df.head()

Unnamed: 0,review_text,skin_type
0,really good foundation like lot sometimes grab...,oily
1,"someone fair, always hard time finding foundat...",combination
2,first purchase foundation kind. willing give t...,combination
3,absolutely love foundation! get compliments da...,combination
4,love foundation! using faced born way bought b...,combination


In [7]:
model_df.shape

(276072, 2)

In [8]:
missing_zero_values_table(model_df)

Your selected dataframe has 2 columns and 276072 Rows.
There are 1 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
skin_type,0,141630,51.3,141630,51.3,object


It looks like half of the skin types are missing, these rows will be dropped as an NaN skin type does not have much meaning in this context.

In [9]:
model_df.dropna(inplace=True, axis=0)

In [10]:
missing_zero_values_table(model_df)

Your selected dataframe has 2 columns and 134442 Rows.
There are 0 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type


In [18]:
# find max length of review
max_review_length = max([len(review.split()) for review in model_df['review_text']])
max_review_length
# len(model_df['review_text'][0].split())

481

In [12]:
model_df['skin_type'].value_counts()

combination    70782
oily           25003
dry            21082
normal         17575
Name: skin_type, dtype: int64

In [37]:
# 70782+25003+21082+17575

## LSTM Model with Undersampling

In [19]:
# max number of words to be used
max_nb_words = 20000
# max number of words in each review, longest review is 481 sentences
max_seq_length = 481
embedding_dim = 100

tokenizer = Tokenizer(num_words=max_nb_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(model_df['review_text'].values)
word_index = tokenizer.word_index
print('Found {} unique tokens'.format(len(word_index)))

Found 33091 unique tokens


In [35]:
X = tokenizer.texts_to_sequences(model_df['review_text'].values)
# pad sequences vectorizes and creates a uniform length of sentences
X = pad_sequences(X, maxlen=max_seq_length)
print('Shape of data tensor: ', X.shape)

Shape of data tensor:  (134442, 481)


In [38]:
# create bag of words list to use for columns in dataframe
bow = list(word_index.keys())
# bow_250 = list(word_index.keys())[:250]
len(bow)

33091

In [40]:
y = pd.get_dummies(model_df['skin_type']).values
print('Shape of label tensor: ', y.shape)

Shape of label tensor:  (134442, 4)


In [41]:
# split data into training and testing set
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.25, random_state=42)

In [48]:
print('Combination: ', X_train1[y_train1[:, 0] == 1].shape)
print('Oily: ', X_train1[y_train1[:, 1] == 1].shape)
print('Dry: ', X_train1[y_train1[:, 2] == 1].shape)
print('Normal: ', X_train1[y_train1[:, 3] == 1].shape)

Combination:  (53128, 481)
Oily:  (15783, 481)
Dry:  (13165, 481)
Normal:  (18755, 481)


In [49]:
X_train1.shape

(100831, 481)

In [50]:
y_train1.shape

(100831, 4)

In [60]:
column_names = ['word ' + str(num) for num in range(1, 482)]

In [62]:
df_train1 = pd.DataFrame(X_train1, columns=column_names)
df_train2 = pd.DataFrame(y_train1, columns=['Combination', 'Oily', 'Dry', 'Normal'])
df_train_final = pd.concat([df_train1, df_train2], axis=1)
df_train_final.head()

Unnamed: 0,word 1,word 2,word 3,word 4,word 5,word 6,word 7,word 8,word 9,word 10,...,word 476,word 477,word 478,word 479,word 480,word 481,Combination,Oily,Dry,Normal
0,0,0,0,0,0,0,0,0,0,0,...,7,258,394,7,36,12887,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,38,329,60,57,209,375,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1279,225,58,5,5,5,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,28,5,34,156,13259,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,52,62,80,154,4314,209,1,0,0,0


In [63]:
# df_train1 = pd.DataFrame(X_train1, columns=bow_250)
# df_train2 = pd.DataFrame(y_train1, columns=['Combination', 'Oily', 'Dry', 'Normal'])
# df_train_final = pd.concat([df_train1, df_train2], axis=1)
# df_train_final.head()

In [64]:
# separate majority and minority class
majority_class = df_train_final[df_train_final['Combination'] == 1]
minority_class = df_train_final[df_train_final['Normal'] == 1]

In [65]:
# downsample majority class
majority_downsampled = resample(majority_class, 
                                replace=False,
                               n_samples=18755,
                               random_state=123)
majority_downsampled.shape

(18755, 485)

In [66]:
majority_downsampled.head()

Unnamed: 0,word 1,word 2,word 3,word 4,word 5,word 6,word 7,word 8,word 9,word 10,...,word 476,word 477,word 478,word 479,word 480,word 481,Combination,Oily,Dry,Normal
25069,0,0,0,0,0,0,0,0,0,0,...,6,135,222,1061,714,250,1,0,0,0
78446,0,0,0,0,0,0,0,0,0,0,...,33,62,54,150,38,64,1,0,0,0
68796,0,0,0,0,0,0,0,0,0,0,...,532,277,131,133,21,9,1,0,0,0
35690,0,0,0,0,0,0,0,0,0,0,...,511,176,484,950,630,160,1,0,0,0
51975,0,0,0,0,0,0,0,0,0,0,...,27,6854,2998,2028,468,25,1,0,0,0


In [67]:
# combine downsample majority class with remaining training set
X_train_ds = pd.concat([majority_downsampled,
                       df_train_final[df_train_final['Oily'] == 1]],
                      axis=0)
X_train_ds2 = pd.concat([X_train_ds,
                       df_train_final[df_train_final['Dry'] == 1]],
                      axis=0)
X_train_ds_final = pd.concat([X_train_ds2,
                       df_train_final[df_train_final['Normal'] == 1]],
                      axis=0)
X_train_ds_final.shape

(66458, 485)

In [68]:
# redefine X and y for 2nd iteration of training and testing set split
X = X_train_ds_final.iloc[:, :-4].values
y = X_train_ds_final.iloc[:, -4:].values
print(X.shape)
print(y.shape)

(66458, 481)
(66458, 4)


In [69]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.25, random_state=42)

In [70]:
# LSTM model
model = Sequential()
# embedding layer where each word is represented by a vector
model.add(Embedding(max_nb_words, embedding_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 481, 100)          2000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 481, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 4)                 404       
Total params: 2,080,804
Trainable params: 2,080,804
Non-trainable params: 0
_________________________________________________________________
None


In [72]:
#fit model
model.fit(X_train2, y_train2, batch_size=32, epochs=5)

Train on 49843 samples
Epoch 1/5
 7040/49843 [===>..........................] - ETA: 13:35 - loss: 1.2141 - accuracy: 0.4395

KeyboardInterrupt: 

In [200]:
y_pred = model.predict(X_test2)

In [204]:
acc = model.evaluate(X_test2, y_test2)
acc



[1.2278189338620369, 0.44417694]

**predicting on holdout test set**

In [205]:
y_pred = model.predict(X_test1)

In [207]:
acc = model.evaluate(X_test1, y_test1)
acc



[1.247377719400625, 0.43554193]