[View in Colaboratory](https://colab.research.google.com/github/peter0083/van_ai_coding_challenge_3/blob/master/src/model.ipynb)

In [1]:
import sys
sys.version
sys.version_info

sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)

In [2]:
# important dependencies
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import stats
%matplotlib inline

In [3]:
# read csv
train_df = pd.read_csv('../data/train.csv')

In [4]:
train_df.shape

(376304, 13)

### 1. Preprocessing

In [5]:
train_df.head(10)

Unnamed: 0.1,Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,161273,1000965,P00080342,M,55+,7,B,0,1,1,6.0,8.0,19134
1,161274,1000966,P00275642,M,55+,2,C,4+,0,8,,,10027
2,161275,1000966,P00248042,M,55+,2,C,4+,0,8,,,9986
3,161276,1000966,P0099642,M,55+,2,C,4+,0,8,,,3903
4,161277,1000966,P00011742,M,55+,2,C,4+,0,8,,,7996
5,161278,1000966,P00115442,M,55+,2,C,4+,0,8,,,9831
6,161279,1000967,P00219742,F,18-25,0,B,3,0,6,11.0,16.0,8598
7,161280,1000967,P00258942,F,18-25,0,B,3,0,5,11.0,,5239
8,161281,1000967,P00161942,F,18-25,0,B,3,0,5,8.0,,7162
9,161282,1000967,P00247042,F,18-25,0,B,3,0,5,14.0,,7108


In [6]:
# convert the discrete columns into dummy variables

dummy_cols = ['Product_ID', 'Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']
train_df_with_dummies = pd.get_dummies(train_df, columns= dummy_cols)

In [7]:
# inspect the train df with dummies

train_df_with_dummies.head(10)

Unnamed: 0.1,Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Product_ID_P00000142,Product_ID_P00000242,...,Age_51-55,Age_55+,City_Category_A,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,161273,1000965,7,1,1,6.0,8.0,19134,0,0,...,0,1,0,1,0,1,0,0,0,0
1,161274,1000966,2,0,8,,,10027,0,0,...,0,1,0,0,1,0,0,0,0,1
2,161275,1000966,2,0,8,,,9986,0,0,...,0,1,0,0,1,0,0,0,0,1
3,161276,1000966,2,0,8,,,3903,0,0,...,0,1,0,0,1,0,0,0,0,1
4,161277,1000966,2,0,8,,,7996,0,0,...,0,1,0,0,1,0,0,0,0,1
5,161278,1000966,2,0,8,,,9831,0,0,...,0,1,0,0,1,0,0,0,0,1
6,161279,1000967,0,0,6,11.0,16.0,8598,0,0,...,0,0,0,1,0,0,0,0,1,0
7,161280,1000967,0,0,5,11.0,,5239,0,0,...,0,0,0,1,0,0,0,0,1,0
8,161281,1000967,0,0,5,8.0,,7162,0,0,...,0,0,0,1,0,0,0,0,1,0
9,161282,1000967,0,0,5,14.0,,7108,0,0,...,0,0,0,1,0,0,0,0,1,0


Now the dimensions still have a good ratio. I am not running into the curse of dimensionality problem here because my observations are greater than 10 times the number of features. However, I should be careful not to overfit.

**Create X and Y arrays for actual training**

In [8]:
# drop NaN in the dataframe
train_df_with_dummies_noNaN = train_df_with_dummies.dropna()

In [9]:
# create label

y = train_df_with_dummies_noNaN['Purchase'].values
y.shape

(114928,)

In [10]:
# create features

X = train_df_with_dummies_noNaN.drop(['Purchase'], axis=1).values
X.shape

(114928, 3594)

In [11]:
# create train-validation split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

In [12]:
X_train.shape

(91942, 3594)

In [13]:
X_val.shape

(22986, 3594)

### 2. Baseline Linear Regression

I'm not normalizing the features here yet because I am not doing feature selection.

In [14]:

baseline_lm = LinearRegression(fit_intercept=True, normalize=False)
baseline_lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

**Regression metrics**

Reference: https://www.quora.com/What-is-the-difference-between-squared-error-and-absolute-error Reference: https://www.ritchieng.com/machine-learning-evaluate-linear-regression-model/

I will use Root Mean Squared Error RMSE for its interpretability.

In [15]:
y_pred_baseline_lm = baseline_lm.predict(X_val)

print("RMSE on validation set -- baseline model:", math.sqrt(mean_squared_error(y_val, y_pred_baseline_lm)))

RMSE on validation set -- baseline model: 3220.2142413422703


This RMSE is a very large number. We have a lot of room for improvement.

### 3. Elastic Net Cross-validation

I will now use Elastic Net to perform feature selection.

First, find an optimal alpha value for the elastic net model.

In [16]:
elastic_net_cv_model = ElasticNetCV(cv=10, random_state=1)
elastic_net_cv_model.fit(X_train, y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
       l1_ratio=0.5, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto', random_state=1,
       selection='cyclic', tol=0.0001, verbose=0)

In [17]:
relevant_alpha = elastic_net_cv_model.alpha_
print("The relevant alpha from cross validation is", elastic_net_cv_model.alpha_)

The relevant alpha from cross validation is 8714.940093630381


In [18]:
elastic_net_model_all_feat = ElasticNet(alpha = relevant_alpha, l1_ratio=0.5)
elastic_net_model_all_feat.fit(X_train, y_train)

ElasticNet(alpha=8714.940093630381, copy_X=True, fit_intercept=True,
      l1_ratio=0.5, max_iter=1000, normalize=False, positive=False,
      precompute=False, random_state=None, selection='cyclic', tol=0.0001,
      warm_start=False)

In [19]:
y_pred_eNet_all_feat = elastic_net_model_all_feat.predict(X_val)

print("RMSE on validation set -- elastic net model with ALL features:", 
      (math.sqrt(mean_squared_error(y_val, y_pred_eNet_all_feat))))

RMSE on validation set -- elastic net model with ALL features: 5051.876669180532


In [20]:
(math.sqrt(mean_squared_error(y_val, y_pred_eNet_all_feat)))/math.sqrt(mean_squared_error(y_val, y_pred_baseline_lm)) * 100

print("It looks promising! I have just reduced the RMSE from the baseline", (math.sqrt(mean_squared_error(y_val, y_pred_eNet_all_feat))), "to", (math.sqrt(mean_squared_error(y_val, y_pred_baseline_lm))))

It looks promising! I have just reduced the RMSE from the baseline 5051.876669180532 to 3220.2142413422703


How many features did Elastic Net pick?

In [21]:
# reference: https://www.kaggle.com/cast42/feature-selection-and-elastic-net
coef_df_all_feat = pd.Series(elastic_net_model_all_feat.coef_, index = train_df_with_dummies_noNaN.drop(['Purchase'], axis=1).columns)

In [22]:
# reference: https://www.kaggle.com/cast42/feature-selection-and-elastic-net

print("From all features, Elastic Net picked " + str(sum(coef_df_all_feat != 0)) + " variables and eliminated the other " +  str(sum(coef_df_all_feat == 0)) + " variables")

From all features, Elastic Net picked 3 variables and eliminated the other 3591 variables


In [23]:
# reference: https://www.kaggle.com/cast42/feature-selection-and-elastic-net

# The relevant features are

coef_df_all_feat[abs(coef_df_all_feat) > 0].sort_values()

Product_Category_1   -0.192780
Unnamed: 0            0.000368
User_ID               0.000543
dtype: float64

In [24]:
relevant_features = coef_df_all_feat[abs(coef_df_all_feat) > 0].sort_values().index.tolist()

### 4. Retrain model using relevant features

In [25]:
relevant_features

['Product_Category_1', 'Unnamed: 0', 'User_ID']

In [26]:
relevant_features.append('Purchase')

In [27]:
relevant_features

['Product_Category_1', 'Unnamed: 0', 'User_ID', 'Purchase']

In [28]:
# reduce training set to the relevant features

train_df_with_dummies_noNaN_relevant_feat = train_df_with_dummies_noNaN[relevant_features]

In [29]:

train_df_with_dummies_noNaN_relevant_feat.head(5)

Unnamed: 0.1,Product_Category_1,Unnamed: 0,User_ID,Purchase
0,1,161273,1000965,19134
6,6,161279,1000967,8598
17,4,161290,1000970,3458
19,6,161292,1000970,16679
20,1,161293,1000970,11771


In [30]:
# create label

y2 = train_df_with_dummies_noNaN_relevant_feat['Purchase'].values
y2.shape

(114928,)

In [31]:
# create features

X2 = train_df_with_dummies_noNaN_relevant_feat.drop(['Purchase'], axis=1).values
X2.shape

(114928, 3)

**retrain with relevant features only**

In [32]:
relevant_lm = LinearRegression(fit_intercept=True, normalize=False)
relevant_lm.fit(X2, y2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### 5. Predict on test set

In [33]:
# read csv
test_df = pd.read_csv('../data/test.csv')

In [34]:
test_df.shape

(161273, 13)

In [35]:
test_df.columns.values

array(['Unnamed: 0', 'User_ID', 'Product_ID', 'Gender', 'Age',
       'Occupation', 'City_Category', 'Stay_In_Current_City_Years',
       'Marital_Status', 'Product_Category_1', 'Product_Category_2',
       'Product_Category_3', 'Purchase'], dtype=object)

In [36]:
# convert the discrete columns into dummy variables

#dummy_cols = ['Product_ID', 'Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']
test_df_with_dummies = pd.get_dummies(test_df, columns= dummy_cols)

In [37]:
# although unlikely, I want to check and see if there is any NaN in the test set

test_df_with_dummies.isnull().values.any()

True

In [38]:
test_df_relevant = test_df_with_dummies[relevant_features]

In [39]:
# drop all NaN
test_df_relevant_noNaN = test_df_relevant.dropna()

In [40]:
test_df_relevant_noNaN.head(5)

Unnamed: 0.1,Product_Category_1,Unnamed: 0,User_ID,Purchase
0,3,0,1000001,8370
1,1,1,1000001,15200
2,12,2,1000001,1422
3,12,3,1000001,1057
4,8,4,1000002,7969


In [41]:
y_true = test_df_relevant_noNaN['Purchase'].values

In [42]:
test_df_relevant_noNaN = test_df_relevant_noNaN.drop(['Purchase'], axis=1)

In [43]:
test_df_relevant_noNaN.shape

(161273, 3)

In [44]:

relevant_lm.predict(test_df_relevant_noNaN)

array([11297.27434845, 12860.25572432,  4263.86013754, ...,
        9804.28087718,  9804.28118187,  9804.30313801])

In [45]:
y_predicted = relevant_lm.predict(test_df_relevant_noNaN)

In [46]:
math.sqrt(mean_squared_error(y_true, y_predicted))

4918.699980965837

In [47]:
print("The baseline score was", (math.sqrt(mean_squared_error(y_val, y_pred_eNet_all_feat))), "and now my test score is", math.sqrt(mean_squared_error(y_true, y_predicted)),
      "Even though my validation score was ")

The baseline score was 5051.876669180532 and now my test score is 4918.699980965837 Even though my validation score was 


### 6. Black Friday Data Set as a Classification Problem

Now, I will try to classify the customer's martial status from the black data set.

The setup:
- the features will be all other columns including "Purchase"
- the label will be "Martial Status".

### 7. Preprocessing for Classification

In [49]:
test_df.head(3)

Unnamed: 0.1,Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
