[View in Colaboratory](https://colab.research.google.com/github/peter0083/van_ai_coding_challenge_3/blob/master/src/model.ipynb)

In [1]:
# first time using Google Colab
# Sanity check that the runtime executes a cell and shows its output.
print("hello world!")

hello world!


In [2]:
# Check which Python interpreter the runtime provides.
import sys
sys.version  # bare expression in the middle of a cell: evaluated but not displayed
sys.version_info  # last expression of the cell, so this is the value that is shown

sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)

In [50]:
# important dependencies
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import stats
%matplotlib inline

In [4]:
# read csv
# Load the training data; the relative path is resolved against the current working directory.
train_df = pd.read_csv('../data/train.csv')

In [5]:
# (rows, columns) of the raw training frame.
train_df.shape

(550068, 12)

### 1. Preprocessing

In [6]:
# Preview the first 10 rows to see the column types and where values are missing.
train_df.head(10)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871


In [7]:
# convert the discrete columns into dummy variables
# Each listed column is replaced by one indicator column per distinct value.
# Product_ID has by far the most distinct values, so it accounts for almost all
# of the columns in the encoded frame.
# `dummy_cols` is reused further down when the test set is encoded.

dummy_cols = ['Product_ID', 'Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']
train_df_with_dummies = pd.get_dummies(train_df, columns= dummy_cols)

In [8]:
# inspect the train df with dummies
# The original categorical columns are gone; their indicator columns are appended on the right.

train_df_with_dummies.head(10)

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Product_ID_P00000142,Product_ID_P00000242,Product_ID_P00000342,...,Age_51-55,Age_55+,City_Category_A,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,1000001,10,0,3,,,8370,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,1000001,10,0,1,6.0,14.0,15200,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,1000001,10,0,12,,,1422,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,1000001,10,0,12,14.0,,1057,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1000002,16,0,8,,,7969,0,0,0,...,0,1,0,0,1,0,0,0,0,1
5,1000003,15,0,1,2.0,,15227,0,0,0,...,0,0,1,0,0,0,0,0,1,0
6,1000004,7,1,1,8.0,17.0,19215,0,0,0,...,0,0,0,1,0,0,0,1,0,0
7,1000004,7,1,1,15.0,,15854,0,0,0,...,0,0,0,1,0,0,0,1,0,0
8,1000004,7,1,1,16.0,,15686,0,0,0,...,0,0,0,1,0,0,0,1,0,0
9,1000005,20,1,8,,,7871,0,0,0,...,0,0,1,0,0,0,1,0,0,0


Now the dimensions are 550k x 3655. This is still a good ratio: the number of observations is more than 10 times the number of features, so I am not running into the curse-of-dimensionality problem here. However, I should be careful not to overfit.

**Create X and Y arrays for actual training**

In [17]:
# Keep only fully populated rows: a row is discarded as soon as any of its
# columns holds a NaN.
# NOTE(review): this shrinks the frame from 550068 rows to 166821; confirm
# that discarding the incomplete rows (rather than imputing) is intended.
train_df_with_dummies_noNaN = train_df_with_dummies.dropna(axis=0, how='any')

In [18]:
# Target vector: the purchase amounts of the complete rows, as a NumPy array.
y = train_df_with_dummies_noNaN.loc[:, 'Purchase'].values

# One entry per remaining training row.
y.shape

(166821,)

In [19]:
# Feature matrix: every column except the target, as a NumPy array.
# NOTE(review): User_ID is still among these columns and is therefore fed to
# the models as a numeric feature; confirm that this is intended.
X = train_df_with_dummies_noNaN.drop(columns=['Purchase']).values

# (rows, features)
X.shape

(166821, 3654)

In [20]:
# create train-validation split
# 20% of the rows are held out for validation; random_state is fixed so that
# re-running the cell reproduces the same split.

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

In [21]:
# (rows, features) of the training part of the split.
X_train.shape

(133456, 3654)

In [22]:
# (rows, features) of the validation part of the split.
X_val.shape

(33365, 3654)

### 2. Baseline Linear Regression

I'm not normalizing the features here yet because I am not doing feature selection.

In [23]:

# Baseline: ordinary least squares with an intercept, fitted on the training split.
# `normalize` is intentionally not passed: False was its default, and the
# keyword has since been removed from scikit-learn, so passing it explicitly
# makes this cell fail on current releases without changing the fit on old ones.
baseline_lm = LinearRegression(fit_intercept=True)
baseline_lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

**Regression metrics**

References:

- https://www.quora.com/What-is-the-difference-between-squared-error-and-absolute-error
- https://www.ritchieng.com/machine-learning-evaluate-linear-regression-model/

I will use Root Mean Squared Error (RMSE) for its interpretability: it is expressed in the same units as `Purchase`.

In [33]:
# Predict on the held-out validation rows with the baseline model.
y_pred_baseline_lm = baseline_lm.predict(X_val)

# RMSE is the square root of the mean squared error between labels and predictions.
print("RMSE on validation set -- baseline model:", math.sqrt(mean_squared_error(y_val, y_pred_baseline_lm)))

RMSE on validation set -- baseline model: 4688021.134763261


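This RMSE is a very large number: it is far bigger than the purchase amounts themselves (roughly 10^3 to 10^4 in the sample rows above), which suggests that the unregularized fit on thousands of dummy columns is unstable on unseen rows rather than merely inaccurate. We have a lot of room for improvement.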

### 3. Elastic Net Cross-validation

I will now use Elastic Net to perform feature selection.

First, find an optimal alpha value for the elastic net model.

In [34]:
# Cross-validated search (10 folds) for the elastic net regularisation strength.
# Only `cv` and `random_state` are set; every other setting is left at the library default.
elastic_net_cv_model = ElasticNetCV(cv=10, random_state=1)
elastic_net_cv_model.fit(X_train, y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
       l1_ratio=0.5, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto', random_state=1,
       selection='cyclic', tol=0.0001, verbose=0)

In [38]:
# Regularisation strength selected by the cross-validated search.
relevant_alpha = elastic_net_cv_model.alpha_
print("The relevant alpha from cross validation is", relevant_alpha)

The relevant alpha from cross validation is 40.16918968228336


In [39]:
# Refit a plain elastic net on the training split with the hyper-parameters the
# cross-validated search settled on.
# l1_ratio is read from the CV model instead of being hard-coded, so this cell
# stays consistent with the search if its l1_ratio setting is ever changed
# (with the search as configured above, the value is still 0.5).
elastic_net_model_all_feat = ElasticNet(
    alpha=relevant_alpha,
    l1_ratio=elastic_net_cv_model.l1_ratio_,
)
elastic_net_model_all_feat.fit(X_train, y_train)

ElasticNet(alpha=40.16918968228336, copy_X=True, fit_intercept=True,
      l1_ratio=0.5, max_iter=1000, normalize=False, positive=False,
      precompute=False, random_state=None, selection='cyclic', tol=0.0001,
      warm_start=False)

In [41]:
# Validation predictions from the elastic net that was fitted on all features.
y_pred_eNet_all_feat = elastic_net_model_all_feat.predict(X_val)

# Same metric as for the baseline: square root of the mean squared error.
print(
    "RMSE on validation set -- elastic net model with ALL features:",
    math.sqrt(mean_squared_error(y_val, y_pred_eNet_all_feat)),
)

RMSE on validation set -- elastic net model with ALL features: 4895.660278169304


In [46]:
# Elastic net RMSE expressed as a percentage of the baseline RMSE.
(
    math.sqrt(mean_squared_error(y_val, y_pred_eNet_all_feat))
    / math.sqrt(mean_squared_error(y_val, y_pred_baseline_lm))
    * 100
)

0.10442914264755182

It looks promising! I have just reduced the RMSE to 0.1% of the baseline.

How many features did Elastic Net pick?

In [47]:
# reference: https://www.kaggle.com/cast42/feature-selection-and-elastic-net
# Pair every fitted coefficient with the name of the feature column it belongs
# to (all columns of the cleaned frame except the target, in frame order).
coef_df_all_feat = pd.Series(
    elastic_net_model_all_feat.coef_,
    index=train_df_with_dummies_noNaN.columns.drop('Purchase'),
)

In [48]:
# reference: https://www.kaggle.com/cast42/feature-selection-and-elastic-net

# A feature counts as "picked" when its coefficient is non-zero and as
# "eliminated" when the coefficient is exactly zero.
print(
    "From all features, Elastic Net picked "
    + str((coef_df_all_feat != 0).sum())
    + " variables and eliminated the other "
    + str((coef_df_all_feat == 0).sum())
    + " variables"
)

Using all features, Elastic Net picked 39 variables and eliminated the other 3615 variables


In [64]:
# reference: https://www.kaggle.com/cast42/feature-selection-and-elastic-net

# The relevant features are those whose coefficient magnitude exceeds 1,
# listed from the most negative to the most positive coefficient.
coef_df_all_feat[coef_df_all_feat.abs() > 1].sort_values()

Product_Category_1     -180.087047
Product_Category_2      -64.970860
Gender_F                 -4.302533
City_Category_A          -4.204168
City_Category_B          -2.451271
Product_ID_P00102642     -2.147087
Product_ID_P00003442     -1.485433
Product_ID_P00116142      1.022687
Product_ID_P00080342      1.028125
Product_ID_P00184942      1.071919
Product_ID_P00110742      1.073487
Product_ID_P00237542      1.198889
Product_ID_P00148642      1.275832
Product_ID_P00025442      1.433131
Product_ID_P00059442      1.634299
Gender_M                  4.302533
City_Category_C           7.655439
Occupation               11.959446
Product_Category_3       15.674371
dtype: float64

In [74]:
# Names of the features whose coefficient magnitude exceeds 1, ordered by
# ascending coefficient value.
relevant_features = (
    coef_df_all_feat[coef_df_all_feat.abs() > 1]
    .sort_values()
    .index
    .tolist()
)

### 4. Retrain model using relevant features

In [75]:
# Show the selected feature names.
relevant_features

['Product_Category_1',
 'Product_Category_2',
 'Gender_F',
 'City_Category_A',
 'City_Category_B',
 'Product_ID_P00102642',
 'Product_ID_P00003442',
 'Product_ID_P00116142',
 'Product_ID_P00080342',
 'Product_ID_P00184942',
 'Product_ID_P00110742',
 'Product_ID_P00237542',
 'Product_ID_P00148642',
 'Product_ID_P00025442',
 'Product_ID_P00059442',
 'Gender_M',
 'City_Category_C',
 'Occupation',
 'Product_Category_3']

In [76]:
# Add the target to the list so it can be selected together with the features.
# The membership check keeps the cell idempotent: without it, re-running the
# cell would append 'Purchase' a second time and the column selection that
# follows would yield a duplicated target column.
if 'Purchase' not in relevant_features:
    relevant_features.append('Purchase')

In [77]:
# Confirm that 'Purchase' is now the last entry of the list.
relevant_features

['Product_Category_1',
 'Product_Category_2',
 'Gender_F',
 'City_Category_A',
 'City_Category_B',
 'Product_ID_P00102642',
 'Product_ID_P00003442',
 'Product_ID_P00116142',
 'Product_ID_P00080342',
 'Product_ID_P00184942',
 'Product_ID_P00110742',
 'Product_ID_P00237542',
 'Product_ID_P00148642',
 'Product_ID_P00025442',
 'Product_ID_P00059442',
 'Gender_M',
 'City_Category_C',
 'Occupation',
 'Product_Category_3',
 'Purchase']

In [78]:
# Narrow the cleaned training frame down to the columns named in
# `relevant_features` (all rows are kept).
train_df_with_dummies_noNaN_relevant_feat = train_df_with_dummies_noNaN.loc[:, relevant_features]

In [79]:

# Preview the reduced frame: the selected feature columns followed by the target.
train_df_with_dummies_noNaN_relevant_feat.head(5)

Unnamed: 0,Product_Category_1,Product_Category_2,Gender_F,City_Category_A,City_Category_B,Product_ID_P00102642,Product_ID_P00003442,Product_ID_P00116142,Product_ID_P00080342,Product_ID_P00184942,Product_ID_P00110742,Product_ID_P00237542,Product_ID_P00148642,Product_ID_P00025442,Product_ID_P00059442,Gender_M,City_Category_C,Occupation,Product_Category_3,Purchase
1,1,6.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,10,14.0,15200
6,1,8.0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,7,17.0,19215
13,1,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,20,5.0,15665
14,5,8.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,9,14.0,5378
16,2,3.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,9,4.0,13055


In [80]:
# Target vector for the retrained model, as a NumPy array.
y2 = train_df_with_dummies_noNaN_relevant_feat.loc[:, 'Purchase'].values

# One entry per row of the reduced frame.
y2.shape

(166821,)

In [81]:
# Feature matrix for the retrained model: the reduced frame without the target.
X2 = train_df_with_dummies_noNaN_relevant_feat.drop(columns=['Purchase']).values

# (rows, selected features)
X2.shape

(166821, 19)

**retrain with relevant features only**

In [82]:
# Ordinary least squares with an intercept, fitted on the selected features only.
# Note that X2/y2 contain every cleaned row, i.e. no validation rows are held out here.
# `normalize` is intentionally not passed: False was its default, and the
# keyword has since been removed from scikit-learn, so passing it explicitly
# makes this cell fail on current releases without changing the fit on old ones.
relevant_lm = LinearRegression(fit_intercept=True)
relevant_lm.fit(X2, y2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### 5. Predict on test set

In [83]:
# read csv
# Load the test data; the relative path is resolved against the current working directory.
test_df = pd.read_csv('../data/test.csv')

In [84]:
# (rows, columns) of the raw test frame.
test_df.shape

(233599, 11)

In [89]:
# List the test-set column names; there is no 'Purchase' column in this file.
test_df.columns.values

array(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation',
       'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status',
       'Product_Category_1', 'Product_Category_2', 'Product_Category_3'],
      dtype=object)

In [90]:
# convert the discrete columns into dummy variables
# The same column list as for the training set is used: `dummy_cols` is the
# variable defined in the preprocessing section, so that cell must have run first.

test_df_with_dummies = pd.get_dummies(test_df, columns= dummy_cols)

In [91]:
# Check whether the encoded test set contains any NaN at all.
# A single True/False is produced for the whole frame, not one flag per column.

test_df_with_dummies.isnull().values.any()

True

In [93]:
# The test set has no target column, so take 'Purchase' back out of the list.
# The membership check keeps the cell re-runnable: a second bare
# list.remove('Purchase') would raise ValueError.
if 'Purchase' in relevant_features:
    relevant_features.remove('Purchase')

# reindex rather than plain column selection: the dummy columns of the test
# set are derived from the values present in the test file, so a product
# dummy selected from the training data is not guaranteed to exist here.
# A missing column is created and filled with 0 ("this product does not
# occur") instead of raising KeyError; existing columns, including their NaN
# values, are passed through unchanged and in `relevant_features` order.
test_df_relevant = test_df_with_dummies.reindex(columns=relevant_features, fill_value=0)

In [95]:
# Keep only the test rows in which every selected feature is present.
# NOTE(review): rows with a NaN are removed here, so the predictions made from
# this frame cover only the remaining rows; the frame's index still identifies
# which original test rows those are.
test_df_relevant_noNaN = test_df_relevant.dropna(axis=0, how='any')

In [96]:
# Predict purchase amounts for the complete test rows.
# The model was fitted on a plain NumPy array (X2), so the frame is converted
# with .values to hand predict() the same kind of input; the column order
# matches X2 because both are selected through `relevant_features`.
relevant_lm.predict(test_df_relevant_noNaN.values)

array([10324.25     , 12417.484375 , 13590.734375 , ...,  9176.671875 ,
       12827.3125   , 12563.5078125])

Unfortunately, the data set does not include test labels, so I cannot calculate how well I predicted these values or check whether my selected features did well.