In [29]:
#importing relevant libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
import datetime

In [30]:
# Read the labelled training rows and the unlabelled rows to predict for.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

In [31]:
#Function to process features data set to create categorical dummy variables
def process_categories(df):
    """One-hot encode the demographic, occupation and product-category columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Gender', 'Age', 'City_Category',
        'Stay_In_Current_City_Years', 'Occupation' and
        'Product_Category_1' .. 'Product_Category_3'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame laid out as: occupation dummies, demographic dummies, the
        untouched original columns, then the product-category dummies. The
        encoded source columns are removed. Every dummy column is uint8.
    """
    categorical_variables = ['Gender','Age','City_Category','Stay_In_Current_City_Years']

    # dtype is pinned because newer pandas defaults get_dummies to bool, and
    # the feature selection later in this notebook picks columns by
    # int64/uint8 dtype.
    dummy_variables = pd.get_dummies(df[categorical_variables], dtype='uint8')
    occupation_dummies = pd.get_dummies(df['Occupation'], prefix='Occupation', dtype='uint8')
    df_towork = pd.concat([occupation_dummies, dummy_variables, df], axis=1)
    df_towork = df_towork.drop(categorical_variables + ['Occupation'], axis=1)

    #Since male and female are the only 2 genders in the data we can drop one of the variables
    # errors='ignore': a frame without any male rows has no Gender_M column.
    df_towork = df_towork.drop(columns='Gender_M', errors='ignore')

    cat1 = pd.get_dummies(df_towork['Product_Category_1'], prefix='Product_category_1', dtype='uint8')
    # Categories 2 and 3 are cast to float so their labels are always of the
    # form 'Product_category_2_8.0', which is the naming the later
    # identify_missing_columns / collate_prod_categories steps look for, even
    # when a file happens to load these columns as integers.
    cat2 = pd.get_dummies(df_towork['Product_Category_2'].astype(float),
                          prefix='Product_category_2', dtype='uint8')
    cat3 = pd.get_dummies(df_towork['Product_Category_3'].astype(float),
                          prefix='Product_category_3', dtype='uint8')

    df_towork = pd.concat([df_towork, cat1, cat2, cat3], axis=1)
    return df_towork.drop(['Product_Category_1', 'Product_Category_2', 'Product_Category_3'], axis=1)

In [32]:
def identify_missing_columns(df):
    """Add an all-zero column for every product-category dummy that is absent.

    The expected labels are 'Product_category_1_<k>' for k in 1..20 and
    'Product_category_2_<k>.0' / 'Product_category_3_<k>.0' for k in 1..20
    (the latter two use the float spelling of k).

    The frame is modified in place and the same object is returned; new
    columns are appended in the order category 1, category 2, category 3.
    """
    levels = range(1, 21)
    expected = (
        ['Product_category_1_' + str(k) for k in levels]
        + ['Product_category_2_' + str(float(k)) for k in levels]
        + ['Product_category_3_' + str(float(k)) for k in levels]
    )

    # Decide what is missing before touching the frame, then fill with zeros.
    absent = [label for label in expected if label not in df.columns]
    for label in absent:
        df[label] = 0
    return df

In [33]:
def collate_prod_categories(df):
    """Merge the three per-slot product-category dummies into one flag per category.

    For each k in 1..20, 'Prod_category_<k>' is the element-wise OR of
    'Product_category_1_<k>', 'Product_category_2_<k>.0' and
    'Product_category_3_<k>.0'. All sixty source columns must be present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sixty dummy columns. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the sixty source columns removed and the twenty
        'Prod_category_<k>' columns appended in ascending k.
    """
    collated = {}
    cols_to_drop = []
    for i in range(1, 21):
        category1_name = 'Product_category_1_' + str(i)
        category2_name = 'Product_category_2_' + str(float(i))
        category3_name = 'Product_category_3_' + str(float(i))
        collated['Prod_category_' + str(i)] = (
            df[category1_name] | df[category2_name] | df[category3_name]
        )
        cols_to_drop.extend([category1_name, category2_name, category3_name])

    # One drop and one assign: the caller's frame is never written to, and the
    # wide frame is copied once rather than once per category.
    return df.drop(cols_to_drop, axis=1).assign(**collated)

In [34]:
# Training features: encode categoricals, pad absent product-category dummies,
# then fold the three product slots into one flag per category.
df_train = identify_missing_columns(process_categories(train))
df_train_final = collate_prod_categories(df_train)

In [35]:
# Print the engineered training frame's column names, non-null counts and dtypes.
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 61 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   Occupation_0                   550068 non-null  uint8 
 1   Occupation_1                   550068 non-null  uint8 
 2   Occupation_2                   550068 non-null  uint8 
 3   Occupation_3                   550068 non-null  uint8 
 4   Occupation_4                   550068 non-null  uint8 
 5   Occupation_5                   550068 non-null  uint8 
 6   Occupation_6                   550068 non-null  uint8 
 7   Occupation_7                   550068 non-null  uint8 
 8   Occupation_8                   550068 non-null  uint8 
 9   Occupation_9                   550068 non-null  uint8 
 10  Occupation_10                  550068 non-null  uint8 
 11  Occupation_11                  550068 non-null  uint8 
 12  Occupation_12                  550068 non-nu

In [36]:
# Apply the same three-step feature pipeline to the rows we must predict for.
df_test = identify_missing_columns(process_categories(test))
df_test_final = collate_prod_categories(df_test)

In [37]:
# Baseline model: ordinary least squares on every int64/uint8 column other
# than the target.
# NOTE(review): selection is purely by dtype, so any identifier stored as
# int64 (e.g. User_ID, if it is int64) becomes a feature - confirm intended.
features = (
    df_train_final
    .select_dtypes(include=('int64', 'uint8'))
    .drop(['Purchase'], axis=1)
    .columns
)

lr = linear_model.LinearRegression()
lr.fit(df_train_final[features], df_train_final['Purchase'])
predict = lr.predict(df_test_final[features])

# Submission file: predicted Purchase keyed by user and product.
submissions = pd.DataFrame({
    'Purchase': predict,
    'User_ID': df_test_final['User_ID'],
    'Product_ID': df_test_final['Product_ID'],
})
submissions.to_csv('submission_0330_lm.csv', index=False)

In [22]:
# Hold out 20% of the labelled rows for local evaluation; the split is seeded
# so train_X / test_X / train_y / test_y are the same on every run.
all_X=df_train_final[features]
all_y=df_train_final['Purchase']
train_X,test_X,train_y,test_y = train_test_split(all_X,all_y,test_size=.2,random_state=0)

# Only non-default settings are spelled out. The previous call pasted a full
# estimator repr, including criterion='mse', max_features='auto' and
# min_impurity_split=None, which newer scikit-learn releases reject. Those
# values were the library defaults, so omitting them keeps the same model
# configuration while remaining portable across versions.
# random_state is fixed so the forest (and the submission) is reproducible.
model_RF = RandomForestRegressor(n_estimators=100, random_state=0)

# train_X already contains exactly the `features` columns.
model_RF.fit(train_X,train_y)
predict=model_RF.predict(df_test_final[features])
submissions=pd.DataFrame({'Purchase':predict,'User_ID':df_test_final['User_ID'],'Product_ID':df_test_final['Product_ID']})
submissions.to_csv('submission_0330_RF.csv',index=False)

In [51]:
#Function to grid search over different hyperparameters and select the best one
def select_model(df,optimized_columns,cv=5,n_jobs=None,verbose=3):
    """Grid-search each candidate estimator and record its best configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; must contain the target column 'Purchase'.
    optimized_columns : list-like
        Column labels of `df` to use as features.
    cv : int or cross-validation splitter, default 5
        Forwarded to GridSearchCV. An integer gives contiguous, unshuffled
        folds for a regressor.
        NOTE(review): in the recorded run the last fold consistently scored
        lower than the others, which may point to row-order effects; consider
        passing a shuffled splitter such as KFold(5, shuffle=True, random_state=0).
    n_jobs : int or None, default None
        Forwarded to GridSearchCV. None runs the fits sequentially (the
        recorded run needed about 3h46m for 360 fits); -1 uses all cores.
    verbose : int, default 3
        Forwarded to GridSearchCV; 3 prints one line per fit.

    Returns
    -------
    list of dict
        One dict per candidate with keys 'name', 'estimator',
        'hyperparameters', plus 'best_params', 'best_score' and
        'best_estimator' taken from the fitted search.
    """
    all_X = df[optimized_columns]
    all_y = df['Purchase']

    model_list = [
    {
    "name":"XGBRegressor",
    "estimator":XGBRegressor(),
    "hyperparameters":
        {
            "learning_rate": [ 0.20, 0.25, 0.30 ],
            "max_depth":[ 5, 6, 8],
            "min_child_weight":[ 1, 3],
            "gamma":[ 0.0, 0.1],
            "colsample_bytree" : [  0.5 , 0.7 ]
        }
    }
    ]

    for i in model_list:
        print(i['name'])
        gv = GridSearchCV(i['estimator'],i['hyperparameters'],cv=cv,n_jobs=n_jobs,verbose=verbose)
        print('Setup of Grid Search')
        gv.fit(all_X,all_y)
        print('Grid Search Fit')
        # Keep the winning configuration next to the candidate's definition.
        i['best_params'] = gv.best_params_
        i['best_score'] = gv.best_score_
        i['best_estimator'] = gv.best_estimator_
        print('Returning Model')
    return model_list

In [52]:
# Time the full grid search. The result is displayed in the next cell; a bare
# `best_model` in the middle of this cell would not be rendered, so it is omitted.
begin_time = datetime.datetime.now()
best_model = select_model(df_train_final,features)
end_time = datetime.datetime.now()
print("Time taken {}".format(end_time - begin_time))

XGBRegressor
Setup of Grid Search
Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1, score=0.637, total=  32.2s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.1s remaining:    0.0s


[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1, score=0.638, total=  24.3s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   56.4s remaining:    0.0s


[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1, score=0.641, total=  24.8s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1, score=0.636, total=  24.9s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=1, score=0.561, total=  24.7s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=3, score=0.638, total=  24.8s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.2, max_depth=5, min_child_weight=3, score=0.637, total=  24.8s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rat

[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=1, score=0.658, total=  29.0s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=1, score=0.656, total=  29.0s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=1, score=0.579, total=  29.3s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=3, score=0.653, total=  28.5s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=6, min_child_weight=3, score=0.656, total=  29.0s
[CV] colsample_bytree=0.5, gamma=0.0, lea

[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=1, score=0.671, total=  39.7s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=1, score=0.672, total=  39.3s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=1, score=0.584, total=  40.2s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=3, score=0.670, total=  40.9s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.0, learning_rate=0.3, max_depth=8, min_child_weight=3, score=0.672, total=  39.5s
[CV] colsample_bytree=0.5, gamma=0.0, learning_rat

[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=1, score=0.647, total=  24.8s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=1, score=0.643, total=  25.1s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=1, score=0.556, total=  25.1s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=3, score=0.644, total=  25.2s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.25, max_depth=5, min_child_weight=3, score=0.644, total=  24.8s
[CV] colsample_bytree=0.5, gamma=0.1, lea

[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=1, score=0.662, total=  29.4s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=1, score=0.660, total=  29.5s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=1 
[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=1, score=0.570, total=  29.6s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=3, score=0.658, total=  29.3s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=3 
[CV]  colsample_bytree=0.5, gamma=0.1, learning_rate=0.3, max_depth=6, min_child_weight=3, score=0.658, total=  28.8s
[CV] colsample_bytree=0.5, gamma=0.1, learning_rat

[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=1, score=0.673, total=  50.3s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=1 
[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=1, score=0.671, total=  54.1s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=1 
[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=1, score=0.572, total=  52.7s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=3 
[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=3, score=0.672, total=  50.9s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=3 
[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.2, max_depth=8, min_child_weight=3, score=0.673, total=  48.9s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rat

[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=1, score=0.657, total=  31.0s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=1 
[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=1, score=0.652, total=  31.2s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=1 
[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=1, score=0.548, total=  32.0s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=3 
[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=3, score=0.654, total=  31.3s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=3 
[CV]  colsample_bytree=0.7, gamma=0.0, learning_rate=0.3, max_depth=5, min_child_weight=3, score=0.654, total=  32.0s
[CV] colsample_bytree=0.7, gamma=0.0, learning_rat

[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=1, score=0.659, total= 1.2min
[CV] colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=1 
[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=1, score=0.656, total=  47.9s
[CV] colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=1 
[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=1, score=0.546, total=  46.3s
[CV] colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=3 
[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=3, score=0.656, total=  45.2s
[CV] colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=3 
[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.2, max_depth=6, min_child_weight=3, score=0.656, total=  45.7s
[CV] colsample_bytree=0.7, gamma=0.1, learning_rat

[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=1, score=0.677, total=  48.9s
[CV] colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=1 
[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=1, score=0.678, total=  50.4s
[CV] colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=1 
[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=1, score=0.566, total=  49.7s
[CV] colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=3 
[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=3, score=0.678, total=  48.8s
[CV] colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=3 
[CV]  colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=3, score=0.676, total=  51.3s
[CV] colsample_bytree=0.7, gamma=0.1, lea

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 225.3min finished


Grid Search Fit
Returning Model
Time taken 3:46:27.699731


In [53]:
# Display the list returned by select_model (candidate, searched grid, best params/score/estimator).
best_model

[{'name': 'XGBRegressor',
  'estimator': XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=None, colsample_bytree=None, gamma=None,
               gpu_id=None, importance_type='gain', interaction_constraints=None,
               learning_rate=None, max_delta_step=None, max_depth=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               n_estimators=100, n_jobs=None, num_parallel_tree=None,
               objective='reg:squarederror', random_state=None, reg_alpha=None,
               reg_lambda=None, scale_pos_weight=None, subsample=None,
               tree_method=None, validate_parameters=None, verbosity=None),
  'hyperparameters': {'learning_rate': [0.2, 0.25, 0.3],
   'max_depth': [5, 6, 8],
   'min_child_weight': [1, 3],
   'gamma': [0.0, 0.1],
   'colsample_bytree': [0.5, 0.7]},
  'best_params': {'colsample_bytree': 0.7,
   'gamma': 0.0,
   'learning_rate': 0.3,
   'max_depth': 8,
   'min_ch

In [55]:
# Final XGBoost regressor. Keyword arguments are grouped by role; the values
# are unchanged.
xgbr = XGBRegressor(
    # Values from the grid search's best_params displayed above
    # (the recorded output is cut off before min_child_weight).
    colsample_bytree=0.7,
    gamma=0.0,
    learning_rate=0.3,
    max_depth=8,
    min_child_weight=1,
    # Ensemble size, objective and execution settings.
    n_estimators=100,
    objective='reg:squarederror',
    booster='gbtree',
    tree_method='exact',
    n_jobs=4,
    random_state=0,
    gpu_id=-1,
    # Remaining settings, spelled out explicitly.
    base_score=0.5,
    colsample_bylevel=1,
    colsample_bynode=1,
    subsample=1,
    num_parallel_tree=1,
    max_delta_step=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    importance_type='gain',
    interaction_constraints='',
    monotone_constraints='()',
    validate_parameters=1,
    verbosity=None,
)
print(xgbr)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0.0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=8,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)


In [56]:
# Fit the final regressor on the training split, report its score on that same
# split, then run 10-fold cross-validation, printing a timestamp after each
# stage and the total elapsed time at the end.
begin_time = datetime.datetime.now()
xgbr.fit(train_X, train_y)
time_now = datetime.datetime.now()
print('done fitting at {}'.format(time_now))
# Scored on the rows the model was just fitted on, so this is an in-sample figure.
score = xgbr.score(train_X, train_y) 
time_now = datetime.datetime.now()
print('done scoring at {}'.format(time_now))
print("Training score: ", score)
# cv=10: ten folds over the training split only; test_X / test_y are not used here.
scores = cross_val_score(xgbr, train_X, train_y,cv=10)
time_now = datetime.datetime.now()
print('done cross val scoring at {}'.format(time_now))
print("Mean cross-validation score: %.2f" % scores.mean())
end_time = datetime.datetime.now()
print("Time taken {}".format(end_time - begin_time))

done fitting at 2021-03-31 16:18:51.320863
done scoring at 2021-03-31 16:18:52.475109
Training score:  0.707840173408586
done cross val scoring at 2021-03-31 16:31:11.809316
Mean cross-validation score: 0.69
Time taken 0:13:31.886681


In [57]:
# Evaluate the fitted regressor on the 20% hold-out split (test_X / test_y).
ypred = xgbr.predict(test_X)
mse = mean_squared_error(test_y, ypred)
print("MSE: %.2f" % mse)
# Recorded run printed MSE: 7916734.31
print("RMSE: %.2f" % (mse**(1/2.0)))
# Recorded run printed RMSE: 2813.67

MSE: 7916734.31
RMSE: 2813.67


In [58]:
# Score the competition rows with the tuned XGBoost model and write the submission.
predict = xgbr.predict(df_test_final[features])
submissions = pd.DataFrame({
    'Purchase': predict,
    'User_ID': df_test_final['User_ID'],
    'Product_ID': df_test_final['Product_ID'],
})
submissions.to_csv('submission_0331_xgbr.csv', index=False)