In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
train=pd.read_csv('train/train.csv')
test=pd.read_csv('test/test.csv')

data=pd.concat([train,test])

In [3]:
data.head()

Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
0,0-17,A,F,0,10,3,,,P00069042,8370.0,2,1000001
1,0-17,A,F,0,10,1,6.0,14.0,P00248942,15200.0,2,1000001
2,0-17,A,F,0,10,12,,,P00087842,1422.0,2,1000001
3,0-17,A,F,0,10,12,14.0,,P00085442,1057.0,2,1000001
4,55+,C,M,0,16,8,,,P00285442,7969.0,4+,1000002


In [4]:
data.Gender.value_counts()

M    590031
F    193636
Name: Gender, dtype: int64

In [5]:
train.info(),test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 11 columns):
User_ID                       233599 non-null int64
Product_ID                    

(None, None)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 12 columns):
Age                           783667 non-null object
City_Category                 783667 non-null object
Gender                        783667 non-null object
Marital_Status                783667 non-null int64
Occupation                    783667 non-null int64
Product_Category_1            783667 non-null int64
Product_Category_2            537685 non-null float64
Product_Category_3            237858 non-null float64
Product_ID                    783667 non-null object
Purchase                      550068 non-null float64
Stay_In_Current_City_Years    783667 non-null object
User_ID                       783667 non-null int64
dtypes: float64(3), int64(4), object(5)
memory usage: 77.7+ MB


In [7]:
train.Product_Category_1.value_counts(),test.Product_Category_1.value_counts()

(5     150933
 1     140378
 8     113925
 11     24287
 2      23864
 6      20466
 3      20213
 4      11753
 16      9828
 15      6290
 13      5549
 10      5125
 12      3947
 7       3721
 18      3125
 20      2550
 19      1603
 14      1523
 17       578
 9        410
 Name: Product_Category_1, dtype: int64, 5     65017
 1     60321
 8     48369
 2     10192
 11    10153
 6      8860
 3      8578
 4      5003
 16     4105
 15     2694
 13     2381
 10     2248
 12     1663
 7      1624
 18     1311
 14      663
 17      223
 9       194
 Name: Product_Category_1, dtype: int64)

In [8]:
data.User_ID.max(),data.User_ID.min()

(1006040, 1000001)

In [9]:
data.Stay_In_Current_City_Years.value_counts()

1     276425
2     145427
3     135428
4+    120671
0     105716
Name: Stay_In_Current_City_Years, dtype: int64

In [10]:
dataP=pd.DataFrame()
dataP=pd.concat([pd.get_dummies(data['Product_Category_1'].fillna(0).astype(int)),
                 pd.get_dummies(data['Product_Category_2'].fillna(0).astype(int)),
                 pd.get_dummies(data['Product_Category_3'].fillna(0).astype(int))],axis=1)
dataP=dataP.groupby(dataP.columns,axis=1).sum().drop(0,axis=1)

In [11]:
dataP[:5]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
age={'0-17':1,'18-25':2,'26-35':3,'36-45':4,'46-50':5,'51-55':6,'55+':7}

In [13]:
dataB=pd.DataFrame()
#dataB=pd.concat([pd.get_dummies(data['Age']),
dataB=pd.get_dummies(data['City_Category'])
 #                dataP],axis=1)
dataB['Age']=data['Age'].map(age)
dataB['Cat_1']=data['Product_Category_1'].fillna(0).astype(int)
dataB['Cat_2']=data['Product_Category_2'].fillna(0).astype(int)
dataB['Cat_3']=data['Product_Category_3'].fillna(0).astype(int)
dataB['Stay']=data.Stay_In_Current_City_Years.replace('4+',4)
dataB['Gender']=data.Gender.map({'M':1,'F':0})
dataB['Marital_Status']=data.Marital_Status
dataB['Occupation']=data['Occupation']
dataB['UserID']=data['User_ID']-1000000
dataB['PID']=data['Product_ID'].str.strip('P')
dataB['Purchase']=data['Purchase']

In [14]:
dataB.head()

Unnamed: 0,A,B,C,Age,Cat_1,Cat_2,Cat_3,Stay,Gender,Marital_Status,Occupation,UserID,PID,Purchase
0,1,0,0,1,3,0,0,2,0,0,10,1,69042,8370.0
1,1,0,0,1,1,6,14,2,0,0,10,1,248942,15200.0
2,1,0,0,1,12,0,0,2,0,0,10,1,87842,1422.0
3,1,0,0,1,12,14,0,2,0,0,10,1,85442,1057.0
4,0,0,1,7,8,0,0,4,1,0,16,2,285442,7969.0


In [15]:
train_w=dataB[dataB['Purchase'].notnull()]
test_w=dataB[dataB['Purchase'].isnull()].drop('Purchase',axis=1)
train.shape,test.shape

((550068, 12), (233599, 11))

In [16]:
X_train,X_test,y_train,y_test=train_test_split(train_w.drop('Purchase',axis=1),train_w.Purchase)

In [22]:
md=GradientBoostingRegressor(min_samples_split=600,max_depth=15,max_features='sqrt',n_estimators=500).fit(X_train,y_train)

print('train score: {}'.format(md.score(X_train,y_train)))
print('test score: {}'.format(md.score(X_test,y_test)))

train_pred=md.predict(X_train)
test_pred=md.predict(X_test)
rms_train=math.sqrt(mean_squared_error(y_train,train_pred))
rms_test=math.sqrt(mean_squared_error(y_test,test_pred))
print('train set RMS score: {}'.format(rms_train))
print('test set RMS score: {}'.format(rms_test))

train score: 0.8220093615101316
test score: 0.7442216498990566
train set RMS score: 2121.4304664491815
test set RMS score: 2532.2625320492957


In [21]:
params={'max_depth':range(14,19,2), 'min_samples_split':range(700,1101,100)}
grid=GridSearchCV(GradientBoostingRegressor(max_features='sqrt',n_estimators=500),params,cv=5,n_jobs=-1)
grid.fit(X_train,y_train)
print('best parameters: {}'.format(grid.best_params_))
print('mean train score: {}'.format(grid.cv_results_['mean_train_score']))
print('mean test score: {}'.format(grid.cv_results_['mean_test_score']))

best parameters: {'max_depth': 16, 'min_samples_split': 700}
mean train score: [ 0.81039263  0.80539182  0.80108928  0.79755458  0.82396338  0.81883791
  0.81424213  0.80947499  0.83538157  0.82953022  0.82368163  0.81886207]
mean test score: [ 0.74237075  0.74176555  0.74145816  0.74103565  0.74242955  0.74190874
  0.74215254  0.74182184  0.7413114   0.74194161  0.7419134   0.7421324 ]




In [17]:
predictions=pd.DataFrame()

md=GradientBoostingRegressor(min_samples_split=600,max_depth=14,max_features='sqrt',n_estimators=500).fit(train_w.drop('Purchase',axis=1),train_w.Purchase)
predictions['one']=md.predict(test_w)

In [18]:
md=GradientBoostingRegressor(min_samples_split=700,max_depth=15,max_features='sqrt',n_estimators=500).fit(train_w.drop('Purchase',axis=1),train_w.Purchase)
predictions['two']=md.predict(test_w)

In [19]:
md=GradientBoostingRegressor(min_samples_split=800,max_depth=16,max_features='sqrt',n_estimators=500).fit(train_w.drop('Purchase',axis=1),train_w.Purchase)
predictions['three']=md.predict(test_w)

In [21]:
predictions['avg']=predictions.mean(axis=1)

In [22]:
solution=pd.DataFrame(data={'User_ID':list(test.User_ID),
                            'Product_ID':list(test.Product_ID),
                            'Purchase':predictions.avg})
solution.to_csv('solutions.csv',index=False)