In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

In [2]:
train_df = pd.read_csv('train.csv')
test_df  = pd.read_csv('test.csv')

In [3]:
rng = np.random.RandomState(8888)

In [4]:
train_df.head()

Unnamed: 0,QuoteNumber,Original_Quote_Date,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
0,1,2013-08-16,0,B,23,0.9403,0.0006,965,1.02,N,...,9,9,-1,8,-1,18,-1,10,N,CA
1,2,2014-04-22,0,F,7,1.0006,0.004,548,1.2433,N,...,10,10,-1,11,-1,17,-1,20,N,NJ
2,4,2014-08-25,0,F,7,1.0006,0.004,548,1.2433,N,...,15,18,-1,21,-1,11,-1,8,N,NJ
3,6,2013-04-15,0,J,10,0.9769,0.0004,1165,1.2665,N,...,6,5,-1,10,-1,9,-1,21,N,TX
4,8,2014-01-25,0,E,23,0.9472,0.0006,1487,1.3045,N,...,18,22,-1,10,-1,11,-1,12,N,IL


In [5]:
test_df.head()

Unnamed: 0,QuoteNumber,Original_Quote_Date,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
0,3,2014-08-12,E,16,0.9364,0.0006,1487,1.3045,N,4,...,1,1,-1,1,-1,20,-1,25,Y,IL
1,5,2013-09-07,F,11,0.9919,0.0038,564,1.1886,N,8,...,10,10,-1,5,-1,5,-1,21,N,NJ
2,7,2013-03-29,F,15,0.8945,0.0038,564,1.067,N,11,...,10,11,-1,20,-1,22,-1,11,N,NJ
3,9,2015-03-21,K,21,0.887,0.0004,1113,1.2665,Y,14,...,8,8,-1,13,-1,8,-1,21,N,TX
4,10,2014-12-10,B,25,0.9153,0.0007,935,1.02,N,4,...,7,7,-1,3,-1,22,-1,21,N,CA


# Handle Dates

In [6]:
train_df['Date'] = pd.to_datetime(pd.Series(train_df['Original_Quote_Date']))
train_df = train_df.drop('Original_Quote_Date', axis=1)

test_df['Date'] = pd.to_datetime(pd.Series(test_df['Original_Quote_Date']))
test_df = test_df.drop('Original_Quote_Date', axis=1)

train_df['Year']    = train_df['Date'].apply(lambda x: int(str(x)[:4]))
train_df['Month']   = train_df['Date'].apply(lambda x: int(str(x)[5:7]))
train_df['weekday'] = train_df['Date'].dt.dayofweek


test_df['Year']    = test_df['Date'].apply(lambda x: int(str(x)[:4]))
test_df['Month']   = test_df['Date'].apply(lambda x: int(str(x)[5:7]))
test_df['weekday'] = test_df['Date'].dt.dayofweek

train_df = train_df.drop('Date', axis=1)
test_df = test_df.drop('Date', axis=1)

In [7]:
train_df.head()

Unnamed: 0,QuoteNumber,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,weekday
0,1,0,B,23,0.9403,0.0006,965,1.02,N,17,...,8,-1,18,-1,10,N,CA,2013,8,4
1,2,0,F,7,1.0006,0.004,548,1.2433,N,6,...,11,-1,17,-1,20,N,NJ,2014,4,1
2,4,0,F,7,1.0006,0.004,548,1.2433,N,7,...,21,-1,11,-1,8,N,NJ,2014,8,0
3,6,0,J,10,0.9769,0.0004,1165,1.2665,N,3,...,10,-1,9,-1,21,N,TX,2013,4,0
4,8,0,E,23,0.9472,0.0006,1487,1.3045,N,8,...,10,-1,11,-1,12,N,IL,2014,1,5


In [8]:
test_df.head()

Unnamed: 0,QuoteNumber,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,CoverageField1B,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,weekday
0,3,E,16,0.9364,0.0006,1487,1.3045,N,4,4,...,1,-1,20,-1,25,Y,IL,2014,8,1
1,5,F,11,0.9919,0.0038,564,1.1886,N,8,14,...,5,-1,5,-1,21,N,NJ,2013,9,5
2,7,F,15,0.8945,0.0038,564,1.067,N,11,18,...,20,-1,22,-1,11,N,NJ,2013,3,4
3,9,K,21,0.887,0.0004,1113,1.2665,Y,14,22,...,13,-1,8,-1,21,N,TX,2015,3,5
4,10,B,25,0.9153,0.0007,935,1.02,N,4,5,...,3,-1,22,-1,21,N,CA,2014,12,2


# Fill NA

In [9]:
train_df = train_df.fillna(-1)
test_df  = test_df.fillna(-1)

# Encode NonNumeric columns

In [10]:
for col in train_df.columns:
    if train_df[col].dtype == 'object':
        encoder = preprocessing.LabelEncoder()
        encoder.fit(list(train_df[col].values) + list(test_df[col].values))
        train_df[col] = encoder.transform(list(train_df[col].values))
        test_df[col]  = encoder.transform(list(test_df[col].values))

In [11]:
train_df.shape

(260753, 301)

#  Get validation dataset

In [12]:
val_index1 = np.random.randint(0, train_df.shape[0], 2000)
val_index2 = np.random.randint(0, train_df.shape[0], 2000)
val_index3 = np.random.randint(0, train_df.shape[0], 2000)
val_index4 = np.random.randint(0, train_df.shape[0], 2000)
val_index5 = np.random.randint(0, train_df.shape[0], 2000)

In [13]:
val_data1 = train_df.ix[val_index1]
val_data2 = train_df.ix[val_index2]
val_data3 = train_df.ix[val_index3]
val_data4 = train_df.ix[val_index4]
val_data5 = train_df.ix[val_index5]

In [14]:
val_data1.head()

Unnamed: 0,QuoteNumber,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,weekday
78977,131514,0,1,19,0.9403,0.0006,7,1.02,0,5,...,3,-1,15,-1,8,1,0,2013,5,2
100074,166780,0,1,25,0.9153,0.0007,6,1.02,0,3,...,22,-1,20,-1,18,1,0,2014,10,1
230984,384999,0,7,21,0.887,0.0004,0,1.2665,1,5,...,5,-1,12,-1,23,1,3,2015,2,1
74012,123216,0,4,16,0.9194,0.0006,3,1.3045,0,1,...,13,-1,22,-1,15,1,1,2014,9,3
193523,322557,1,5,3,0.9023,0.0038,5,1.067,0,5,...,3,-1,2,-1,14,1,2,2013,2,2


In [15]:
train_target = train_df['QuoteConversion_Flag'].values
val_target1 = val_data1['QuoteConversion_Flag'].values
val_target2 = val_data2['QuoteConversion_Flag'].values
val_target3 = val_data3['QuoteConversion_Flag'].values
val_target4 = val_data4['QuoteConversion_Flag'].values
val_target5 = val_data5['QuoteConversion_Flag'].values
test_index = test_df['QuoteNumber'].values

In [16]:
train_features = train_df.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
val1_features = val_data1.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
val2_features = val_data2.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
val3_features = val_data3.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
val4_features = val_data4.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
val5_features = val_data5.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
test_features = test_df.drop('QuoteNumber', axis=1)

In [17]:
train_features.shape

(260753, 299)

# train 5 xgboost classifiers

In [18]:
dtrain = xgb.DMatrix(train_features, label=train_target)
dval1  = xgb.DMatrix(val1_features, label=val_target1)
dval2  = xgb.DMatrix(val2_features, label=val_target2)
dval3  = xgb.DMatrix(val3_features, label=val_target3)
dval4  = xgb.DMatrix(val4_features, label=val_target4)
dval5  = xgb.DMatrix(val5_features, label=val_target5)

In [19]:
params = {
    'max_depth': 8,
    'eta': 0.023,
    'silent': 1,
    'objective': "binary:logistic",
    'booster': "gbtree",
    'eval_metric': 'auc',
    'nthread':2,
    'subsample': 0.83,
    'colsample_bytree': 0.77
}

num_round = 1000
watchlist1 = [(dtrain, 'train'), (dval1, 'validation1')]
watchlist2 = [(dtrain, 'train'), (dval2, 'validation2')]
watchlist3 = [(dtrain, 'train'), (dval3, 'validation3')]
watchlist4 = [(dtrain, 'train'), (dval4, 'validation4')]
watchlist5 = [(dtrain, 'train'), (dval5, 'validation5')]

gbm1 = xgb.train(params, dtrain, num_round, watchlist1)
gbm2 = xgb.train(params, dtrain, num_round, watchlist2)
gbm3 = xgb.train(params, dtrain, num_round, watchlist3)
gbm4 = xgb.train(params, dtrain, num_round, watchlist4)
gbm5 = xgb.train(params, dtrain, num_round, watchlist5)

[0]	train-auc:0.947939	validation1-auc:0.950841
[1]	train-auc:0.949508	validation1-auc:0.951924
[2]	train-auc:0.948959	validation1-auc:0.951885
[3]	train-auc:0.950788	validation1-auc:0.953918
[4]	train-auc:0.950711	validation1-auc:0.953838
[5]	train-auc:0.950178	validation1-auc:0.953666
[6]	train-auc:0.952137	validation1-auc:0.955611
[7]	train-auc:0.952685	validation1-auc:0.956284
[8]	train-auc:0.953655	validation1-auc:0.957157
[9]	train-auc:0.954066	validation1-auc:0.957087
[10]	train-auc:0.953888	validation1-auc:0.956610
[11]	train-auc:0.954024	validation1-auc:0.957148
[12]	train-auc:0.954169	validation1-auc:0.957059
[13]	train-auc:0.954382	validation1-auc:0.957596
[14]	train-auc:0.954783	validation1-auc:0.957658
[15]	train-auc:0.954849	validation1-auc:0.957692
[16]	train-auc:0.955100	validation1-auc:0.957785
[17]	train-auc:0.955190	validation1-auc:0.957740
[18]	train-auc:0.955707	validation1-auc:0.958349
[19]	train-auc:0.955705	validation1-auc:0.958461
[20]	train-auc:0.955855	valida

# Predict

In [20]:
w1, w2, w3, w4, w5 = 1, 1, 1, 1, 1

In [21]:
dtest  = xgb.DMatrix(test_features)
preds = ((gbm1.predict(dtest) * w1) + (gbm2.predict(dtest) * w2) + (gbm3.predict(dtest) * w3) + (gbm4.predict(dtest) * w4) + (gbm5.predict(dtest) * w5)) * 1.0 / (w1+w2+w3+w4+w5)

In [22]:
submit_df = pd.DataFrame({"QuoteNumber": test_index, "QuoteConversion_Flag": preds})
submit_df.to_csv('homesite.csv', index=False)