In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

In [2]:
train_df = pd.read_csv('train.csv')
test_df  = pd.read_csv('test.csv')

In [3]:
rng = np.random.RandomState(8888)

In [4]:
train_df.head()

Unnamed: 0,QuoteNumber,Original_Quote_Date,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
0,1,2013-08-16,0,B,23,0.9403,0.0006,965,1.02,N,...,9,9,-1,8,-1,18,-1,10,N,CA
1,2,2014-04-22,0,F,7,1.0006,0.004,548,1.2433,N,...,10,10,-1,11,-1,17,-1,20,N,NJ
2,4,2014-08-25,0,F,7,1.0006,0.004,548,1.2433,N,...,15,18,-1,21,-1,11,-1,8,N,NJ
3,6,2013-04-15,0,J,10,0.9769,0.0004,1165,1.2665,N,...,6,5,-1,10,-1,9,-1,21,N,TX
4,8,2014-01-25,0,E,23,0.9472,0.0006,1487,1.3045,N,...,18,22,-1,10,-1,11,-1,12,N,IL


In [5]:
test_df.head()

Unnamed: 0,QuoteNumber,Original_Quote_Date,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
0,3,2014-08-12,E,16,0.9364,0.0006,1487,1.3045,N,4,...,1,1,-1,1,-1,20,-1,25,Y,IL
1,5,2013-09-07,F,11,0.9919,0.0038,564,1.1886,N,8,...,10,10,-1,5,-1,5,-1,21,N,NJ
2,7,2013-03-29,F,15,0.8945,0.0038,564,1.067,N,11,...,10,11,-1,20,-1,22,-1,11,N,NJ
3,9,2015-03-21,K,21,0.887,0.0004,1113,1.2665,Y,14,...,8,8,-1,13,-1,8,-1,21,N,TX
4,10,2014-12-10,B,25,0.9153,0.0007,935,1.02,N,4,...,7,7,-1,3,-1,22,-1,21,N,CA


# Handle Dates

In [6]:
train_df['Date'] = pd.to_datetime(pd.Series(train_df['Original_Quote_Date']))
train_df = train_df.drop('Original_Quote_Date', axis=1)

test_df['Date'] = pd.to_datetime(pd.Series(test_df['Original_Quote_Date']))
test_df = test_df.drop('Original_Quote_Date', axis=1)

train_df['Year']    = train_df['Date'].apply(lambda x: int(str(x)[:4]))
train_df['Month']   = train_df['Date'].apply(lambda x: int(str(x)[5:7]))
train_df['weekday'] = train_df['Date'].dt.dayofweek


test_df['Year']    = test_df['Date'].apply(lambda x: int(str(x)[:4]))
test_df['Month']   = test_df['Date'].apply(lambda x: int(str(x)[5:7]))
test_df['weekday'] = test_df['Date'].dt.dayofweek

train_df = train_df.drop('Date', axis=1)
test_df = test_df.drop('Date', axis=1)

In [7]:
train_df.head()

Unnamed: 0,QuoteNumber,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,weekday
0,1,0,B,23,0.9403,0.0006,965,1.02,N,17,...,8,-1,18,-1,10,N,CA,2013,8,4
1,2,0,F,7,1.0006,0.004,548,1.2433,N,6,...,11,-1,17,-1,20,N,NJ,2014,4,1
2,4,0,F,7,1.0006,0.004,548,1.2433,N,7,...,21,-1,11,-1,8,N,NJ,2014,8,0
3,6,0,J,10,0.9769,0.0004,1165,1.2665,N,3,...,10,-1,9,-1,21,N,TX,2013,4,0
4,8,0,E,23,0.9472,0.0006,1487,1.3045,N,8,...,10,-1,11,-1,12,N,IL,2014,1,5


In [8]:
test_df.head()

Unnamed: 0,QuoteNumber,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,CoverageField1B,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,weekday
0,3,E,16,0.9364,0.0006,1487,1.3045,N,4,4,...,1,-1,20,-1,25,Y,IL,2014,8,1
1,5,F,11,0.9919,0.0038,564,1.1886,N,8,14,...,5,-1,5,-1,21,N,NJ,2013,9,5
2,7,F,15,0.8945,0.0038,564,1.067,N,11,18,...,20,-1,22,-1,11,N,NJ,2013,3,4
3,9,K,21,0.887,0.0004,1113,1.2665,Y,14,22,...,13,-1,8,-1,21,N,TX,2015,3,5
4,10,B,25,0.9153,0.0007,935,1.02,N,4,5,...,3,-1,22,-1,21,N,CA,2014,12,2


# Fill NA

In [9]:
train_df = train_df.fillna(-1)
test_df  = test_df.fillna(-1)

# Encode NonNumeric columns

In [10]:
for col in train_df.columns:
    if train_df[col].dtype == 'object':
        encoder = preprocessing.LabelEncoder()
        encoder.fit(list(train_df[col].values) + list(test_df[col].values))
        train_df[col] = encoder.transform(list(train_df[col].values))
        test_df[col]  = encoder.transform(list(test_df[col].values))

In [11]:
train_df.shape

(260753, 301)

#  Get validation dataset

In [12]:
val_index = np.random.randint(0, train_df.shape[0], 2000)

In [13]:
val_data = train_df.ix[val_index]
val_data.shape

(2000, 301)

In [14]:
val_data.head()

Unnamed: 0,QuoteNumber,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,weekday
248826,414462,1,1,25,0.9153,0.0007,6,1.02,0,12,...,3,25,25,-1,8,1,0,2015,1,3
12478,20757,0,1,23,0.9403,0.0006,7,1.02,0,8,...,1,-1,25,-1,23,1,0,2013,3,1
105620,176092,0,1,23,0.9153,0.0007,6,1.02,0,4,...,7,-1,22,-1,21,1,0,2015,1,4
17845,29592,0,6,23,0.9559,0.0004,1,1.2392,0,2,...,14,-1,7,-1,19,1,3,2013,1,0
112111,186753,0,4,23,0.9392,0.0006,3,1.3045,0,14,...,7,-1,4,-1,8,1,1,2014,3,0


In [15]:
train_target = train_df['QuoteConversion_Flag'].values
val_target = val_data['QuoteConversion_Flag'].values
test_index = test_df['QuoteNumber'].values

In [16]:
train_features = train_df.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
val_features = val_data.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis=1)
test_features = test_df.drop('QuoteNumber', axis=1)

In [17]:
train_features.shape

(260753, 299)

# train using xgboost classifier

In [18]:
dtrain = xgb.DMatrix(train_features, label=train_target)
dval  = xgb.DMatrix(val_features, label=val_target)

In [19]:
params = {
    'max_depth': 8,
    'eta': 0.023,
    'silent': 1,
    'objective': "binary:logistic",
    'booster': "gbtree",
    'eval_metric': 'auc',
    'nthread':2,
    'subsample': 0.83,
    'colsample_bytree': 0.77
}

num_round = 1500
watchlist = [(dtrain, 'train'), (dval, 'validation')]

gbm = xgb.train(params, dtrain, num_round, watchlist)

[0]	train-auc:0.947939	validation-auc:0.937567
[1]	train-auc:0.949508	validation-auc:0.937471
[2]	train-auc:0.948959	validation-auc:0.936916
[3]	train-auc:0.950788	validation-auc:0.939563
[4]	train-auc:0.950711	validation-auc:0.939689
[5]	train-auc:0.950178	validation-auc:0.939140
[6]	train-auc:0.952137	validation-auc:0.941543
[7]	train-auc:0.952685	validation-auc:0.943660
[8]	train-auc:0.953655	validation-auc:0.945574
[9]	train-auc:0.954066	validation-auc:0.946139
[10]	train-auc:0.953888	validation-auc:0.946028
[11]	train-auc:0.954024	validation-auc:0.946781
[12]	train-auc:0.954169	validation-auc:0.946797
[13]	train-auc:0.954382	validation-auc:0.948451
[14]	train-auc:0.954783	validation-auc:0.948778
[15]	train-auc:0.954849	validation-auc:0.948932
[16]	train-auc:0.955100	validation-auc:0.948906
[17]	train-auc:0.955190	validation-auc:0.948878
[18]	train-auc:0.955707	validation-auc:0.949016
[19]	train-auc:0.955705	validation-auc:0.948995
[20]	train-auc:0.955855	validation-auc:0.948980
[2

# Predict

In [20]:
dtest  = xgb.DMatrix(test_features)
preds = gbm.predict(dtest)

In [21]:
submit_df = pd.DataFrame({"QuoteNumber": test_index, "QuoteConversion_Flag": preds})
submit_df.to_csv('homesite.csv', index=False)