In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

In [2]:
# Load the labelled training set and the unlabelled test set; both CSV
# files are expected in the current working directory.
train_df, test_df = map(pd.read_csv, ('train.csv', 'test.csv'))

In [3]:
# Preview the first rows of the raw training frame (ids, quote date, label and features).
train_df.head()

Unnamed: 0,QuoteNumber,Original_Quote_Date,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
0,1,2013-08-16,0,B,23,0.9403,0.0006,965,1.02,N,...,9,9,-1,8,-1,18,-1,10,N,CA
1,2,2014-04-22,0,F,7,1.0006,0.004,548,1.2433,N,...,10,10,-1,11,-1,17,-1,20,N,NJ
2,4,2014-08-25,0,F,7,1.0006,0.004,548,1.2433,N,...,15,18,-1,21,-1,11,-1,8,N,NJ
3,6,2013-04-15,0,J,10,0.9769,0.0004,1165,1.2665,N,...,6,5,-1,10,-1,9,-1,21,N,TX
4,8,2014-01-25,0,E,23,0.9472,0.0006,1487,1.3045,N,...,18,22,-1,10,-1,11,-1,12,N,IL


In [4]:
# Preview the first rows of the raw test frame.
test_df.head()

Unnamed: 0,QuoteNumber,Original_Quote_Date,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
0,3,2014-08-12,E,16,0.9364,0.0006,1487,1.3045,N,4,...,1,1,-1,1,-1,20,-1,25,Y,IL
1,5,2013-09-07,F,11,0.9919,0.0038,564,1.1886,N,8,...,10,10,-1,5,-1,5,-1,21,N,NJ
2,7,2013-03-29,F,15,0.8945,0.0038,564,1.067,N,11,...,10,11,-1,20,-1,22,-1,11,N,NJ
3,9,2015-03-21,K,21,0.887,0.0004,1113,1.2665,Y,14,...,8,8,-1,13,-1,8,-1,21,N,TX
4,10,2014-12-10,B,25,0.9153,0.0007,935,1.02,N,4,...,7,7,-1,3,-1,22,-1,21,N,CA


In [5]:
# Pull out the label vector and the test quote identifiers as NumPy arrays
# while those columns are still present in the frames.
train_target = train_df.loc[:, 'QuoteConversion_Flag'].values
test_index = test_df.loc[:, 'QuoteNumber'].values

In [6]:
# Remove the identifier (and, for the training frame, the label) so that
# only feature columns remain.
train_df = train_df.drop(['QuoteNumber', 'QuoteConversion_Flag'], axis='columns')
test_df = test_df.drop(['QuoteNumber'], axis='columns')

# Handle Dates

In [7]:
def _expand_quote_date(frame):
    """Replace ``Original_Quote_Date`` with numeric calendar features.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing an ``Original_Quote_Date`` column that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Original_Quote_Date`` and with three columns
        appended in this order: ``Year``, ``Month`` and ``weekday``
        (``weekday`` is pandas' ``dayofweek``, Monday=0 ... Sunday=6).
        The input frame is not modified.
    """
    quote_date = pd.to_datetime(frame['Original_Quote_Date'])
    expanded = frame.drop('Original_Quote_Date', axis=1)
    # Use the vectorised ``.dt`` accessors for every component instead of
    # slicing the string form of each timestamp row by row.
    expanded['Year'] = quote_date.dt.year
    expanded['Month'] = quote_date.dt.month
    expanded['weekday'] = quote_date.dt.dayofweek
    return expanded


# Apply the exact same transformation to both frames so their columns stay aligned.
train_df = _expand_quote_date(train_df)
test_df = _expand_quote_date(test_df)

In [8]:
# Check the training frame after date handling: Year, Month and weekday are the last columns.
train_df.head()

Unnamed: 0,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,CoverageField1B,CoverageField2A,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,weekday
0,B,23,0.9403,0.0006,965,1.02,N,17,23,17,...,8,-1,18,-1,10,N,CA,2013,8,4
1,F,7,1.0006,0.004,548,1.2433,N,6,8,6,...,11,-1,17,-1,20,N,NJ,2014,4,1
2,F,7,1.0006,0.004,548,1.2433,N,7,12,7,...,21,-1,11,-1,8,N,NJ,2014,8,0
3,J,10,0.9769,0.0004,1165,1.2665,N,3,2,3,...,10,-1,9,-1,21,N,TX,2013,4,0
4,E,23,0.9472,0.0006,1487,1.3045,N,8,13,8,...,10,-1,11,-1,12,N,IL,2014,1,5


In [9]:
# Check the test frame after date handling: Year, Month and weekday are the last columns.
test_df.head()

Unnamed: 0,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,CoverageField1B,CoverageField2A,...,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Year,Month,weekday
0,E,16,0.9364,0.0006,1487,1.3045,N,4,4,4,...,1,-1,20,-1,25,Y,IL,2014,8,1
1,F,11,0.9919,0.0038,564,1.1886,N,8,14,8,...,5,-1,5,-1,21,N,NJ,2013,9,5
2,F,15,0.8945,0.0038,564,1.067,N,11,18,11,...,20,-1,22,-1,11,N,NJ,2013,3,4
3,K,21,0.887,0.0004,1113,1.2665,Y,14,22,15,...,13,-1,8,-1,21,N,TX,2015,3,5
4,B,25,0.9153,0.0007,935,1.02,N,4,5,4,...,3,-1,22,-1,21,N,CA,2014,12,2


# Fill NA

In [10]:
# Replace every missing value in both frames with the sentinel -1.
train_df, test_df = (frame.fillna(-1) for frame in (train_df, test_df))

# Encode non-numeric columns

In [11]:
# Label-encode every string-typed (object dtype) column. Each encoder is
# fitted on the union of train and test values so that a category that
# appears in only one of the two frames still receives a code.
object_cols = [col for col in train_df.columns if train_df[col].dtype == 'object']
for col in object_cols:
    # fillna(-1) leaves the integer -1 inside an object column that had
    # missing values; cast to str so the encoder sees a single uniform type
    # (mixed str/int values cannot be sorted on Python 3).
    train_values = train_df[col].astype(str).values
    test_values = test_df[col].astype(str).values

    encoder = preprocessing.LabelEncoder()
    encoder.fit(np.concatenate([train_values, test_values]))
    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)

# Split off a validation dataset

In [12]:
# Hold out 20% of the labelled rows for validation. The split is driven by
# a RandomState seeded with 33 so that a fresh run reproduces it.
rng = np.random.RandomState(33)

split_parts = train_test_split(
    train_df,
    train_target,
    test_size=0.2,
    random_state=rng,
)
train_xgb, val_xgb, train_target_xgb, val_target_xgb = split_parts

In [13]:
# (rows, columns) of the full training feature frame before the split.
train_df.shape

(260753, 299)

In [14]:
# (rows, columns) of the training part of the split.
train_xgb.shape

(208602, 299)

In [15]:
# (rows, columns) of the held-out validation part (test_size=0.2).
val_xgb.shape

(52151, 299)

In [16]:
# The validation labels should have one entry per validation row.
val_target_xgb.shape

(52151,)

# Train using XGBoost classifier

In [17]:
# Wrap the training and validation splits, together with their labels,
# in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=train_xgb, label=train_target_xgb)
dval = xgb.DMatrix(data=val_xgb, label=val_target_xgb)

In [18]:
# Booster configuration: binary logistic objective, evaluated with AUC.
params = {
    'objective': "binary:logistic",
    'eval_metric': 'auc',
    'max_depth': 5,
    'eta': 0.1,
    'nthread': 2,
    'silent': 1,
}

# Number of boosting rounds and the datasets whose metric is reported
# after every round (training split first, then the validation split).
num_round = 200
watchlist = [(dtrain, 'train'), (dval, 'evals')]

gbm = xgb.train(params, dtrain, num_boost_round=num_round, evals=watchlist)

[0]	train-auc:0.923232	evals-auc:0.923092
[1]	train-auc:0.926509	evals-auc:0.926310
[2]	train-auc:0.931993	evals-auc:0.931905
[3]	train-auc:0.935362	evals-auc:0.935221
[4]	train-auc:0.936529	evals-auc:0.936445
[5]	train-auc:0.937500	evals-auc:0.937384
[6]	train-auc:0.937063	evals-auc:0.937061
[7]	train-auc:0.939114	evals-auc:0.939149
[8]	train-auc:0.939880	evals-auc:0.939795
[9]	train-auc:0.941126	evals-auc:0.941068
[10]	train-auc:0.941619	evals-auc:0.941638
[11]	train-auc:0.942034	evals-auc:0.941985
[12]	train-auc:0.942484	evals-auc:0.942429
[13]	train-auc:0.945010	evals-auc:0.945208
[14]	train-auc:0.945434	evals-auc:0.945623
[15]	train-auc:0.945645	evals-auc:0.945819
[16]	train-auc:0.945894	evals-auc:0.945998
[17]	train-auc:0.946892	evals-auc:0.947022
[18]	train-auc:0.947407	evals-auc:0.947592
[19]	train-auc:0.947899	evals-auc:0.948131
[20]	train-auc:0.948792	evals-auc:0.949086
[21]	train-auc:0.949619	evals-auc:0.949890
[22]	train-auc:0.950266	evals-auc:0.950474
[23]	train-auc:0.9507

# Predict

In [20]:
# Score the test frame with the trained booster; with the
# binary:logistic objective the predictions are probabilities.
dtest = xgb.DMatrix(data=test_df)
preds = gbm.predict(dtest)

In [21]:
# Build the submission table: one row per test quote with its predicted
# conversion score. The column order is pinned explicitly so the CSV layout
# does not depend on how the running Python/pandas version orders dict keys.
submit_df = pd.DataFrame(
    {"QuoteNumber": test_index, "QuoteConversion_Flag": preds},
    columns=["QuoteNumber", "QuoteConversion_Flag"],
)
submit_df.to_csv('homesite.csv', index=False)