In [2]:
import pandas as pd
import os

# Build the dataset paths with os.path.join so the separators are correct on
# every platform instead of hand-concatenating '/'-delimited strings.
data_dir = os.path.join(os.getcwd(), 'data', 'BNP_data')
train_data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test_data = pd.read_csv(os.path.join(data_dir, 'test.csv'))

# Fill NA values: most frequent value for object columns, median for all other columns
from sklearn.base import TransformerMixin
import numpy as np

class DataFrameImputer(TransformerMixin):
    """Impute missing values column by column.

    Columns of dtype ``object`` are filled with their most frequent value;
    every other column is filled with its median.
    """

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns supply the fill statistics.
        y : ignored
            Unused; kept in the signature for scikit-learn API compatibility.

        Returns
        -------
        DataFrameImputer
            ``self``, with the per-column fill values stored in ``self.fill``
            (a Series indexed by ``X.columns``).
        """
        self.fill = pd.Series(
            [self._fill_value(X[c]) for c in X],
            index=X.columns)
        return self

    @staticmethod
    def _fill_value(column):
        """Return the value used to replace missing entries of ``column``."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() drops NaN, so a column with no observed values
            # gives an empty result and counts.index[0] would raise
            # IndexError. Fall back to NaN, which leaves the column as it is.
            return counts.index[0] if len(counts) else np.nan
        return column.median()

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the fitted fill values.

        ``fit`` must have been called first so that ``self.fill`` exists.
        """
        return X.fillna(self.fill)
    
# Learn the fill values from the training set only and reuse them for the test
# set, so both frames are imputed with the same statistics. Fitting a second
# imputer on the test set would fill the same column with different values in
# train and test.
# Fill entries for columns that the test frame does not have (e.g. 'target')
# are skipped by DataFrame.fillna.
imputer = DataFrameImputer().fit(train_data)
train_data = imputer.transform(train_data)
test_data = imputer.transform(test_data)

# Convert string values to numeric
from sklearn import preprocessing

# Names of the columns stored as Python objects (strings) in the training set.
criteria = train_data.dtypes == object
object_columns = criteria.index[criteria]

le = preprocessing.LabelEncoder()
for col in object_columns:
    # Fit on the values of BOTH frames so that a given string maps to the same
    # integer in train and test. Calling fit_transform separately on each
    # frame assigns codes from each frame's own sorted set of values, so the
    # same category could receive different codes and the model would see
    # inconsistent features at prediction time.
    le.fit(pd.concat([train_data[col], test_data[col]]))
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

# Cross-validation setup: split the training frame into features (X_train) and target (Y_train)
from sklearn import cross_validation
# Pull out the label column, then keep every remaining column as a predictor.
Y_train = train_data['target']
X_train = train_data.drop(labels=['target'], axis=1)

In [6]:
import xgboost as xgb

# Booster hyper-parameters for this run.
eta = 0.01
max_depth = 10
num_rounds = 1200
subsample = 0.8
colsample_bytree = 1

# Format the matrices for xgb
dtrain = xgb.DMatrix(X_train, label=Y_train)
# The test set has no labels; attaching Y_train (the *training* labels, with a
# different number of rows) to it is wrong, so build it from the features only.
# NOTE(review): X_train and test_data both still contain the 'ID' column, so it
# is used as a feature — confirm that this is intended.
dtest = xgb.DMatrix(test_data)

# 'watchlist' is not a booster parameter, so it is not part of this dict; the
# evaluation set is passed to xgb.train through `evals` instead.
xgb_params = {
    'objective': 'binary:logistic',
    'eta': eta,
    'max_depth': max_depth,
    'eval_metric': 'logloss',
    'colsample_bytree': colsample_bytree,
    'subsample': subsample,
    'nthread': 8,
}

# Report the training logloss every 100 rounds rather than on every round.
gbdt = xgb.train(xgb_params, dtrain, num_rounds,
                 evals=[(dtrain, 'train')], verbose_eval=100)

# Get the predicted values for the test data, bounded to the [0, 1] range.
Y_test_pred = gbdt.predict(dtest).clip(0, 1)

In [7]:
# Write the submission file: the test IDs next to their predicted probability.
# NOTE(review): the header is written as "id" while the source column is 'ID' —
# confirm which spelling the submission format expects.
id_test = test_data['ID']
submission = pd.DataFrame({"id": id_test, "PredictedProb": Y_test_pred})
submission.to_csv('bnp_submission_3.csv', index=False)

Submission 2 - Tuned parameters

In [None]:
# Booster hyper-parameters for this run.
eta = 0.01
max_depth = 5
num_rounds = 400

# NOTE(review): 'reg:linear' is a regression objective while the metric is
# 'logloss' and the other training cell uses 'binary:logistic' for the same
# target — confirm that the regression objective is intended here.
xgb_params = {'objective': 'reg:linear', 'eta': eta, 'max_depth': max_depth, 'seed': 42, 'eval_metric': 'logloss'}

# Format the matrices for xgb
dtrain = xgb.DMatrix(X_train, label=Y_train)
# The test set has no labels; attaching Y_train (the *training* labels, with a
# different number of rows) to it is wrong, so build it from the features only.
dtest = xgb.DMatrix(test_data)
gbdt = xgb.train(xgb_params, dtrain, num_rounds)

# Get the predicted values for the test data. The regression objective does
# not bound its output, so clamp the predictions into the [0, 1] range.
Y_test_pred = gbdt.predict(dtest).clip(0, 1)