In [9]:
import os

import numpy as np
import pandas as pd
import sklearn.cross_validation
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display, HTML

%matplotlib inline

In [10]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv', index_col='id')

# Preview the first rows of the labelled data.
display(train.head())

# Separate the label column (the "dependent" variable) from the remaining
# feature columns (the "independent" variables).
y_train = train["ACTION"]
X_train = train.drop("ACTION", axis=1)
print("Number of instances:{}".format(train.shape[0]))

from collections import Counter

def tally_predictions(predictions):
    """Print how many predictions differ from 1, then how many equal 1.

    Each element of ``predictions`` is compared with ``1``; every element
    that is not equal to ``1`` is counted in the ``0`` bucket. The two
    totals are printed on separate lines, the ``0`` bucket first.
    Nothing is returned.
    """
    buckets = Counter(1 if value == 1 else 0 for value in predictions)
    for label in (0, 1):
        # Counter yields 0 for a bucket that never occurred.
        print(buckets[label])

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325


Number of instances:32769


In [12]:
# NOTE(review): sklearn.cross_validation is the legacy location of these
# helpers; newer scikit-learn releases expose them from
# sklearn.model_selection instead -- confirm against the installed version.
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

# Hold out 33% of the labelled rows for validation; random_state=42 pins the split.
X_t, X_val, y_t, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42)
# Re-wrap the validation labels as a NumPy array.
y_val = np.array(y_val)

In [6]:
import xgboost as xgb

# Baseline: fit a single XGBClassifier with hand-picked settings on the
# training split, then score it on the held-out validation split.
xgmodel = xgb.XGBClassifier(n_estimators=2000, max_depth=10)
xgmodel.fit(X_t, y_t)

# Keep only the second predict_proba column
# (presumably P(ACTION == 1) -- confirm the class order).
preds = xgmodel.predict_proba(X_val)[:, 1]
print(preds)

# Last expression of the cell, so the notebook displays the validation ROC AUC.
roc_auc_score(y_true=y_val, y_score=preds)

[ 0.99938524  0.9999845   0.99974126 ...,  0.99933136  0.99996018
  0.99939835]


0.84019967107364113

In [13]:
# NOTE(review): sklearn.grid_search is the legacy home of the search classes;
# newer scikit-learn releases expose them from sklearn.model_selection --
# confirm against the installed version before upgrading.
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV

# Grid search XGB
# Every hyper-parameter lists exactly one candidate, so the "grid" is a single
# configuration that gets scored with 10-fold cross-validation.
parameters = dict(
    max_depth=[8],
    learning_rate=[0.3],
    n_estimators=[155],
    min_child_weight=[0.6],
    colsample_bytree=[0.5],
)

xg_clf = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
)
xg_clf.fit(X_train, y_train)

print("Best parameter set found on development set with cv=10:\n")
print(xg_clf.best_params_)
print()
# Report each configuration as "mean (+/- two standard deviations of the
# per-fold scores)" followed by the parameter dict.
for params, mean_score, scores in xg_clf.grid_scores_:
    print("{0:.3f} (+/-{1:.03f}) for {2}".format(mean_score, 2 * scores.std(), params))
print()

Best parameter set found on development set with cv=10:

{'max_depth': 8, 'n_estimators': 155, 'colsample_bytree': 0.5, 'min_child_weight': 0.6, 'learning_rate': 0.3}

0.863 (+/-0.024) for {'max_depth': 8, 'n_estimators': 155, 'colsample_bytree': 0.5, 'min_child_weight': 0.6, 'learning_rate': 0.3}



Best so far:
    
0.867 (+/-0.027) for {'min_child_weight': 0.6, 'colsample_bytree': 0.5, 'n_estimators': 155, 'learning_rate': 0.3, 'max_depth': 8}

In [14]:
# Score the competition test set with the fitted grid-search model; keep the
# second predict_proba column (presumably P(ACTION == 1) -- confirm class order).
preds = xg_clf.predict_proba(test)[:, 1]

# Spit out predictions to a file
submission_path = 'output/xgboost_best_model.csv'
# Create output/ on first use so the write cannot fail on a missing directory
# after the (slow) model fit has already completed.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# Pair every probability with the id its row carries in test.csv (the frame is
# read with index_col='id') instead of assuming the ids are exactly 1..N in
# file order.
pred_write = zip(test.index, preds)
with open(submission_path, 'w') as f:
    f.write('Id,Action\n')
    for instance, prediction in pred_write:
        f.write('{},{}\n'.format(instance, prediction))