In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("/home/mizworski/Repos/xgboost/python-package/")
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.utils.validation import column_or_1d
from scipy import interp

In [2]:
date_parse_format = '%Y-%m-%d'


def preprocess_data(data):
    categorical_cols = list(set(data.columns) - set(['birth_date', 'contact_date', 'pdays', 'campaign', 'y']))
    
    X = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
    X['contacted'] = pd.get_dummies(X.pdays < 999, drop_first=True)
    now = datetime.strptime(X.contact_date.max(), date_parse_format)
    X['age'] = X.apply(lambda r: int((now - datetime.strptime(r.birth_date, date_parse_format)).days / 365.25), axis=1)
    X.drop('birth_date', axis=1, inplace=True)
    X['days_since_contact'] =  X.apply(lambda r: int((now - datetime.strptime(r.contact_date, date_parse_format)).days), axis=1)
    X.drop('contact_date', axis=1, inplace=True)
    
    train_x = X[X.y != 'unknown']
    test_x = X[X.y == 'unknown']
    train_y = train_x['y']
    train_y = pd.get_dummies(train_y, drop_first=True)
    train_x = train_x.drop('y', axis=1)
    test_x = test_x.drop('y', axis=1)
    
#     train_y = column_or_1d(train_y)
#     train_x = train_x.as_matrix()
#     test_x = test_x.as_matrix()
    
    return train_x, train_y, test_x
    

In [3]:
df = pd.read_csv('../data/bank-classification.csv', index_col=0)
df = df.sample(frac=1)

In [4]:
X_train, Y_train, X_test = preprocess_data(df)

In [5]:
X_train.columns

Index(['campaign', 'pdays', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'default_unknown', 'default_yes', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'housing_unknown', 'housing_yes', 'loan_unknown',
       'loan_yes', 'previous_1', 'previous_2', 'previous_3', 'previous_4',
       'previous_5', 'previous_6', 'previous_7', 'poutcome_nonexistent',
       'poutcome_success', 'marital_married', 'marital_single',
       'marital_unknown', 'contact_telephone', 'contacted', 'age',
       'days_since_contact'],
      dtype='object')

In [6]:
X_train['houseXmarried'] = X_train.housing_yes * X_train.marital_married
X_train['uniXmarried'] = X_train['education_university.degree'] * X_train.marital_married
X_train['highschoolXmarried'] = X_train['education_high.school'] * X_train.marital_married
X_train['job_high_paid'] = X_train.job_entrepreneur + X_train.job_management + X_train.job_technician
X_train['campaignXdays_since_contact']= X_train.campaign * X_train.days_since_contact
X_train['campaignXpoutcome_success']= X_train.campaign * X_train.poutcome_success

In [15]:
from scipy import stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [16]:

X_train = X_train.as_matrix()
Y_train = column_or_1d(Y_train)

AttributeError: 'numpy.ndarray' object has no attribute 'as_matrix'

In [17]:
from numpy import random

In [18]:
clf_xgb = XGBClassifier(objective = 'binary:logistic')
param_dist = {'n_estimators': stats.randint(200, 1000),
              'learning_rate': np.arange(0.01, 0.7, 0.01),
              'subsample': np.arange(0.95, 1.0, 0.001),
              'max_depth': [3, 4, 5, 6, 7, 9],
              'colsample_bytree': np.arange(0.95, 1.0, 0.001),
              'min_child_weight': [1, 2, 3, 4]
             }
param_dist = {'n_estimators': stats.randint(200, 1000),
              'learning_rate': 10 ** np.arange(-3, -1, 0.01),
              'subsample': [1],
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': [1],
              'min_child_weight': [1, 2, 3],
              'gamma' : np.arange(0, 1.488, 0.01),
              'reg_alpha':np.arange(0, 1, 0.01), 
              'reg_lambda':np.arange(0, 1, 0.01)
             }

# numFolds = 5
# kfold_5 = cross_validation.KFold(n = len(X), shuffle = True, n_folds = numFolds)

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = 6,  
                         n_iter = 25, # you want 5 here not 25 if I understand you correctly 
                         scoring = 'roc_auc', 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1)

In [19]:
X_train.shape

(20799, 48)

In [20]:
Y_train.shape

(20799,)

In [21]:
clf.fit(X_train, Y_train)

Fitting 6 folds for each of 25 candidates, totalling 150 fits
[CV] colsample_bytree=1, gamma=0.54, learning_rate=0.0013803842646, max_depth=3, min_child_weight=1, n_estimators=823, reg_alpha=0.84, reg_lambda=0.33, subsample=1 
[CV] colsample_bytree=1, gamma=0.54, learning_rate=0.0013803842646, max_depth=3, min_child_weight=1, n_estimators=823, reg_alpha=0.84, reg_lambda=0.33, subsample=1 
[CV] colsample_bytree=1, gamma=0.54, learning_rate=0.0013803842646, max_depth=3, min_child_weight=1, n_estimators=823, reg_alpha=0.84, reg_lambda=0.33, subsample=1 
[CV] colsample_bytree=1, gamma=0.54, learning_rate=0.0013803842646, max_depth=3, min_child_weight=1, n_estimators=823, reg_alpha=0.84, reg_lambda=0.33, subsample=1 
[CV] colsample_bytree=1, gamma=0.54, learning_rate=0.0013803842646, max_depth=3, min_child_weight=1, n_estimators=823, reg_alpha=0.84, reg_lambda=0.33, subsample=1 
[CV] colsample_bytree=1, gamma=0.54, learning_rate=0.0013803842646, max_depth=3, min_child_weight=1, n_estimators

KeyboardInterrupt: 

In [102]:

#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

test_probs = clf.predict_proba(X_test.as_matrix())[:,1]

Raw AUC score: 0.789172634049
colsample_bytree: 1
gamma: 0.02
learning_rate: 0.017782794100388136
max_depth: 4
min_child_weight: 1
n_estimators: 264
reg_alpha: 0.029999999999999999
reg_lambda: 0.46000000000000002
subsample: 1




ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41']
expected f46, f47, f45, f42, f44, f43 in input data

In [None]:
clf.grid_scores_

In [103]:
param_dist = {'n_estimators': stats.randint(200, 1000),
              'learning_rate': 10 ** np.arange(-3, -2, 0.01),
              'subsample': [1],
              'max_depth': [4, 5, 6, 7],
              'colsample_bytree': [1],
              'min_child_weight': [1, 2, 3, 4],
              'gamma' : np.arange(1.1, 3.488, 0.01),
              'reg_alpha':np.arange(0.2, 1, 0.01), 
              'reg_lambda':np.arange(0.6, 1, 0.01)
             }

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = 6,  
                         n_iter = 25, # you want 5 here not 25 if I understand you correctly 
                         scoring = 'roc_auc', 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1)
clf.fit(X_train, Y_train)

#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

test_probs = clf.predict_proba(X_test.as_matrix())[:,1]

Fitting 6 folds for each of 25 candidates, totalling 150 fits
[CV] colsample_bytree=1, gamma=2.21, learning_rate=0.00251188643151, max_depth=5, min_child_weight=4, n_estimators=249, reg_alpha=0.21, reg_lambda=0.85, subsample=1 
[CV] colsample_bytree=1, gamma=2.21, learning_rate=0.00251188643151, max_depth=5, min_child_weight=4, n_estimators=249, reg_alpha=0.21, reg_lambda=0.85, subsample=1 
[CV] colsample_bytree=1, gamma=2.21, learning_rate=0.00251188643151, max_depth=5, min_child_weight=4, n_estimators=249, reg_alpha=0.21, reg_lambda=0.85, subsample=1 
[CV] colsample_bytree=1, gamma=2.21, learning_rate=0.00251188643151, max_depth=5, min_child_weight=4, n_estimators=249, reg_alpha=0.21, reg_lambda=0.85, subsample=1 
[CV] colsample_bytree=1, gamma=2.21, learning_rate=0.00251188643151, max_depth=5, min_child_weight=4, n_estimators=249, reg_alpha=0.21, reg_lambda=0.85, subsample=1 
[CV] colsample_bytree=1, gamma=2.21, learning_rate=0.00251188643151, max_depth=5, min_child_weight=4, n_esti

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   45.1s


[CV]  colsample_bytree=1, gamma=2.47, learning_rate=0.0056234132519, max_depth=4, min_child_weight=3, n_estimators=877, reg_alpha=0.61, reg_lambda=0.98, subsample=1, score=0.792875, total=  30.2s
[CV] colsample_bytree=1, gamma=2.84, learning_rate=0.00239883291902, max_depth=7, min_child_weight=2, n_estimators=636, reg_alpha=0.82, reg_lambda=0.79, subsample=1 
[CV]  colsample_bytree=1, gamma=2.47, learning_rate=0.0056234132519, max_depth=4, min_child_weight=3, n_estimators=877, reg_alpha=0.61, reg_lambda=0.98, subsample=1, score=0.799096, total=  31.9s
[CV] colsample_bytree=1, gamma=2.84, learning_rate=0.00239883291902, max_depth=7, min_child_weight=2, n_estimators=636, reg_alpha=0.82, reg_lambda=0.79, subsample=1 
[CV]  colsample_bytree=1, gamma=2.55, learning_rate=0.00109647819614, max_depth=5, min_child_weight=1, n_estimators=916, reg_alpha=0.22, reg_lambda=0.71, subsample=1, score=0.771768, total=  39.9s
[CV] colsample_bytree=1, gamma=2.84, learning_rate=0.00239883291902, max_depth=

[CV]  colsample_bytree=1, gamma=2.02, learning_rate=0.00346736850453, max_depth=6, min_child_weight=2, n_estimators=577, reg_alpha=0.66, reg_lambda=0.63, subsample=1, score=0.797749, total=  37.1s
[CV] colsample_bytree=1, gamma=3.44, learning_rate=0.00338844156139, max_depth=7, min_child_weight=2, n_estimators=719, reg_alpha=0.6, reg_lambda=0.71, subsample=1 
[CV]  colsample_bytree=1, gamma=2.02, learning_rate=0.00346736850453, max_depth=6, min_child_weight=2, n_estimators=577, reg_alpha=0.66, reg_lambda=0.63, subsample=1, score=0.798132, total=  34.8s
[CV] colsample_bytree=1, gamma=2.14, learning_rate=0.0036307805477, max_depth=7, min_child_weight=2, n_estimators=758, reg_alpha=0.32, reg_lambda=0.75, subsample=1 
[CV]  colsample_bytree=1, gamma=2.02, learning_rate=0.00346736850453, max_depth=6, min_child_weight=2, n_estimators=577, reg_alpha=0.66, reg_lambda=0.63, subsample=1, score=0.792148, total=  36.6s
[CV] colsample_bytree=1, gamma=2.14, learning_rate=0.0036307805477, max_depth=7

[CV]  colsample_bytree=1, gamma=2.12, learning_rate=0.00575439937337, max_depth=5, min_child_weight=3, n_estimators=519, reg_alpha=0.52, reg_lambda=0.67, subsample=1, score=0.798694, total=  24.2s
[CV] colsample_bytree=1, gamma=2.46, learning_rate=0.00954992586021, max_depth=5, min_child_weight=1, n_estimators=861, reg_alpha=0.82, reg_lambda=0.8, subsample=1 
[CV]  colsample_bytree=1, gamma=3.48, learning_rate=0.00478630092323, max_depth=7, min_child_weight=1, n_estimators=618, reg_alpha=0.55, reg_lambda=0.95, subsample=1, score=0.797238, total=  43.4s
[CV] colsample_bytree=1, gamma=2.46, learning_rate=0.00954992586021, max_depth=5, min_child_weight=1, n_estimators=861, reg_alpha=0.82, reg_lambda=0.8, subsample=1 
[CV]  colsample_bytree=1, gamma=3.48, learning_rate=0.00478630092323, max_depth=7, min_child_weight=1, n_estimators=618, reg_alpha=0.55, reg_lambda=0.95, subsample=1, score=0.793806, total=  43.8s
[CV] colsample_bytree=1, gamma=2.3, learning_rate=0.00125892541179, max_depth=7

[CV]  colsample_bytree=1, gamma=3.39, learning_rate=0.00199526231497, max_depth=5, min_child_weight=1, n_estimators=901, reg_alpha=0.84, reg_lambda=0.7, subsample=1, score=0.784912, total=  42.5s
[CV] colsample_bytree=1, gamma=3.02, learning_rate=0.0030199517204, max_depth=7, min_child_weight=3, n_estimators=477, reg_alpha=0.4, reg_lambda=0.73, subsample=1 
[CV]  colsample_bytree=1, gamma=3.39, learning_rate=0.00199526231497, max_depth=5, min_child_weight=1, n_estimators=901, reg_alpha=0.84, reg_lambda=0.7, subsample=1, score=0.772090, total=  44.5s
[CV] colsample_bytree=1, gamma=3.02, learning_rate=0.0030199517204, max_depth=7, min_child_weight=3, n_estimators=477, reg_alpha=0.4, reg_lambda=0.73, subsample=1 
[CV]  colsample_bytree=1, gamma=3.39, learning_rate=0.00199526231497, max_depth=5, min_child_weight=1, n_estimators=901, reg_alpha=0.84, reg_lambda=0.7, subsample=1, score=0.792895, total=  44.5s
[CV] colsample_bytree=1, gamma=3.02, learning_rate=0.0030199517204, max_depth=7, min

[CV]  colsample_bytree=1, gamma=2.84, learning_rate=0.00851138038202, max_depth=5, min_child_weight=4, n_estimators=287, reg_alpha=0.26, reg_lambda=0.84, subsample=1, score=0.774656, total=  12.0s
[CV] colsample_bytree=1, gamma=2.84, learning_rate=0.00851138038202, max_depth=5, min_child_weight=4, n_estimators=287, reg_alpha=0.26, reg_lambda=0.84, subsample=1 
[CV]  colsample_bytree=1, gamma=2.84, learning_rate=0.00851138038202, max_depth=5, min_child_weight=4, n_estimators=287, reg_alpha=0.26, reg_lambda=0.84, subsample=1, score=0.770274, total=  12.5s
[CV] colsample_bytree=1, gamma=2.84, learning_rate=0.00851138038202, max_depth=5, min_child_weight=4, n_estimators=287, reg_alpha=0.26, reg_lambda=0.84, subsample=1 
[CV]  colsample_bytree=1, gamma=2.84, learning_rate=0.00851138038202, max_depth=5, min_child_weight=4, n_estimators=287, reg_alpha=0.26, reg_lambda=0.84, subsample=1, score=0.796371, total=  14.1s
[CV] colsample_bytree=1, gamma=2.84, learning_rate=0.00851138038202, max_dept

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  8.8min


[CV]  colsample_bytree=1, gamma=1.48, learning_rate=0.00588843655356, max_depth=7, min_child_weight=2, n_estimators=912, reg_alpha=0.52, reg_lambda=0.82, subsample=1, score=0.775519, total= 1.1min
[CV] colsample_bytree=1, gamma=2.36, learning_rate=0.00275422870334, max_depth=6, min_child_weight=2, n_estimators=723, reg_alpha=0.57, reg_lambda=0.61, subsample=1 
[CV]  colsample_bytree=1, gamma=1.48, learning_rate=0.00588843655356, max_depth=7, min_child_weight=2, n_estimators=912, reg_alpha=0.52, reg_lambda=0.82, subsample=1, score=0.770426, total= 1.0min
[CV] colsample_bytree=1, gamma=2.36, learning_rate=0.00275422870334, max_depth=6, min_child_weight=2, n_estimators=723, reg_alpha=0.57, reg_lambda=0.61, subsample=1 
[CV]  colsample_bytree=1, gamma=2.84, learning_rate=0.00851138038202, max_depth=5, min_child_weight=4, n_estimators=287, reg_alpha=0.26, reg_lambda=0.84, subsample=1, score=0.798842, total=  14.8s
[CV] colsample_bytree=1, gamma=2.36, learning_rate=0.00275422870334, max_dept

[CV]  colsample_bytree=1, gamma=2.26, learning_rate=0.00229086765277, max_depth=4, min_child_weight=3, n_estimators=640, reg_alpha=0.56, reg_lambda=0.76, subsample=1, score=0.768621, total=  24.5s
[CV] colsample_bytree=1, gamma=2.26, learning_rate=0.00229086765277, max_depth=4, min_child_weight=3, n_estimators=640, reg_alpha=0.56, reg_lambda=0.76, subsample=1 
[CV]  colsample_bytree=1, gamma=2.17, learning_rate=0.00288403150313, max_depth=7, min_child_weight=1, n_estimators=647, reg_alpha=0.23, reg_lambda=0.8, subsample=1, score=0.796962, total=  47.0s
[CV] colsample_bytree=1, gamma=3.05, learning_rate=0.0063095734448, max_depth=5, min_child_weight=2, n_estimators=820, reg_alpha=0.26, reg_lambda=0.96, subsample=1 
[CV]  colsample_bytree=1, gamma=2.17, learning_rate=0.00288403150313, max_depth=7, min_child_weight=1, n_estimators=647, reg_alpha=0.23, reg_lambda=0.8, subsample=1, score=0.783733, total=  49.1s
[CV] colsample_bytree=1, gamma=3.05, learning_rate=0.0063095734448, max_depth=5,

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 11.5min finished


Raw AUC score: 0.789379398328
colsample_bytree: 1
gamma: 3.0500000000000016
learning_rate: 0.0063095734448016848
max_depth: 5
min_child_weight: 2
n_estimators: 820
reg_alpha: 0.26000000000000006
reg_lambda: 0.9600000000000003
subsample: 1




ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41']
expected f46, f47, f45, f42, f44, f43 in input data

In [None]:
test_probs

In [None]:
test_indices = X_test.index.values
test_indices

In [None]:
results = pd.DataFrame({'id':test_indices, 'y':test_probs})

In [None]:
results.head()

In [None]:
results_sorted = results.sort_values(['id'])

In [None]:
results_sorted.head()

In [None]:
test_indices = X_test.index.values
test_indices
results = pd.DataFrame({'id':test_indices, 'y':test_probs})
results_sorted = results.sort_values(['id'])
results_sorted.to_csv('test_submition2.csv', index=False)