In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import graphviz
import seaborn as sns
import tqdm
import time
%matplotlib inline

import sklearn
import sklearn.model_selection  # explicit: `import sklearn` alone does not guarantee the submodule is loaded
from sklearn.metrics import accuracy_score
import xgboost as xgb

%load_ext watermark
%load_ext blackcellmagic

In [2]:
# Load the data.
# train.csv is parsed once; the feature frame and the label frame are then
# derived from it by column name instead of re-reading the file and picking the
# label by its position.
# NOTE(review): assumes the column previously selected by position 27 is "y" - confirm.
DATA_DIR = "../CS155_PROJECT1/Data/caltech-cs155-2020"
LABEL_COLUMN = "y"

df_train_full = pd.read_csv(f"{DATA_DIR}/train.csv", index_col=0)
df_train = df_train_full.drop(columns=[LABEL_COLUMN])  # features only
df_train_y = df_train_full[[LABEL_COLUMN]]  # one-column DataFrame, same shape as before
df_test = pd.read_csv(f"{DATA_DIR}/test.csv", index_col=0)

# Split into training and validation sets.
# shuffle=False keeps row order: the first 80% of rows train, the last 20% validate.
X_train, X_validate, y_train, y_validate = sklearn.model_selection.train_test_split(
    df_train, df_train_y, test_size=0.2, shuffle=False
)

In [3]:
# Load the data into XGBoost's DMatrix format
# Labelled matrices: the train/validation split plus the complete training set.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalidate = xgb.DMatrix(data=X_validate, label=y_validate)
dtrainfull = xgb.DMatrix(data=df_train, label=df_train_y)

# Unlabelled matrix for the rows we have to predict.
dtest = xgb.DMatrix(data=df_test)

In [4]:
# Baseline booster configuration; the grid search later overrides the tunable entries.
params = dict(
    # Tunable hyperparameters.
    max_depth=5,
    min_child_weight=0,
    eta=0.01,
    subsample=1,
    colsample_bytree=1,
    gamma=0,
    # Fixed settings: binary classification scored by AUC.
    objective='binary:logistic',
    eval_metric='auc',
)

# Upper bound on boosting rounds; set high so early stopping, not this cap, ends training.
num_boost_round = 2000

# Datasets reported on during training, each paired with the name shown in the log.
evals = list(zip((dtrain, dvalidate), ('train', 'validate')))

#### Train a model with the initial parameters as a baseline.

In [None]:
# Baseline run with the initial parameters; stops once the last eval set
# has not improved for 10 consecutive rounds.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=10,
)

#### Build the grid of every hyperparameter combination to evaluate.

In [7]:
# Cartesian product of the candidate values, one tuple per combination, ordered as
# (max_depth, min_child_weight, eta, subsample, colsample, gamma).
gridsearch_params = [
    (depth, child_weight, learning_rate, row_fraction, column_fraction, split_penalty)
    for depth in (3, 4, 5, 6, 7)
    for child_weight in (0, 1)
    for learning_rate in (0.2, 0.1, 0.05, 0.01)
    for row_fraction in (0.8, 0.9, 1.0)
    for column_fraction in (0.8, 0.9, 1.0)
    for split_penalty in (0.1, 0.0)
]

#### Cross-validate every combination and keep the one with the best mean AUC.

In [None]:
# Exhaustive grid search: score every combination in `gridsearch_params` with
# 10-fold cross-validation on `dtrain` and remember the one with the highest
# mean test AUC.
max_auc = float(0)
best_params = None
for max_depth, min_child_weight, eta, subsample, colsample, gamma in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}, eta={}, subsample={}, colsample={}, gamma={}".format(
                             max_depth,
                             min_child_weight,
                             eta,
                             subsample,
                             colsample,
                             gamma))

    # Build a per-iteration copy rather than mutating the shared `params` dict,
    # so `params` is not left holding whichever combination was evaluated last.
    cv_params = {
        **params,
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'eta': eta,
        'subsample': subsample,
        'colsample_bytree': colsample,
        'gamma': gamma,
    }

    # Run CV
    cv_results = xgb.cv(
        cv_params,
        dtrain,
        num_boost_round=300,
        seed=28,
        nfold=10,
        metrics={'auc'},
        early_stopping_rounds=7
    )

    # Update best AUC
    mean_auc = cv_results['test-auc-mean'].max()
    # idxmax() returns the row label of the best round; with the default 0-based
    # index the number of rounds trained is one more than that label.
    boost_rounds = cv_results['test-auc-mean'].idxmax() + 1
    print("\tAUC {} for {} rounds". format(mean_auc, boost_rounds))
    if mean_auc > max_auc:
        max_auc = mean_auc
        best_params = (max_depth,min_child_weight,eta,subsample,colsample,gamma)

# Fail with a clear message instead of a TypeError on `None` when nothing won.
if best_params is None:
    raise RuntimeError("Grid search found no parameter combination with a mean AUC above 0.")
print("Best params: max_depth={}, min_child_weight={}, eta={}, subsample={}, colsample={}, gamma={} AUC: {}".format(*best_params, max_auc))

In [124]:
# Write the winning grid-search values back into the shared params dict.
# `best_params` is ordered as
# (max_depth, min_child_weight, eta, subsample, colsample, gamma).
params.update(
    max_depth=best_params[0],
    min_child_weight=best_params[1],
    eta=best_params[2],
    subsample=best_params[3],
    colsample_bytree=best_params[4],
    gamma=best_params[5],
)

In [None]:
# Train with the tuned parameters, stopping once the last eval set
# has gone 20 rounds without improving.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=20,
)

#### Train a model with the new params and exactly the right number of iterations to see its performance on the validation set.

In [None]:
# Retrain for exactly the number of rounds that early stopping selected.
# `best_iteration` is a 0-based round index, hence the +1.
# The count gets its own name so the `num_boost_round` ceiling used by the
# early-stopping run is not overwritten; otherwise re-running that cell after
# this one would silently cap training at the previous best round.
best_num_boost_round = model.best_iteration + 1

best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_num_boost_round,
    evals=evals
)

#### Is the above AUC score correct?

In [None]:
# Score the held-out validation rows with the final model.
ypred = best_model.predict(data=dvalidate)

In [None]:
# Recompute the validation AUC independently with scikit-learn and print it as a percentage.
roc = sklearn.metrics.roc_auc_score(y_true=y_validate, y_score=ypred)
roc_percent = roc * 100
print("AUC: %.4f%% " % roc_percent)

#### Predict!!

In [162]:
# Score the test rows with the final model.
predictions = best_model.predict(data=dtest)

# Kaggle expects two columns: the row id (the test frame's index) and the prediction.
submission = pd.DataFrame(dict(id=df_test.index, Predicted=predictions))

In [165]:
# Eyeball the first five rows to confirm the submission layout.
submission.head(n=5)

Unnamed: 0,id,Predicted
0,592380,0.52899
1,592381,0.342812
2,592382,0.395533
3,592383,0.577469
4,592384,0.345996


In [33]:
# Write the submission without the DataFrame index, leaving only the id and Predicted columns.
submission.to_csv(
    path_or_buf="XGboost_model2_combinatorial_optimizer_submission1.csv",
    index=False,
)

#### Save the model in case we want to use it later.

In [None]:
# Persist the final booster to disk in the working directory.
# NOTE(review): the on-disk format may depend on the file extension in newer XGBoost releases - confirm.
best_model.save_model(fname="best_model.model")