In [1]:
import pandas as pd
import numpy as np

# Import dataset

In [2]:
train = pd.read_csv('../data/voting outcomes/train2016.csv')

# Extract target vector and drop useless data

In [3]:
target = train['Party']

In [4]:
train = train.drop(['Party','USER_ID'], axis=1)

# Dummy missing values imputation

In [5]:
train = train.fillna('NoData')

# Convert YOB to age

In [6]:
def convert_to_age(age):
    if age != 'NoData':
        return 2016 - age
    else:
        return 'NoData'

train['YOB'] = train['YOB'].apply(convert_to_age)

# Aggeregate age to age groups (numerical -> ordinal)

In [7]:
age_borders = [18, 27, 35, 50, 65, 80]

In [8]:
def return_border(age, borders):
    i = 0
    while i < len(borders):
        if age >= borders[i]:
            i += 1
        else:
            return str(i)
    
    return str(i)
            

In [9]:
mask = train['YOB']!='NoData'

train['YOB'][mask] = train['YOB'][mask].apply(return_border, borders = age_borders)

# One hot encoding for all columns

In [10]:
train = pd.get_dummies(train)

# Binarize target variable

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
lb = LabelEncoder()

In [13]:
target_binary = lb.fit_transform(target)

# Simple xgboost for classification:)

In [41]:
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score

In [None]:
y = pd.Series(target_binary)
skf = StratifiedKFold(y.values, n_folds=10, random_state=1488)

In [44]:
# Cross validated xgboost 

def xgboostcv(eta,
              subsample,
              colsample_bytree,
              min_child_weight,
              max_depth,
              seed = 1488,
              verbose = False):
    
    params = {}
    params["objective"] = "binary:logistic"
    params["eta"] = eta
    params["min_child_weight"] = min_child_weight
    params["subsample"] = subsample
    params["colsample_bytree"] = colsample_bytree
    params["silent"] = 1
    params["max_depth"] = int(max_depth)
    params["eval_metric"] = "error"
    params['seed'] = seed
    results = []
    
    for (i, (train_index, test_index)) in enumerate(skf):
        xg_train = xgb.DMatrix(train.ix[train_index], label=y.ix[train_index] , missing=np.nan)
        xg_test = xgb.DMatrix(train.ix[test_index], label=y.ix[test_index], missing=np.nan)
        watchlist = [(xg_train,'train'), (xg_test, 'test')]
        bst = xgb.train(params, xg_train, num_round, watchlist, early_stopping_rounds=30, verbose_eval=False)
        if verbose:
            print 'Accuracy on fold {num}: {acc}'.format(num=i, acc=(1 - bst.best_score))
        results.append(bst.best_score)

    # bayes_opt use maximize() method, so we invert our target metric 
    # to convert task from minimization to maximization 
    return -np.mean(results)
    
    

# Manually selected params

In [46]:
avg_acc = xgboostcv(eta=0.01, 
                    subsample=0.9, 
                    colsample_bytree=0.8, 
                    min_child_weight=1, 
                    max_depth=6, 
                    verbose=True)

Accuracy on fold 0: 0.639785
Accuracy on fold 1: 0.660682
Accuracy on fold 2: 0.64632
Accuracy on fold 3: 0.637343
Accuracy on fold 4: 0.640934
Accuracy on fold 5: 0.639138
Accuracy on fold 6: 0.630162
Accuracy on fold 7: 0.636691
Accuracy on fold 8: 0.622302
Accuracy on fold 9: 0.564748


In [47]:
print "Average accuracy: {acc}".format(acc = (1 + avg_acc))

Average accuracy: 0.6318105


# Bayesian Optimization Intro

In [18]:
from bayes_opt import BayesianOptimization

## Define search space

In [48]:
search_space = {'eta': (0.01, 0.1),
                'subsample': (0.6, 0.95),
                'min_child_weight': (1, 4),
                'colsample_bytree': (0.6, 0.95),
                'max_depth': (3, 12)
    
}

## Run N_ITER number of iterations to find optimal parameters

In [49]:
N_ITER = 50

In [50]:
xgbBO = BayesianOptimization(xgboostcv, search_space)

xgbBO.maximize(n_iter=N_ITER)

print('-'*53)
print('Final Results')
print('Average accuracy: %f' % (1+xgbBO.res['max']['max_val']))

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |       eta |   max_depth |   min_child_weight |   subsample | 
    1 | 00m21s | [35m  -0.36693[0m | [32m            0.6431[0m | [32m   0.0199[0m | [32m     8.5418[0m | [32m            2.0275[0m | [32m     0.9061[0m | 
    2 | 00m09s | [35m  -0.36011[0m | [32m            0.7958[0m | [32m   0.0748[0m | [32m     4.7933[0m | [32m            1.9264[0m | [32m     0.9408[0m | 
    3 | 00m11s |   -0.36082 |             0.6281 |    0.0725 |      5.1527 |             1.8710 |      0.8455 | 
    4 | 00m16s |   -0.36316 |             0.6935 |    0.0596 |      6.6169 |             1.2981 |      0.8284 | 
    5 | 00m07s |   -0.36532 |             0.7410 |    0.0767 |      3.2957 |             2.4387 |      0.9264 | 
[31mBayesian Optimization[0m
[94m---------------------------------

In [51]:
xgbBO.res['max']['max_params']

{'colsample_bytree': 0.7881412413034331,
 'eta': 0.092129799923265018,
 'max_depth': 5.0621102463558092,
 'min_child_weight': 1.4488271403601218,
 'subsample': 0.73410419114918546}

## Bayesian optimal params

In [52]:
optimal_params = xgbBO.res['max']['max_params']

In [53]:
avg_acc = xgboostcv(eta=optimal_params['eta'], 
                    subsample=optimal_params['subsample'], 
                    colsample_bytree=optimal_params['colsample_bytree'], 
                    min_child_weight=optimal_params['min_child_weight'], 
                    max_depth=optimal_params['max_depth'], 
                    verbose=True)

Accuracy on fold 0: 0.646953
Accuracy on fold 1: 0.655296
Accuracy on fold 2: 0.67325
Accuracy on fold 3: 0.644524
Accuracy on fold 4: 0.642729
Accuracy on fold 5: 0.64991
Accuracy on fold 6: 0.669659
Accuracy on fold 7: 0.631295
Accuracy on fold 8: 0.660072
Accuracy on fold 9: 0.564748


In [54]:
print "Average accuracy with BayesOpt params: {acc}".format(acc = (1 + avg_acc))

Average accuracy with BayesOpt params: 0.6438436
