In [1]:
import pandas as pd
import numpy as np

# Import dataset

In [2]:
# Load the training set; the path is relative to the current working directory.
train = pd.read_csv('../data/voting outcomes/train2016.csv')

# Extract target vector and drop useless data

In [3]:
# Keep the 'Party' column aside as the prediction target.
target = train['Party']

In [4]:
# Remove the label column and the USER_ID column from the feature frame.
# NOTE: re-running this cell on the already-trimmed frame raises KeyError.
train = train.drop(['Party','USER_ID'], axis=1)

# Dummy missing values imputation

In [5]:
# Replace every missing value with the string placeholder 'NoData';
# later cells test for this exact string when handling YOB.
train = train.fillna('NoData')

# Convert YOB to age

In [6]:
def convert_to_age(age, reference_year=2016):
    """Turn a year of birth into an age in years.

    Despite its name, `age` receives a year of birth (it is applied to the
    YOB column); the parameter name is kept so existing callers still work.

    Parameters
    ----------
    age : number or 'NoData'
        Year of birth, or the 'NoData' placeholder used for missing values.
    reference_year : int, optional
        Year the age is measured against. Defaults to 2016, the value that
        was previously hard-coded.

    Returns
    -------
    number or 'NoData'
        `reference_year - age`, or 'NoData' unchanged for the placeholder.
    """
    if age != 'NoData':
        return reference_year - age
    # Pass the missing-value placeholder through untouched.
    return 'NoData'

# Overwrite YOB with the converted value for every row.
# NOTE: not idempotent -- running this cell twice applies the conversion twice.
train['YOB'] = train['YOB'].apply(convert_to_age)

# Aggregate age into age groups (numerical -> ordinal)

In [7]:
# Inclusive lower edges of successive age groups; passed as `borders` to
# return_border, so an age below 18 falls in group '0' and 80+ in group '6'.
age_borders = [18, 27, 35, 50, 65, 80]

In [8]:
def return_border(age, borders):
    """Return the index of the age group `age` belongs to, as a string.

    Walks `borders` from the start and stops at the first border that `age`
    does not reach; the number of borders passed so far is the group index.
    If every border is reached (or `borders` is empty) the result is
    str(len(borders)).
    """
    for group, lower_edge in enumerate(borders):
        # Keep the `>=` comparison as-is so values that compare False
        # against everything (e.g. NaN) still land in the current group.
        if not age >= lower_edge:
            return str(group)

    return str(len(borders))
            

In [9]:
# Rows whose YOB holds a real value rather than the 'NoData' placeholder.
mask = train['YOB'] != 'NoData'

# Assign through a single .loc call. The chained form
# train['YOB'][mask] = ... may write to a temporary copy and triggers
# SettingWithCopyWarning.
train.loc[mask, 'YOB'] = train.loc[mask, 'YOB'].apply(return_border, borders=age_borders)

# One hot encoding for all columns

In [10]:
# Replace the frame with its pd.get_dummies (default arguments) encoding.
train = pd.get_dummies(train)

# Binarize target variable

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Encoder used to turn the Party labels into integer codes.
lb = LabelEncoder()

In [13]:
# Fit the encoder on the Party labels and encode them in one step.
target_binary = lb.fit_transform(target)

# Simple xgboost for classification:)

In [14]:
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score

# Baseline booster settings. The "error" metric is the misclassification
# rate, so accuracy is reported below as 1 - error.
params = {
    "objective": "binary:logistic",
    "eta": 0.01,
    "min_child_weight": 1,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "silent": 0,
    "max_depth": 6,
    "eval_metric": "error",
    "seed": 1488,
}
num_round = 5000

y = pd.Series(target_binary)
skf = StratifiedKFold(y.values, n_folds=10, random_state=1488)
results = []  # best validation error per fold (not accuracy)
trees = []    # fitted booster per fold

for (i, (train_index, test_index)) in enumerate(skf):
    # The splitter yields positional row indices, so select with .iloc
    # (.ix is deprecated and ambiguous between labels and positions).
    xg_train = xgb.DMatrix(train.iloc[train_index], label=y.iloc[train_index], missing=np.nan)
    xg_test = xgb.DMatrix(train.iloc[test_index], label=y.iloc[test_index], missing=np.nan)
    # NOTE(review): the held-out fold is the last watchlist entry, so it is
    # also used for early stopping; best_score is therefore selected on the
    # same data it is reported on and is likely optimistic.
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    bst = xgb.train(params, xg_train, num_round, watchlist, early_stopping_rounds=30, verbose_eval=False)
    print('Accuracy on fold {num}: {acc}'.format(num=i, acc=(1 - bst.best_score)))
    results.append(bst.best_score)
    trees.append(bst)

Accuracy on fold 0: 0.639785
Accuracy on fold 1: 0.660682
Accuracy on fold 2: 0.64632
Accuracy on fold 3: 0.637343
Accuracy on fold 4: 0.640934
Accuracy on fold 5: 0.639138
Accuracy on fold 6: 0.630162
Accuracy on fold 7: 0.636691
Accuracy on fold 8: 0.622302
Accuracy on fold 9: 0.564748


In [15]:
# `results` holds per-fold error rates, so accuracy is 1 - mean error.
print("Average accuracy: {acc}".format(acc = (1 - np.mean(results))))

Average accuracy: 0.6318105


# Bayesian Optimization Intro

In [46]:
from bayes_opt import BayesianOptimization

In [45]:
def xgboostcv(eta,
              subsample,
              colsample_bytree,
              min_child_weight,
              max_depth,
              seed = 1488):
    """Objective for the Bayesian optimizer: negated cross-validated error.

    Runs 10-fold stratified cross-validation with early stopping over the
    whole encoded training set (the notebook-level `train` and
    `target_binary`). It deliberately does not use `xg_train`/`watchlist`:
    those are loop variables left over from the last fold of the baseline
    cell, so relying on them scores every candidate on a single fold.

    Parameters
    ----------
    eta, subsample, colsample_bytree, min_child_weight : number
        Forwarded unchanged to the booster parameters of the same name.
    max_depth : number
        Truncated with int() before use (presumably because the optimizer
        samples continuous values -- confirm against bayes_opt).
    seed : int, optional
        Seed used for both the booster and the fold assignment.

    Returns
    -------
    float
        Minus the lowest mean test error over the boosting rounds, so that
        maximizing the return value minimizes the error.
    """
    params = {
        "objective": "binary:logistic",
        "eta": eta,
        "min_child_weight": min_child_weight,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "max_depth": int(max_depth),
        "eval_metric": "error",
        "seed": seed,
    }
    num_round = 5000

    dtrain = xgb.DMatrix(train, label=target_binary, missing=np.nan)
    history = xgb.cv(params, dtrain, num_boost_round=num_round, nfold=10,
                     stratified=True, early_stopping_rounds=30,
                     seed=seed, verbose_eval=False)

    # Take the minimum explicitly instead of assuming the history has been
    # truncated at the best round.
    test_error = history['test-error-mean']
    best_score = test_error.min()
    best_iteration = int(test_error.values.argmin())
    print('{0} {1} {2}'.format(best_score, best_iteration, params))

    return -best_score
    
    

In [None]:
# (lower, upper) bounds explored for each xgboostcv argument.
search_space = {
    'eta': (0.01, 0.1),
    'subsample': (0.1, 0.95),
    'min_child_weight': (1, 10),
    'colsample_bytree': (0.1, 0.95),
    'max_depth': (3, 20),
}
xgbBO = BayesianOptimization(xgboostcv, search_space)

xgbBO.maximize(n_iter=10)

print('-'*53)
print('Final Results')
# xgboostcv returns the negated error, so the reported maximum is <= 0.
# The label names the model actually tuned (it previously read 'RFC').
print('XGBoost: %f' % xgbBO.res['max']['max_val'])

In [None]:
# Display the optimizer's full result record as the cell output.
xgbBO.res