# Hyperparameter Optimization

In [1]:
import pandas as pd
import numpy as np

from functools import partial

from sklearn.datasets import make_regression

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import xgboost as xgb

from hyperopt import hp
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize']=(12,4)
import seaborn as sns


%matplotlib inline

In [12]:
df = pd.read_hdf('../input/train.adult.h5')

In [15]:
df = df.fillna(-1)
cat_feats = df.select_dtypes(include=[np.object]).columns

for cat_feat in cat_feats:
    df['{0}_cat'.format(cat_feat)] = pd.factorize( df[cat_feat] )[0]

df['Sex_cat'] = df['Sex'].map(lambda x: int(x=True))
    
train = df.select_dtypes(include=[np.int8, np.int16, np.int64])
train.info()
feats = train.columns.values
feats = list(feats)
feats.remove('Target_cat')
feats.remove('Education_cat')
feats = np.array(feats)

X = train[feats]
y = train['Target_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 15 columns):
Age                   32561 non-null int8
fnlwgt                32561 non-null int64
Education-Num         32561 non-null int64
Capital Gain          32561 non-null int16
Capital Loss          32561 non-null int16
Hours per week        32561 non-null int8
Workclass_cat         32561 non-null int64
Education_cat         32561 non-null int64
Martial Status_cat    32561 non-null int64
Occupation_cat        32561 non-null int64
Relationship_cat      32561 non-null int64
Race_cat              32561 non-null int64
Country_cat           32561 non-null int64
Target_cat            32561 non-null int64
Sex_cat               32561 non-null int64
dtypes: int16(2), int64(11), int8(2)
memory usage: 3.2 MB


In [17]:
def objective(space):
    
    xgb_params = {
        'max_depth': int(space['max_depth']),
        'colsample_bytree': space['colsample_bytree'],
        'learning_rate': space['learning_rate'],
        'subsample': space['subsample'],
        'seed': int(space['seed']),
        'min_child_weight': int(space['min_child_weight']),
        'reg_alpha': space['reg_alpha'],
        'reg_lambda': space['reg_lambda'],
        'n_estimators': 100
    }
    
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    score = mean_squared_error(y_test, y_pred)
    
    print("SCORE: {0}".format(score))
    
    return{'loss':score, 'status': STATUS_OK }
    
space ={
    'max_depth': hp.quniform ('x_max_depth', 5, 20, 1),
    'colsample_bytree': hp.uniform ('x_colsample_bytree', 0.8, 1.),
    'learning_rate': hp.uniform ('x_learning_rate', 0.05, 0.2),
    'subsample': hp.uniform ('x_subsample', 0.7, 1.),
    'seed': hp.quniform ('x_seed', 0, 10000, 50),
    'min_child_weight': hp.quniform ('x_min_child_weight', 1, 10, 1),
    'reg_alpha': hp.loguniform ('x_reg_alpha', 0., 1.),
    'reg_lambda': hp.uniform ('x_reg_lambda', 0.7, 1.),
}


trials = Trials()
best_params = fmin(fn=objective,
            space=space,
            algo=partial(tpe.suggest, n_startup_jobs=1),
            max_evals=20,
            trials=trials)

print("The best params: ", best_params)

SCORE: 0.1303101648070427
SCORE: 0.1305148940526154
SCORE: 0.1273415907462381
SCORE: 0.12744395536902448
SCORE: 0.12754631999181082
SCORE: 0.1320503633944109
SCORE: 0.13061725867540178
SCORE: 0.1290817893336063
SCORE: 0.13573548981472003
SCORE: 0.12754631999181082
SCORE: 0.1305148940526154
SCORE: 0.12867233084246085
SCORE: 0.13348346811342
SCORE: 0.1303101648070427
SCORE: 0.1349165728324291
SCORE: 0.13420002047292456
SCORE: 0.1349165728324291
SCORE: 0.12785341386016993
SCORE: 0.12723922612345173
SCORE: 0.12990070631589723
The best params:  {'x_colsample_bytree': 0.9527966669828812, 'x_learning_rate': 0.18143258070321622, 'x_max_depth': 5.0, 'x_min_child_weight': 9.0, 'x_reg_alpha': 1.009865557984001, 'x_reg_lambda': 0.7394799229806378, 'x_seed': 100.0, 'x_subsample': 0.9558840589248941}


In [28]:
from sklearn.metrics import accuracy_score

best = {'colsample_bytree': 0.9527966669828812, 'learning_rate': 0.18143258070321622, 'max_depth': 5, 'min_child_weight': 9.0, 'reg_alpha': 1.009865557984001, 'reg_lambda': 0.7394799229806378, 'seed': 100, 'subsample': 0.9558840589248941}

model = xgb.XGBRegressor(**best)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score =1-(mean_squared_error(y_test, y_pred))
score
    


0.9090404527307987