In [10]:
import numpy as np
import pandas as pd
from sklearn import pipeline, model_selection
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import log_loss
from xgboost.sklearn import XGBClassifier

Set our parameters for xgboost
For Imbalanced dataset we set max_delta_step = 1

In [41]:

params = {}
params['max_depth'] = [4,6,10]
params['max_delta_step'] = 1
params['n_estimators'] = [100, 250, 500]
params['subsample'] =  [0.9, 1.0]
params['colsample_bytree']= [0.9, 1.0]
test_size = 0.20
random_seed = 42
label_encoder = LabelEncoder()

- Read the data
- Separate the labels
- Remove the unnecessary variables

In [17]:
dataset = pd.read_csv('../../../data/processed_data.csv')
labels = dataset[['like']]
data = dataset.drop(['Unnamed: 0','player_id','subject_id','like'],axis=1)

Encoding function for Categorical variables

In [18]:
def encode_features(df, encoder):
        columnsToEncode = list(df.select_dtypes(include=['category','object']))
        for feature in columnsToEncode:
            try:
                df[feature] = encoder.fit_transform(df[feature])
            except:
                print('Error encoding '+ feature)
        return df

Data Preparation:
- Encode categorical data
- Divide the data into training and test set

In [19]:
def data_prep():
    data_label_encoded = encode_features(data, label_encoder)
    X_train, X_test, y_train, y_test = train_test_split(data_label_encoded, labels, test_size=test_size, random_state=random_seed)
    return X_train, X_test, y_train, y_test

In [20]:
X_train, X_test, y_train, y_test = data_prep()

In [35]:
X_train.shape

(400610, 29)

Initialize the XGBClassifier model

In [42]:
xgb_model = XGBClassifier( learning_rate=0.1, 
                                        min_child_weight=1, objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

Run grid search

In [43]:
grid_search = model_selection.GridSearchCV(estimator = xgb_model , 
                       param_grid = params, scoring='roc_auc',n_jobs=4, cv=5)
grid_search.fit(X_train,y_train.values.ravel())

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=1),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'max_depth': [4, 6, 10], 'n_estimators': [100, 250, 500], 'subsample': [0.9, 1.0], 'colsample_bytree': [0.9, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)