In [None]:
import numpy as np
import pandas as pd
from sklearn import pipeline, model_selection
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import log_loss
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

Set the hyperparameter search grid for XGBoost.
Because the dataset is imbalanced, we constrain `max_delta_step` to small positive values (the grid tries 1 and 3).

In [None]:
# Hyperparameter grid handed to the grid search: each key is an
# XGBClassifier argument, each value the candidate settings to try.
params = {
    'max_depth': [4, 6, 10],
    'max_delta_step': (1, 3),
    'n_estimators': [100, 250, 500],
    'subsample': [0.9, 1.0],
    'colsample_bytree': [0.9, 1.0],
}

# Hold-out fraction for the train/test split and the seed reused by the
# split and the model.
test_size = 0.20
random_seed = 42

# Encoder applied to the categorical feature columns.
label_encoder = LabelEncoder()

- Read the data
- Separate the labels
- Remove the unnecessary variables

In [None]:
# Load the processed table, keep 'like' as the target, and use every other
# column except the identifiers and 'Unnamed: 0' (presumably a saved index
# column -- confirm against the CSV) as features.
dataset = pd.read_csv('../../../data/processed_data.csv')
labels = dataset.loc[:, ['like']]  # list selector keeps a one-column DataFrame
data = dataset.drop(
    ['Unnamed: 0', 'player_id', 'subject_id', 'like'],
    axis='columns',
)

Encoding function for Categorical variables

In [None]:
def encode_features(df, encoder):
    """Encode every ``category``/``object`` column of ``df`` with ``encoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns of dtype ``category`` or ``object`` are encoded.
        It is left untouched; the encoding is applied to a copy.
    encoder : object
        Any object exposing ``fit_transform(values)``. It is refit on each
        column in turn, so it does not retain the mapping of earlier columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each selected column is replaced by the
        encoder's output. A column whose encoding raises is reported on
        stdout and keeps its original values.
    """
    # Work on a copy so re-running the calling cell always starts from the
    # caller's unmodified frame.
    encoded = df.copy()
    columns_to_encode = list(encoded.select_dtypes(include=['category', 'object']))
    for feature in columns_to_encode:
        try:
            encoded[feature] = encoder.fit_transform(encoded[feature])
        except Exception as err:
            # Best-effort by design: report the column and the reason, then
            # continue. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt/SystemExit propagate, and ``format`` copes with
            # non-string column labels that ``'...' + feature`` would choke on.
            print('Error encoding {}: {}'.format(feature, err))
    return encoded

Data Preparation:
- Encode categorical data
- Divide the data into training and test set

In [None]:
def data_prep():
    """Encode the categorical features and split them, with the labels, into train and test sets.

    Reads the notebook-level ``data``, ``labels``, ``label_encoder``,
    ``test_size`` and ``random_seed``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split``.
    """
    # Hand the encoder a copy so the notebook-level ``data`` frame keeps its
    # raw values even if ``encode_features`` assigns onto the frame it is given.
    data_label_encoded = encode_features(data.copy(), label_encoder)
    X_train, X_test, y_train, y_test = train_test_split(
        data_label_encoded,
        labels,
        test_size=test_size,
        random_state=random_seed,
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Build the encoded train/test split once; the cells below reuse these four objects.
(X_train, X_test, y_train, y_test) = data_prep()

In [None]:
# Sanity check: (n_rows, n_columns) of the training features.
X_train.shape

Initialize the XGBClassifier model

In [None]:
# Base estimator for the grid search: these settings stay fixed while the
# search varies the hyperparameters listed in `params`.
xgb_model = XGBClassifier(
    learning_rate=0.1,
    min_child_weight=1,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=random_seed,
)

Run grid search

In [None]:
# 5-fold cross-validated search over every combination in `params`,
# scored with weighted F1.
grid_search = model_selection.GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    scoring='f1_weighted',
    n_jobs=4,
    cv=5,
)
# Flatten the one-column label frame into a 1-D array before fitting.
grid_search.fit(X_train, np.ravel(y_train.values))

In [None]:
# GridSearchCV fits clones of `xgb_model`, so `xgb_model` itself is never
# trained; take the scores from the refit best estimator instead.
# get_fscore() is a method of the underlying Booster, not of the sklearn
# wrapper, hence the get_booster() hop.
feat_imp = pd.Series(
    grid_search.best_estimator_.get_booster().get_fscore()
).sort_values(ascending=False)

In [None]:
# Bar chart of the per-feature importance scores held in `feat_imp`.
feat_imp.plot.bar(title='Feature Importances')
plt.ylabel('Feature Importance Score')

In [None]:
# Per-row predicted probability for the class in column 1 of predict_proba
# (presumably like == 1 -- confirm against grid_search.classes_) on the test set.
grid_search.predict_proba(X_test)[:,1]