## LightGBM
For installation, see the [documentation](https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html#macos). I used the "Build from GitHub" method. Then, cd into the python-package folder and run 

In [1]:
import pickle

import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.model_selection import GridSearchCV

In [15]:
(titanic_features, 
 label_col, 
 feature_cols,
 categorical_cols) = pickle.load(open( "data/titanic/features_df_categorical.p", "rb" ))

In [16]:
titanic_features.head()

Unnamed: 0,name_prefix,name_parenths,Sex,Embarked,Pclass,ticket_text,ticket_length,cabin_chars,Age,Fare,SibSp,Parch,Survived
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,7.25,1.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,38.0,71.2833,1.0,0.0,1.0
2,2.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,26.0,7.925,0.0,0.0,1.0
3,1.0,1.0,1.0,0.0,1.0,3.0,2.0,1.0,35.0,53.1,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,35.0,8.05,0.0,0.0,0.0


In [55]:
clf = lgb.LGBMClassifier(silent=False)

param_grid = [{'n_estimators': [100, 150, 200, 250], 
               'max_depth': [2, 3, 6],
               'learning_rate': [0.01, 0.05, 0.1],
               'num_leaves': [300, 900, 1200]}]

grid_search = GridSearchCV(clf, 
                           param_grid, 
                           scoring="roc_auc", 
                           n_jobs=-1, 
                           cv=5)

grid_search.fit(titanic_features[feature_cols], titanic_features[label_col])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=False,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='warn', n_jobs=-1,
             param_grid=[{'learning_rate': [0.01, 0.05, 0.1],
                       

In [56]:
grid_search.best_estimator_

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=2,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=150, n_jobs=-1, num_leaves=300, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [58]:
grid_search.best_score_

0.8829513165405939

In [59]:
# If splitting data into train/test or training on entire dataset
# use the below method

train_data = lgb.Dataset(titanic_features[feature_cols], 
                         label=titanic_features[label_col])


params = {"max_depth":     2,
          "num_leaves":    300,
          "n_estimators":  150,
          "learning_rate": 0.1
         }

fitted_model = lgb.train(params=params,
                         train_set=train_data, 
                         categorical_feature = categorical_cols)

New categorical_feature is ['Embarked', 'Parch', 'Pclass', 'Sex', 'SibSp', 'cabin_chars', 'name_parenths', 'name_prefix', 'ticket_length', 'ticket_text']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [54]:
fitted_model.predict(titanic_features[feature_cols])[:10]

array([0.06440868, 0.98450777, 0.58992114, 1.01066801, 0.1217976 ,
       0.15454554, 0.22515322, 0.55631827, 0.56745004, 1.08480821])