In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from lightgbm.sklearn import LGBMClassifier

import joblib

import warnings
warnings.filterwarnings("ignore")

## Load Data

In [2]:
data_df = pd.read_csv('titanic_data.csv')
pipe = joblib.load(f'titanic_pipe.pkl')
pipe.fit(data_df)

data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22,1,0,7.25,6,0,1
1,2,1,1,3,1,38,1,0,71.2833,3,1,1
2,3,1,3,2,1,26,0,0,7.925,6,0,0
3,4,1,1,3,1,35,1,0,53.1,3,0,1
4,5,0,3,1,0,35,0,0,8.05,5,0,0


In [3]:
titanic_X = data_df.copy()

del titanic_X['PassengerId']
del titanic_X['Survived']

titanic_Y = np.array(data_df[['Survived']])

x_train, x_test, y_train, y_test = \
   model_selection.train_test_split(titanic_X, titanic_Y, test_size=0.3, random_state=0)

x_train = pipe.transform(x_train)
x_test = pipe.transform(x_test)

In [16]:
clf = LGBMClassifier(random_state=0, n_estimators = 200, max_depth = 2)
clf.fit(x_train, y_train)
print(f'LightGBM: ', accuracy_score(clf.predict(x_test), y_test))

LightGBM:  0.8432835820895522


In [18]:
params = {'boosting_type': ['gbdt','rf','dart'],
          'num_leaves': [15,31,63],
          'max_depth': list(range(1,9)),
          'learning_rate': [0.01,0.1,0.2,0.3,0.5,0.9],
          'n_estimators': list(range(100,1000,100)),
          'n_jobs': [-1],
          'random_state': [0],
          'subsample': [0.6, 0.8, 1.0],
          'colsample_bytree': [0.6, 0.8, 1.0]}

grid = GridSearchCV(
    LGBMClassifier(), params, refit=True, cv=10, scoring='accuracy')

grid.fit(x_train, y_train)

print('GridSearch Best Params:', grid.best_params_)
print(f'Logistic Regression Score:', accuracy_score(y_test, grid.predict(x_test)))

KeyboardInterrupt: 