In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import missingno as msn

import scipy
import scipy.stats as st
import statsmodels as sm
from matplotlib.ticker import MaxNLocator
from scipy.stats import skew, boxcox_normmax, norm
from scipy.special import boxcox1p
import matplotlib.gridspec as gridspec
import datetime

from datetime import datetime
from sklearn import ensemble
import warnings
import logging

warnings.filterwarnings('ignore')
logging.captureWarnings(True)

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer # default yeo-jhonson transformu uyguluyor
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import RobustScaler


from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoLarsCV
from sklearn.pipeline import Pipeline


import optuna.integration.lightgbm as lgbm



from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

import pickle

In [2]:
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [3]:
train_data = pd.read_csv(r"D:\Kaggle\Titanic\train.csv")
train_data.set_index("PassengerId", inplace=True)
test_data = pd.read_csv(r"D:\Kaggle\Titanic\test.csv")
test_data.set_index("PassengerId", inplace=True)

In [4]:
continuous_cols = train_data[["Age","SibSp","Parch","Fare","Pclass"]]

nominal_cols=train_data[["Ticket","Embarked"]]

binary_cols=train_data[["Sex"]]
target_col=train_data[["Survived"]]

In [5]:
df = pd.concat((train_data.loc[:,'Pclass':'Embarked'], test_data.loc[:,'Pclass':'Embarked']))

df.drop(columns=["Age","Cabin","Name","Ticket"],axis=1,inplace=True)
df.Embarked.fillna("S",inplace=True)
df.Fare.fillna(test_data.Fare.mean(), inplace=True)

#train_data.drop(columns=["Age","Cabin","Name","Ticket"],axis=1,inplace=True)
#test_data.drop(columns=["Age","Cabin","Name","Ticket"],axis=1,inplace=True)
#
#train_data.Embarked.fillna("S",inplace=True)
#test_data.Fare.fillna(test_data.Fare.mean(), inplace=True)

map_sex = {"male":1, "female":0}
map_embarked = {"S":0,"C":1,"Q":2}

df.loc[:, "Embarked"] = df.Embarked.map(map_embarked)
df.loc[:, "Sex"] = df.Sex.map(map_sex)

In [6]:
train_data=df.loc[:target_col.shape[0],:]
test_data=df.loc[target_col.shape[0]+1:,:]

X_train, X_valid, y_train, y_valid = train_test_split(train_data, target_col, test_size= 0.2, random_state=42)

In [7]:
lgbm = LGBMClassifier().fit(X_train, y_train)
#Model Tuning
lgbm_params = {"colsample_bytree":[0.4,0.5,1],"learning_rate":[0.01,0.1,0.5], "max_depth": [5,10,50,100], "n_estimators": [40,100,200,1000]}
lgbm_cv = GridSearchCV(lgbm, lgbm_params, cv=5, n_jobs=-1, verbose=2)
lgbm_cv.fit(X_valid, y_valid)
lgbm_cv.best_params_ #{'colsample_bytree': 0.4, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000}

Fitting 5 folds for each of 144 candidates, totalling 720 fits


{'colsample_bytree': 0.5,
 'learning_rate': 0.5,
 'max_depth': 5,
 'n_estimators': 40}

In [12]:
#Final model
lgbm_tuned = LGBMClassifier(learning_rate=0.5, max_depth=5, n_estimators=40, colsample_bytree=0.5)
lgbm_tuned.fit(X_train, y_train)
y_test_pred = lgbm_tuned.predict(X_valid)
score = round(accuracy_score(y_valid, y_test_pred), 3)
score

0.81

# Deployement

In [17]:
pickle.dump(lgbm_tuned, open("lightgbm_titanic_model.pkl", "wb"))
pickled_lgbm_titanic = pickle.load(open("lightgbm_titanic_model.pkl", "rb"))

In [18]:
pickled_lgbm_titanic.predict(X_valid)

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1], dtype=int64)