# Models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.metrics import accuracy_score

import joblib

import warnings
warnings.filterwarnings("ignore")

## Load Data

In [2]:
data_df = pd.read_csv('titanic_data.csv')
data_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,1,0,6,0,1
1,2,1,1,3,1,2,4,3,1,1
2,3,1,3,2,1,1,1,6,0,0
3,4,1,1,3,1,2,4,3,0,1
4,5,0,3,1,0,2,1,6,0,0


In [3]:
pipe = joblib.load(f'titanic_pipe.pkl')
pipe

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Pclass', 'Name', 'Sex',
                                                   'Age', 'Fare', 'Cabin',
                                                   'Embarked', 'Family'])]))])

In [4]:
titanic_X = data_df.drop(['PassengerId','Survived'], axis=1)
titanic_Y = np.array(data_df[['Survived']])

x_train, x_test, y_train, y_test = \
   model_selection.train_test_split(titanic_X, titanic_Y, test_size=0.3, random_state=0)

x_train = pipe.fit_transform(x_train)
x_test = pipe.transform(x_test)

## Model Fitting

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

In [6]:
# Hyper-Parameter는 GridSearch의 결과를 참고하여 지정
models = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(16)),
    ('SVM', SVC(kernel='rbf', C=100, gamma=0.01)),
    ('DecisionTree', DecisionTreeClassifier(random_state = 0, max_depth = 4)),
    ('RandomForest', RandomForestClassifier(random_state = 0, n_estimators = 100, max_depth = 6)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
    ('XGBoost', XGBClassifier(seed = 0, n_estimators = 200, max_depth = 2, verbosity=0)),
    ('LightGBM', LGBMClassifier(random_state=0, n_estimators = 200, max_depth = 2))]

In [7]:
for name, model in models:
    model.fit(x_train, y_train)
    print(f'{name}: ', accuracy_score(model.predict(x_test), y_test))

Logistic:  0.8432835820895522
KNN:  0.8283582089552238
SVM:  0.8432835820895522
DecisionTree:  0.8395522388059702
RandomForest:  0.835820895522388
NaiveBayes:  0.7947761194029851
XGBoost:  0.8432835820895522
LightGBM:  0.8395522388059702


In [8]:
clf = LinearDiscriminantAnalysis()
clf.fit(x_train.todense(), y_train)
print('Linear Discriminat Analysis:', accuracy_score(clf.predict(x_test), y_test))

Linear Discriminat Analysis: 0.835820895522388


<img src="../images/lda.jpeg" width="40%">

## Case 2: Numeric Age
수치형 나이 데이터를 모델에 적용한 경우

In [9]:
age_df = pd.read_csv('titanic_data2.csv')
age_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,Family
0,1,0,3,1,0,22,0,6,0,1
1,2,1,1,3,1,38,4,3,1,1
2,3,1,3,2,1,26,1,6,0,0
3,4,1,1,3,1,35,4,3,0,1
4,5,0,3,1,0,35,1,6,0,0


In [10]:
age_pipe = joblib.load(f'titanic_pipe2.pkl')
age_pipe

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Pclass', 'Name', 'Sex',
                                                   'Fare', 'Cabin', 'Embarked',
                                                   'Family'])]))])

In [11]:
titanic_X = age_df.drop(['PassengerId','Survived'], axis=1)
titanic_Y = np.array(age_df[['Survived']])

x_train, x_test, y_train, y_test = \
   model_selection.train_test_split(titanic_X, titanic_Y, test_size=0.3, random_state=0)

x_train = age_pipe.fit_transform(x_train)
x_test = age_pipe.transform(x_test)

In [12]:
# Hyper-Parameter는 GridSearch의 결과를 참고하여 지정
models = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(16)),
    ('SVM', SVC(kernel='rbf', C=100, gamma=0.01)),
    ('DecisionTree', DecisionTreeClassifier(random_state = 0, max_depth = 4)),
    ('RandomForest', RandomForestClassifier(random_state = 0, n_estimators = 100, max_depth = 6)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
    ('XGBoost', XGBClassifier(seed = 0, n_estimators = 200, max_depth = 2, verbosity=0)),
    ('LightGBM', LGBMClassifier(random_state=0, n_estimators = 200, max_depth = 2))]

In [13]:
for name, model in models:
    model.fit(x_train, y_train)
    print(f'{name}: ', accuracy_score(model.predict(x_test), y_test))

Logistic:  0.8283582089552238
KNN:  0.8171641791044776
SVM:  0.8432835820895522
DecisionTree:  0.8208955223880597
RandomForest:  0.835820895522388
NaiveBayes:  0.7835820895522388
XGBoost:  0.8470149253731343
LightGBM:  0.8283582089552238


In [14]:
clf = LinearDiscriminantAnalysis()
clf.fit(x_train.todense(), y_train)
print('Linear Discriminat Analysis:', accuracy_score(clf.predict(x_test), y_test))

Linear Discriminat Analysis: 0.8283582089552238


## GridSearch

In [15]:
from sklearn.model_selection import GridSearchCV

### Logistic Regression

In [16]:
logistic_param_grid = {'C': np.logspace(-3,3,7),
                       'penalty': ["l1","l2"]}

logistic_grid = GridSearchCV(
    LogisticRegression(), logistic_param_grid, refit=True, cv=10, scoring='accuracy')

logistic_grid.fit(x_train, y_train)

print('GridSearch Best Params:', logistic_grid.best_params_)
print(f'Logistic Regression Score:', accuracy_score(y_test, logistic_grid.predict(x_test)))

GridSearch Best Params: {'C': 1.0, 'penalty': 'l2'}
Logistic Regression Score: 0.8283582089552238


In [17]:
from sklearn.metrics import recall_score, precision_score, f1_score

print('Recall Score:', recall_score(y_test, logistic_grid.predict(x_test)))
print('Precision Score:', precision_score(y_test, logistic_grid.predict(x_test)))
print('F1 Score:', f1_score(y_test, logistic_grid.predict(x_test)))

Recall Score: 0.76
Precision Score: 0.7755102040816326
F1 Score: 0.7676767676767676


### KNN

In [18]:
knn_param_grid = {'n_neighbors': list(range(1,31))}

knn_grid = GridSearchCV(
    KNeighborsClassifier(), knn_param_grid, refit=True, cv=10, scoring='accuracy')

knn_grid.fit(x_train, y_train)

print('GridSearch Best Params:', knn_grid.best_params_)
print(f'KNN Score:', accuracy_score(y_test, knn_grid.predict(x_test)))

GridSearch Best Params: {'n_neighbors': 27}
KNN Score: 0.8097014925373134


### SVC

In [19]:
svc_param_grid = {'C': [0.1,1,10,100,1000], 
                 'gamma': [1,0.1,0.01,0.001,0.0001],
                 'kernel': ['rbf']}

svc_grid = GridSearchCV(
    SVC(), svc_param_grid, refit=True, cv=10, scoring='accuracy')

svc_grid.fit(x_train, y_train)

print('GridSearch Best Params:', svc_grid.best_params_)
print(f'SVC Score:', accuracy_score(svc_grid.predict(x_test), y_test))

GridSearch Best Params: {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
SVC Score: 0.8171641791044776


### Decision Tree

In [20]:
dtree_param_grid = {'criterion': ['gini', 'entropy'],
                    'max_depth': [2,4,6,8,10,12],
                    'random_state': [0]}

dtree_grid = GridSearchCV(
    DecisionTreeClassifier(), dtree_param_grid, refit=True, cv=10, scoring='accuracy')

dtree_grid.fit(x_train, y_train)

print('GridSearch Best Params:', dtree_grid.best_params_)
print(f'Decision Tree Score:', accuracy_score(dtree_grid.predict(x_test), y_test))

GridSearch Best Params: {'criterion': 'entropy', 'max_depth': 6, 'random_state': 0}
Decision Tree Score: 0.832089552238806


### Random Forest

In [21]:
rforest_param_grid = {'n_estimators': [100,200,300,1000],
                      'max_depth': [2,4,6,8,10,12],
                      'random_state': [0]}

rforest_grid = GridSearchCV(
    RandomForestClassifier(), rforest_param_grid, refit=True, cv=10, scoring='accuracy')

rforest_grid.fit(x_train, y_train)

print('GridSearch Best Params:', rforest_grid.best_params_)
print(f'Random Forest Score:', accuracy_score(rforest_grid.predict(x_test), y_test))

GridSearch Best Params: {'max_depth': 6, 'n_estimators': 200, 'random_state': 0}
Random Forest Score: 0.835820895522388


### Naive Bayes

In [22]:
bayes_param_grid = {'alpha': [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

bayes_grid = GridSearchCV(
    BernoulliNB(), bayes_param_grid, refit=True, cv=10, scoring='accuracy')

bayes_grid.fit(x_train, y_train)

print('GridSearch Best Params:', bayes_grid.best_params_)
print(f'Naive Bayes Score:', accuracy_score(bayes_grid.predict(x_test), y_test))

GridSearch Best Params: {'alpha': 10.0}
Naive Bayes Score: 0.7835820895522388


### K-Fold

In [None]:
titanic_X = pipe.transform(titanic_X)
scores = cross_val_score(LGBMClassifier(random_state=0, n_estimators = 200, max_depth = 2), titanic_X, titanic_Y, scoring='accuracy', cv=5)
print('교차 검증별 정확도:',np.round(scores, 4))
print('평균 검증 정확도:', np.mean(scores))

교차 검증별 정확도: [0.8101 0.8258 0.8427 0.8146 0.8596]
평균 검증 정확도: 0.8305504990270542


## Load Data

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

data_df = pd.read_csv('titanic_data.csv')

categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')
categorical_features = ['Pclass', 'Name', 'Sex', 'Age',
                        'Fare', 'Embarked', 'Family']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [24]:
titanic_X = data_df.copy()

titanic_X = data_df.drop(['PassengerId','Survived','Cabin'], axis=1)
titanic_Y = np.array(data_df[['Survived']])

x_train, x_test, y_train, y_test = \
   model_selection.train_test_split(titanic_X, titanic_Y, test_size=0.3, random_state=0)

x_train = pipe.fit_transform(x_train)
x_test = pipe.transform(x_test)

## Model Fitting

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

In [26]:
models = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(16)),
    ('SVM', SVC(kernel='rbf', C=100, gamma=0.01)),
    ('DecisionTree', DecisionTreeClassifier(random_state = 0, max_depth = 4)),
    ('RandomForest', RandomForestClassifier(random_state = 0, n_estimators = 100, max_depth = 6)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
    ('XGBoost', XGBClassifier(seed = 0, n_estimators = 200, max_depth = 2, verbosity=0)),
    ('LightGBM', LGBMClassifier(random_state=0, n_estimators = 200, max_depth = 2))]

In [27]:
for name, model in models:
    model.fit(x_train, y_train)
    print(f'{name}: ', accuracy_score(model.predict(x_test), y_test))

Logistic:  0.8208955223880597
KNN:  0.8097014925373134
SVM:  0.8283582089552238
DecisionTree:  0.8432835820895522
RandomForest:  0.832089552238806
NaiveBayes:  0.7873134328358209
XGBoost:  0.8432835820895522
LightGBM:  0.8432835820895522
