# 3.8 타이타닉 생존율 예측

## 3.8.1 문제 정의

In [None]:
import tensorflow as tf
import keras

## 3.8.2 다양한 형태로 데이터 구성 및 준비하기

In [None]:
import pandas as pd

# Location of the Titanic training CSV used throughout this section.
URL = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'

# pandas can read a CSV straight from an http(s) URL.
df = pd.read_csv(filepath_or_buffer=URL)

# Last expression of the cell: display the type of the loaded object.
type(df)

In [None]:
# Print the (rows, columns) tuple of the loaded table.
table_shape = df.shape
print(table_shape)

# Last expression of the cell: display the first five rows.
df.head(n=5)

In [None]:
import tensorflow_datasets as tfds

# Fetch the list of registered dataset builder names, then show only the first five.
builder_names = tfds.list_builders()
builder_names[:5]

In [None]:
import tensorflow_datasets as tfds

# with_info=True makes load() return a (dataset, info) pair instead of the dataset alone.
ds, info = tfds.load(
    name='titanic',
    split='train',
    with_info=True,
)

# Last expression of the cell: display the type of the returned dataset object.
type(ds)

In [None]:
# Display the info object returned by tfds.load(..., with_info=True).
info

In [None]:
# Take the first four examples and render them as a DataFrame, using `info` for column metadata.
sample_examples = ds.take(4)
tfds.as_dataframe(sample_examples, ds_info=info)

## 3.8.3 모델에서 활용할 수 있도록 데이터 전처리

In [None]:
import pandas as pd

# Re-read the CSV so preprocessing starts from an unmodified DataFrame.
url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Print per-column dtype and non-null counts.
df.info()

In [None]:
import missingno

# Draw the missing-value matrix for the DataFrame (the plot is the cell output).
missingno.matrix(df=df)

In [None]:
# Alternative kept for reference: encoding each column with LabelEncoder.
#
# from sklearn.preprocessing import LabelEncoder
#
# le = LabelEncoder()
# le = df[['sex', 'class', 'deck', 'embark_town','alone']].apply(le.fit_transform)
#
# df['sex'] = le['sex']
# df['class'] = le['class']
# df['deck'] = le['deck']
# df['embark_town'] = le['embark_town']
# df['alone'] = le['alone']
#
# df.head()
# -----------------------------------------

# When several columns must be encoded at once, OrdinalEncoder is the recommended tool.
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

col_names = ["sex", "class", "deck", "embark_town", "alone"]

oe = OrdinalEncoder()
# fit_transform returns a float array; cast to int32 before writing the codes back.
encoded_values = oe.fit_transform(df[col_names])
df[col_names] = encoded_values.astype(np.int32)

# Last expression of the cell: display the first rows after encoding.
df.head()

In [None]:
# pop() removes the 'survived' column from df in place and returns it as the label Series.
df_y = df.pop('survived')

# First experiment: use only the 'age' and 'fare' columns (copied so df stays untouched).
df_x = df[['age', 'fare']].copy()

In [None]:
# Display the label Series that was popped from the 'survived' column.
df_y

In [None]:
# Display the feature table transposed (one row per feature column).
df_x.transpose()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# - random_state pins the shuffle so the same rows land in each partition on every run
#   (the original unseeded call produced a different split each time).
# - stratify keeps the ratio of label values the same in the train and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=42,
    stratify=df_y,
)

print('train dataset :', x_train.shape, y_train.shape)
print('test dataset :', x_test.shape, y_test.shape)

## 3.8.4 다양한 레이어로 순차적인 모델 빌드


In [None]:
import keras

# Same stack as adding the layers one by one: 2 inputs -> 32 -> 16 -> 1 (sigmoid output).
model = keras.Sequential(
    [
        keras.layers.Input(shape=(2,)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

# Print the layer-by-layer summary of the model.
model.summary()

## 3.8.5 이진 분류에 대한 모델 컴파일

In [None]:
# Binary classification setup: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## 3.8.6 모델을 활용하여 학습 및 예측

In [None]:
# Train for 5 epochs, reserving 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=1,  # default is 32
    validation_split=0.2,
)

In [None]:
# evaluate() returns [loss, accuracy] because the model was compiled with one metric.
test_metrics = model.evaluate(x_test, y_test)
loss, accuracy = test_metrics

print('test accuracy :', accuracy)

In [None]:
from sklearn.model_selection import train_test_split

# Second experiment: use every remaining column of df as a feature.
# ('survived' was already popped out of df into df_y, so df holds features only.)
# The original `df_y = df_y.copy()` self-copy was redundant and has been dropped;
# train_test_split does not modify its inputs.
df_x = df.copy()

# Hold out 20% of the rows for testing.
# - random_state pins the shuffle so the split is identical on every run
#   (the original unseeded call produced a different split each time).
# - stratify keeps the ratio of label values the same in the train and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=42,
    stratify=df_y,
)

print('train dataset :', x_train.shape, y_train.shape)
print('test dataset :', x_test.shape, y_test.shape)

In [None]:
import keras

# Input width is taken from the training data instead of a hard-coded 9, so the
# model stays in sync with however many feature columns x_train actually has.
n_features = x_train.shape[1]

model = keras.Sequential()
model.add(keras.layers.Input(shape=(n_features,)))
model.add(keras.layers.Dense(32, activation="relu"))
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Stop training once val_loss has failed to improve for 5 consecutive epochs.
callback_EarlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# `callbacks` is documented as a list of Callback instances, so wrap the single
# callback in a list rather than passing the bare object.
history = model.fit(x_train, y_train,
          validation_split=0.2,
          batch_size=16,  # default is 32
          epochs=100,
          callbacks=[callback_EarlyStopping])

In [None]:
# evaluate() returns [loss, accuracy] because the model was compiled with one metric.
test_metrics = model.evaluate(x_test, y_test)
loss, accuracy = test_metrics

print('test accuracy :', accuracy)

In [None]:
# Display the first five test labels.
y_test.head(n=5)

In [None]:
# Run the model on the held-out features; the sigmoid output yields one value per row.
predictions = model.predict(x_test)

# Print the first five model outputs next to the first five true labels.
first_predictions = predictions[:5]
first_labels = y_test.head()
print("predictions:", first_predictions, sep="\n")
print('y_test:', first_labels, sep="\n")

## 3.8.7 훈련된 모델의 플롯 손실 및 정확도

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One plot with two y-axes: loss on the primary axis, accuracy on the secondary axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Per-epoch values recorded by the most recent model.fit() call.
epoch = history.epoch
recorded = history.history

loss = recorded['loss']
val_loss = recorded['val_loss']
accuracy = recorded['accuracy']
val_accuracy = recorded['val_accuracy']

# (values, legend label, draw on secondary axis?) for each curve, in plotting order.
curves = (
    (loss, "loss", False),
    (val_loss, "val_loss", False),
    (accuracy, "accuracy", True),
    (val_accuracy, "val_accuracy", True),
)
for values, label, on_secondary_axis in curves:
    fig.add_trace(
        go.Scatter(x=epoch, y=values, name=label),
        secondary_y=on_secondary_axis,
    )

# Template options (default is 'plotly'):
#   'ggplot2', 'seaborn', 'simple_white', 'plotly', 'plotly_white', 'plotly_dark',
#   'presentation', 'xgridoff', 'ygridoff', 'gridon', 'none'
fig.update_layout(title_text="<b>Loss/Accuracy of Model</b>", template='plotly')

# Axis titles: shared x-axis, then one title per y-axis.
fig.update_xaxes(title_text="Epoch")
for axis_title, on_secondary_axis in (("Loss", False), ("Accuracy", True)):
    fig.update_yaxes(title_text=axis_title, secondary_y=on_secondary_axis)

fig.show()