<a href="https://colab.research.google.com/github/ljy9969/Study/blob/master/22-04-05%20Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Conv2D, MaxPooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Outliers are genuine observations, so they are left untouched; only missing
# values are imputed below.

# Load the raw training data and discard the columns that are not used as features.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TitanicData/train.csv')
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'])

# Combine SibSp and Parch into a single 'Family' column (their sum) and drop
# the two originals.
df = df.assign(Family=df['SibSp'] + df['Parch']).drop(columns=['SibSp', 'Parch'])

# Impute missing values: the constant 'Q' for Embarked, the column mean for Age.
# NOTE(review): 'Q' is a fixed choice rather than the most frequent port -- confirm
# that this is intended.
df = df.fillna({'Embarked': 'Q', 'Age': df['Age'].mean()})

# Convert the string-valued columns to integer codes.
gender_string = {'male': 0, 'female': 1}
embarked_string = {'S': 0, 'C': 1, 'Q': 2}
df = df.assign(
    Sex=df['Sex'].map(gender_string),
    Embarked=df['Embarked'].map(embarked_string),
)

def age_category(age):
    """Bucket an age into an ordinal code.

    Returns 0 for 0 <= age < 25, 1 for 25 <= age < 50, and 2 for every other
    value. "Every other value" means 50 and above, but also negatives and NaN,
    because those fail both range checks.
    """
    if 0 <= age < 25:
        return 0
    if 25 <= age < 50:
        return 1
    return 2
    
# Replace each age with its bucket code.
df = df.assign(Age=df['Age'].map(age_category))

# Data split: hold out 30% of the rows, stratified on the label, with a fixed seed.
train_x_data, test_x_data, train_t_data, test_t_data = train_test_split(
    df.drop(columns='Survived'),
    df['Survived'],
    test_size=0.3,
    random_state=1,
    stratify=df['Survived'],
)

# Normalisation: the min/max range is learned from the training features only
# and then applied unchanged to both splits.
scaler = MinMaxScaler().fit(train_x_data)
norm_train_x_data, norm_test_x_data = (
    scaler.transform(part) for part in (train_x_data, test_x_data)
)

In [None]:
# scikit-learn implementation
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(C=1000).fit(norm_train_x_data, train_t_data)

# Score the fitted model on the held-out split and report it.
sklearn_result = model.score(norm_test_x_data, test_t_data)
print('Sklearn 정확도 : {}'.format(sklearn_result))  # 0.7947761194029851

Sklearn 정확도 : 0.7947761194029851


In [None]:
# TensorFlow 2.x implementation: a single sigmoid unit over the scaled features.

# Seed the Python, NumPy and TensorFlow random generators so that repeated
# runs of this cell are comparable. Without a seed the reported accuracy
# changes on every run.
tf.keras.utils.set_random_seed(1)

keras_model = Sequential()

# Input layer: take the feature count from the training matrix instead of
# hard-coding it, so a change in preprocessing cannot silently mismatch it.
keras_model.add(Flatten(input_shape=(norm_train_x_data.shape[1],)))
# Output layer
keras_model.add(Dense(units=1, activation='sigmoid'))

keras_model.compile(optimizer=Adam(learning_rate=1e-3),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])

# Early stopping: stop once val_loss has not improved by at least 0.001 for
# 5 epochs, and roll back to the best weights seen.
es = EarlyStopping(monitor='val_loss',
                   min_delta=0.001,
                   patience=5,
                   verbose=1,
                   mode='auto',
                   restore_best_weights=True)

# 30% of the training rows are set aside as validation data for early stopping.
keras_model.fit(norm_train_x_data, train_t_data, epochs=1000, batch_size=100, validation_split=0.3, verbose=0, callbacks=[es])

# evaluate() returns [loss, accuracy] because 'accuracy' is the only metric.
keras_result = keras_model.evaluate(norm_test_x_data, test_t_data)
print('Tensorflow 정확도 : {}'.format(keras_result))  # an earlier, unseeded run gave loss: 0.4959 - accuracy: 0.8097

Restoring model weights from the end of the best epoch: 480.
Epoch 485: early stopping
Tensorflow 정확도 : [0.4958873689174652, 0.8097015023231506]


In [None]:
# Show the first rows of the held-out features and of the matching labels.
display(test_x_data.head(), test_t_data.head())

Unnamed: 0,Pclass,Sex,Age,Embarked,Family
433,3,0,0,0,0
221,2,0,1,0,0
217,2,0,1,0,1
376,3,1,0,0,0
447,1,0,1,0,0


433    0
221    0
217    0
376    1
447    1
Name: Survived, dtype: int64

In [None]:
# Load the Kaggle test set once. `test` keeps the untouched frame (its
# PassengerId column is needed later for the submission); `test_df` is the
# working copy that is preprocessed like the training data. drop() returns a
# new frame, so the edits to `test_df` below do not alter `test`.
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TitanicData/test.csv')
# test.shape was recorded as (418, 11)

test_df = test.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'], axis=1)

# Combine SibSp and Parch into a single 'Family' column and drop the originals.
test_df['Family'] = test_df['SibSp'] + test_df['Parch']
test_df = test_df.drop(['SibSp', 'Parch'], axis=1)

# Impute missing values: the constant 'Q' for Embarked, the column mean for Age.
# NOTE(review): the mean used here is that of the test file, not the one used
# for the training rows -- confirm this is acceptable.
test_df['Embarked'] = test_df['Embarked'].fillna('Q')
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())

# Convert the string-valued columns to integer codes.
gender_string = {'male': 0, 'female': 1}
test_df['Sex'] = test_df['Sex'].map(gender_string)

embarked_string = {'S': 0, 'C': 1, 'Q': 2}
test_df['Embarked'] = test_df['Embarked'].map(embarked_string)

def age_category(age):
    """Bucket an age into an ordinal code.

    Returns 0 for 0 <= age < 25, 1 for 25 <= age < 50, and 2 for every other
    value. "Every other value" means 50 and above, but also negatives and NaN,
    because those fail both range checks.
    """
    if 0 <= age < 25:
        return 0
    if 25 <= age < 50:
        return 1
    return 2
    
# Replace each age with its bucket code.
test_df['Age'] = test_df['Age'].map(age_category)

# Normalisation: reuse the `scaler` fitted on the training features in the
# preprocessing cell. Fitting a fresh MinMaxScaler on the test set would map
# the inputs onto a different range from the one the model was trained on,
# and would also overwrite the training scaler. `test_df` has the same columns
# in the same order as the training features (Pclass, Sex, Age, Embarked, Family).
norm_test_df = scaler.transform(test_df)

# One sigmoid output per passenger.
prediction = keras_model.predict(norm_test_df)

print(prediction.shape, prediction.ndim)

# ravel() flattens the 2-D prediction array into one value per row.
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": prediction.ravel()
    })

def aliveordead(Survived):
    """Threshold a predicted value at 0.5.

    Returns 0 when 0 <= Survived < 0.5 and 1 for anything else. "Anything else"
    means 0.5 or more, but also negatives and NaN, which fail the range check.
    """
    return 0 if 0 <= Survived < 0.5 else 1
    
# Turn every predicted value into a hard 0/1 label.
submission = submission.assign(Survived=submission['Survived'].map(aliveordead))

# Write the submission file without the index, then reload it to inspect what
# was actually saved.
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/TitanicData/submission.csv', index=False)
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/TitanicData/submission.csv')
submission.head()

(418, 1) 2


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
