In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader, random_split

import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/FutureWarning notices from pandas and torch.
warnings.filterwarnings("ignore")

# Plot with the "Malgun Gothic" font (presumably chosen so Korean labels render — confirm
# it is installed on the machine running the notebook).
plt.rcParams["font.family"] = "Malgun Gothic"
# Draw minus signs as ASCII hyphens instead of the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [65]:
class TitanicDataset(Dataset):
    """Map-style dataset that turns a raw Titanic passenger table into tensors.

    The frame passed in is copied, cleaned and feature-engineered once at
    construction time; individual samples are converted to tensors lazily in
    ``__getitem__``.

    Args:
        dataframe: Raw passenger table (the caller's frame is never modified).
        target_column: Name of the label column. Only used when ``is_train``
            is true; the column is then split off into ``targets``.
        transform: Optional callable applied to each sample's feature array
            before it is converted to a tensor. May also be assigned later
            through the ``transform`` attribute.
        is_train: When true (and ``target_column`` is given) samples are
            ``(features, target)`` pairs; otherwise only ``features``.

    Attributes:
        dataframe: The preprocessed, fully numeric frame.
        features: 2-D ``float64`` array of model inputs.
        targets: 1-D array of labels, or ``None`` when no labels are used.
    """

    def __init__(self, dataframe, target_column=None, transform=None, is_train=True):
        self.dataframe = dataframe.copy()  # keep the caller's frame untouched
        self.target_column = target_column
        self.transform = transform
        self.is_train = is_train

        self._preprocess()

        # Train and test frames differ: only the training frame carries labels.
        if self.is_train and target_column:
            # torch wants plain arrays rather than pandas objects
            self.targets = self.dataframe[target_column].values
            feature_frame = self.dataframe.drop(columns=[target_column])
        else:
            self.targets = None
            feature_frame = self.dataframe

        # Force one numeric dtype. A frame mixing bool/int/float columns would
        # otherwise become an object array that torch cannot convert.
        self.features = feature_frame.to_numpy(dtype=np.float64)

    # Leading underscore: internal helper, not meant to be called from outside.
    def _preprocess(self):
        """Clean ``self.dataframe`` and add engineered features.

        Every step is skipped when the column it needs is absent, so the same
        code handles both the training and the test frame.
        """
        df = self.dataframe

        # Identifier / free-text columns are not used as model inputs.
        df = df.drop(
            columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore"
        )

        # Impute by assignment. `df[col].fillna(..., inplace=True)` operates on
        # a temporary Series and leaves the frame unchanged under pandas
        # copy-on-write, which would let NaNs reach the int casts below.
        if "Age" in df.columns:
            df["Age"] = df["Age"].fillna(df["Age"].median())

        if "Embarked" in df.columns:
            df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

        if "Fare" in df.columns:
            df["Fare"] = df["Fare"].fillna(df["Fare"].median())

        # Family-derived features.
        if "SibSp" in df.columns and "Parch" in df.columns:
            df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
            df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

        # Age bucket. include_lowest keeps an age of exactly 0 in the first
        # bucket instead of producing NaN (which cannot be cast to int).
        if "Age" in df.columns:
            df["AgeGroup"] = pd.cut(
                df["Age"],
                bins=[0, 12, 18, 35, 60, 100],
                labels=[0, 1, 2, 3, 4],
                include_lowest=True,
            ).astype(int)

        # Fare quartile.
        # NOTE(review): the quartile edges are computed from this frame alone,
        # so a train frame and a test frame can end up with different bins.
        if "Fare" in df.columns:
            df["FareGroup"] = pd.qcut(
                df["Fare"], q=4, labels=[0, 1, 2, 3]
            ).astype(int)

        # One-hot encoding. dtype=int keeps the indicator columns numeric
        # (recent pandas defaults to bool); column order matches concat-then-drop.
        if "Sex" in df.columns:
            sex_dummies = pd.get_dummies(df["Sex"], drop_first=True, dtype=int)
            df = pd.concat([df.drop(columns=["Sex"]), sex_dummies], axis=1)

        if "Embarked" in df.columns:
            embarked_dummies = pd.get_dummies(
                df["Embarked"], drop_first=True, dtype=int
            )
            df = pd.concat([df.drop(columns=["Embarked"]), embarked_dummies], axis=1)

        # Any gaps left in numeric columns fall back to the column mean.
        df = df.fillna(df.mean(numeric_only=True))

        self.dataframe = df

    def __len__(self):
        """Return the number of passengers (rows) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return sample ``idx`` as tensors.

        Args:
            idx: Integer index, list of indices, or a tensor holding either.

        Returns:
            ``(features, target)`` when labels are available, otherwise just
            ``features``. ``features`` is ``float32``; ``target`` is ``int64``
            (a 0-dim tensor for a single integer index).
        """
        # Samplers may hand over tensor indices; convert to plain Python values.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        features = self.features[idx]

        if self.transform is not None:
            features = self.transform(features)

        features = torch.as_tensor(features, dtype=torch.float32)

        if self.is_train and self.targets is not None:
            target = torch.tensor(self.targets[idx], dtype=torch.long)
            return features, target
        return features

In [66]:
from sklearn.preprocessing import StandardScaler
# torch tensors do not turn boolean values into numbers automatically, so features are passed through this scaler first
class StandardScaleTransform:
    """Callable wrapper that applies a fitted ``StandardScaler`` to samples.

    Call ``fit`` once on the reference data, then use the instance as a
    function on either a single 1-D sample or a 2-D batch.
    """

    def __init__(self):
        self.fitted = False
        self.scaler = StandardScaler()

    def fit(self, data):
        """Learn the scaling statistics from ``data`` and return ``self``."""
        self.scaler.fit(data)
        self.fitted = True
        return self

    def __call__(self, sample):
        """Scale ``sample`` with the fitted scaler.

        A 1-D sample is scaled as a single row and returned 1-D again; any
        other input is handed to the scaler unchanged.

        Raises:
            ValueError: If ``fit`` has not been called yet.
        """
        # Refuse to run before the statistics have been learned.
        if not self.fitted:
            raise ValueError(
                "스케일러가 아직 학습되지 않았습니다. fit() 메서드를 먼저 호출하세요."
            )

        is_single_row = sample.ndim == 1
        batch = sample.reshape(1, -1) if is_single_row else sample
        scaled = self.scaler.transform(batch)
        return scaled.flatten() if is_single_row else scaled


In [67]:
# Read the raw tables (paths are relative to the notebook's working directory).
df_test = pd.read_csv("data/test.csv")
df_train = pd.read_csv("data/train.csv")

# Only the training table is given a label column to split off.
test_data = TitanicDataset(df_test, is_train=False)
train_data = TitanicDataset(df_train, target_column="Survived")

# Learn the scaling statistics from the training features; fit() returns the transform itself.
transform = StandardScaleTransform().fit(train_data.features)

# Both datasets share the fitted scaler, which runs per sample in __getitem__.
train_data.transform = test_data.transform = transform

In [68]:
# 80% of the labelled rows are used for training and the remaining 20% are held out
# for validation (the fractions were previously swapped, leaving only 20% to train on).
# The seeded generator makes the split identical on every Restart & Run All.
train_dataset, val_dataset = random_split(
    train_data, [0.8, 0.2], generator=torch.Generator().manual_seed(42)
)

In [69]:
batch_size = 32

# Only the training batches are reshuffled; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset=split, batch_size=batch_size, shuffle=False)
    for split in (val_dataset, test_data)
)

In [70]:
# Width of the 2-D feature matrix = number of inputs the network must accept.
input_size = train_data.features.shape[-1]
input_size

12

In [71]:
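# Notes on one hidden block of the network (kept as comments, not executed):
# super(TitanicNet, self).__init__()
#         nn.Linear(input_size, 256) # 12 -> 256
#         nn.BatchNorm1d(256) # batch normalization: keeps the inputs to a layer from becoming skewed to one side, too spread out, or too narrow
#         nn.ReLU() # activation
#         nn.Dropout(0.5) # randomly drops units -> surprisingly this often improves performance, although it can reportedly hurt it too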

In [72]:
class TitanicNet(nn.Module):
    """Fully connected classifier built from repeated hidden blocks.

    Each hidden block is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final
    ``Linear`` layer maps the last hidden width to ``num_classes`` outputs.
    No softmax is applied, so ``forward`` returns unnormalised scores.

    Args:
        input_size: Number of input features per sample.
        hidden_sizes: Width of each hidden block, in order. An empty sequence
            yields a single linear layer.
        dropout_rate: Drop probability used in every hidden block.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, hidden_sizes=(256, 128, 64), dropout_rate=0.3,
                 num_classes=2):
        super().__init__()
        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            # One hidden block: affine map, normalisation, non-linearity, regularisation.
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ])
            prev_size = hidden_size

        # Output layer.
        layers.append(nn.Linear(prev_size, num_classes))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the output scores."""
        return self.network(x)

In [73]:
# The network's input width must match the number of feature columns.
input_size = train_data.features.shape[-1]
model = TitanicNet(input_size=input_size)

In [74]:
# Bare last expression: the notebook renders the module's repr (its layer stack).
model

TitanicNet(
  (network): Sequential(
    (0): Linear(in_features=12, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=64, out_features=2, bias=True)
  )
)