# 타이타닉 생존자 예측 (v1)
- 입력: 데이터의 독립변인 전부 (적은 게 좋음)
- 결과: 생존자를 예측(0/1)

## 과정
1. 데이터 불러오기
2. 데이터 전처리
3. 데이터 분할
4. 데이터 정규화
------
5. 학습/검증/테스트 데이터 (텐서로 변경해야 함)
6. 모델 생성 (a. 베이스 라인, b. 개선)
7. 학습
8. 예측
9. 평가
* (5번부터 PyTorch 사용)

In [64]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader

import torch.nn as nn

In [65]:
# 1. 데이터 불러오기
def load_data(file_path):
    df = pd.read_csv(file_path)
    # [확인해야 할 것]
    # 1. 형태(행, 열 크기)
    # - 열(독립변인, 측정값)
    # - 행(종속변인, 관측값)
    # 2. 자료형(info)
    # 3. 컬럼 이름
    print(f'\n결측치 : {df.isnull().sum()}')
    return df

In [66]:
df = load_data('data/titanic/train.csv')


결측치 : PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [67]:
df.columns.tolist()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [68]:
# 2. 데이터 전처리
def preporcess_data(df):
    df = df.copy()
    target_col = 'Survived'
    
    # 타깃 변수가 결측인 행 제거
    df = df.dropna(subset=[target_col])      
    columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    df = df.drop(columns=columns_to_drop)
    
    # 데이터 값 숫자로 바꾸기
    if 'Sex' in df.columns:
        df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    if 'Embarked' in df.columns:
        df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    feature_cols = [col for col in numeric_cols if col != target_col]
    X = df[feature_cols].copy()
    y = df[target_col].copy()
    X = X.fillna(X.mean())
    
    return X.values, y.values, feature_cols

In [69]:
# 3. 데이터 분할
def split_data(X, y, train_ratio=0.8):
    n_samples = len(X)
    n_train = int(n_samples * train_ratio)

    indices = np.random.permutation(n_samples)  # 셔플 필수
    train_indices = indices[:n_train]
    test_indices = indices[n_train:]

    X_train = X[train_indices]
    X_test = X[train_indices]
    y_train = y[train_indices]
    y_test = y[test_indices]

    return X_train, X_test, y_train, y_test

In [70]:
# 4. 데이터 정규화
# (평균 = 0, 표준편차 = 1)

def normalize_features(X_train, X_test):
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0) + 1e-8

    X_train_norm = (X_train - mean) / std
    X_test_norm = (X_test - mean) / std

    return X_train_norm, X_test_norm, mean, std

In [71]:
# 5. 학습/검증/테스트 데이터
class TitanicDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)    # 분류 문제라서 LongTensor 사용
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]

In [80]:
# 6. 모델 생성
class LinearClassficationModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(LinearClassficationModel, self).__init__()
        # self.linear = nn.Linear(input_dim, num_classes)
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, num_classes)
        )

    def forward(self, x):
        return self.linear(x)

In [73]:
# 7. 학습
def train_model(model, train_loader, criterion, optimizer, epochs=100):
    model.train()
    loss_history = []
    acc_history = []
    
    for epoch in range(epochs):
        epoch_loss = 0.0
        correct = 0
        total = 0
        
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()   # 기울기 0으로 고정
            predictions = model(batch_X)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()

            # 측정
            epoch_loss += loss.item()
            _, predicted = torch.max(predictions.data, 1)   # 열을 없애고 → 행마다 결과 1개
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

        avg_loss = epoch_loss / len(train_loader)
        acc = 100 * correct / total
        loss_history.append(avg_loss)
        acc_history.append(acc)
        if (epoch + 1) % 10 == 0:
            print(f'{epoch + 1} : {avg_loss}, {acc}')
            
    return loss_history, acc_history

In [None]:
# 9. 평가
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    actuals = []
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            pred = model(batch_X)
            _, predicted = torch.max(pred.data, 1)
            predictions.extend(predicted.numpy())
            actuals.extend(batch_y.numpy())

            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    accuracy = 100 * correct / total
    
    predictions = np.array(predictions)
    actuals = np.array(actuals)

    print(f'\n=== 테스트 세트 평가 ===')
    print(f'Accuarcy: {accuracy}')
    print(f'Correct: {correct}/{total}')

    return predictions, actuals, accuracy

In [79]:
# 1. 데이터 불러오기
df = load_data('data/titanic/train.csv')
# 2. 데이터 전처리
X, y, feature_cols = preporcess_data(df)
# 3. 데이터 분할
X_train, X_test, y_train, y_test = split_data(X, y)
# 4. 데이터 정규화
X_train_norm, X_test_norm, mean, std = normalize_features(X_train, X_test)
# 5. 학습/검증/테스트 데이터
train_dataset = TitanicDataset(X_train_norm, y_train)
test_dataset = TitanicDataset(X_test_norm, y_test)

batch_size = 32     # 하이퍼 파라미터
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# 6. 모델 생성
# 초기 데이터의 크기와 결과에 대해 미리 알고 있어야 함
input_dim = X_train_norm.shape[1]
num_classes = 2
model = LinearClassficationModel(input_dim, num_classes)
# 7. 학습
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_model(model, train_loader, criterion, optimizer)
# 9. 평가
predictions, actuals, accuracy = evaluate_model(model, test_loader)


결측치 : PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
10 : 0.7136646146359651, 46.91011235955056
20 : 0.5499602698761484, 78.51123595505618
30 : 0.4976226842921713, 78.08988764044943
40 : 0.4748382412868997, 79.49438202247191
50 : 0.45735851707665814, 79.21348314606742
60 : 0.43828727499298425, 79.49438202247191
70 : 0.44423243144284125, 79.21348314606742
80 : 0.4351071229447489, 79.35393258426966
90 : 0.44191490696824115, 79.21348314606742
100 : 0.4483886970126111, 79.21348314606742


IndexError: index 179 is out of bounds for dimension 0 with size 179

In [None]:
type(X_train_norm)

numpy.ndarray

# 연습 1. 배치사이즈 4(데이터의 개수), (강아지, 고양이, 코끼리)를 분류하는 모델로 가정

(1) 모델의 예측과 오차(predictions)
데이터1: [0.1, 0.8, 0.1]    -> 고양이 1
데이터2: [0.2, 0.2, 0.6]    -> 코끼리 2
데이터3: [0.7, 0.2, 0.1]    -> 강아지 0
데이터4: [0.3, 0.4, 0.3]    -> 고양이 1

(2) 실제정답: [1, 2, 1, 1]

(3) 학습평가(predicted) [1, 2, 0, 1] => [1, 1, 0, 1]

(4) 평가(correct)

(5) 오차누적(0.5)