전처리

In [3]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATA/train_titanic.csv')
X_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DATA/test_titanic.csv')

dataset_title = [i.split(',')[1].split('.')[0].strip() for i in dataset['Name']]
dataset['Title'] = pd.Series(dataset_title)
dataset['Title'] = dataset['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')

dataset_title = [i.split(',')[1].split('.')[0].strip() for i in X_test['Name']]
X_test['Title'] = pd.Series(dataset_title)
X_test['Title'] = X_test['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')

dataset['FamilyS'] = dataset['SibSp'] + dataset['Parch'] + 1
X_test['FamilyS'] = X_test['SibSp'] + X_test['Parch'] + 1

# 나머지 결측치 처리
dataset = dataset.fillna({
    'Age' : dataset.groupby("Title")["Age"].transform("median"),
    'Embarked': dataset['Embarked'].mode()[0]
})
X_test = X_test.fillna({
    'Age' : X_test.groupby("Title")["Age"].transform("median"),
    'Embarked': dataset['Embarked'].mode()[0],
    'Fare': dataset['Fare'].median()
})

dataset = dataset.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1)
X_test_passengers = X_test['PassengerId']
X_test = X_test.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1)

X_train = dataset.iloc[:, 1:9].values
Y_train = dataset.iloc[:, 0].values
X_test = X_test.values

for _ in [1, 4, 5]:
  X_train[:, _] = LabelEncoder().fit_transform(X_train[:, _])
  X_test[:, _] = LabelEncoder().fit_transform(X_test[:, _])

ct = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), [0, 1, 4, 5, 6]), # 범주형
        ('num', StandardScaler(), [2, 3]) # 수치형
    ],
    remainder='passthrough'
)

X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
    X_test = X_test.toarray()

X_train = np.array(X_train, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)


NN모듈

In [4]:
#트레이닝

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from sklearn.utils import shuffle
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
import random

def set_seed(seed=55):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(55)

input_size = X_train.shape[1]

class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(32, 2)
        )

    def forward(self, x):
        return self.layer(x)

net = Net(input_size)

batch_size = 64
num_epochs = 80
learning_rate = 0.0005

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=1e-5)

g = torch.Generator()
g.manual_seed(55)
train_data = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(Y_train))
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, generator=g)

for epoch in range(num_epochs):
    net.train()
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = net(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/80], Loss: 0.3681
Epoch [20/80], Loss: 0.3560
Epoch [30/80], Loss: 0.6087
Epoch [40/80], Loss: 0.5026
Epoch [50/80], Loss: 0.2904
Epoch [60/80], Loss: 0.2639
Epoch [70/80], Loss: 0.3005
Epoch [80/80], Loss: 0.2576


In [5]:
net.eval()

with torch.no_grad():
    test_inputs = torch.FloatTensor(X_test)
    test_outputs = net(test_inputs)

    _, test_preds = torch.max(test_outputs, 1)
    survived = test_preds.numpy()

    nn_probs = torch.softmax(test_outputs, dim=1)[:, 1].numpy()
print("예측이 완료되었습니다.")
import csv

submission = [['PassengerId', 'Survived']]
for i in range(len(survived)):
    submission.append([X_test_passengers[i], survived[i]])

with open('submission_nn.csv', 'w') as submissionFile:
    writer = csv.writer(submissionFile)
    writer.writerows(submission)

print('Writing Complete!')

예측이 완료되었습니다.
Writing Complete!


랜덤포레스트 & XGBoost


In [16]:
#트레이닝

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=55,
    n_jobs=-1
    )

rf_model.fit(X_train, Y_train)

xgb_model = XGBClassifier(
    n_estimators=50,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=55
)
# estimator 100, learningrate 0.05에서 낮췄더니 0.76315 -> 0.7799
xgb_model.fit(X_train, Y_train)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [21]:
rf_probs = rf_model.predict_proba(X_test)[:, 1]
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]


#앙상블, 랜덤포레스트, XGBoost
xgb_rf_labels = (((rf_probs * 0.5) + (xgb_probs * 0.5)) > 0.5).astype(int)
xgb_rf_nn_labels = ((rf_probs * 0.3) + (xgb_probs * 0.5) + (nn_probs * 0.2) > 0.5).astype(int)
rf_labels = (rf_probs > 0.5).astype(int)
xgb_labels = (xgb_probs > 0.5).astype(int)


test

In [23]:
import csv

def save_submission(preds, name):
    submission = pd.DataFrame({
        "PassengerId": X_test_passengers,
        "Survived": preds.astype(int)
    })
    submission.to_csv(f'submission_{name}.csv', index=False)
    print(f'Saved: submission_{name}.csv')

save_submission(xgb_rf_labels, 'xgb_rf')
save_submission(xgb_rf_nn_labels, 'xgb_rf_nn')
save_submission(rf_labels, 'rf')
save_submission(xgb_labels, 'xgb')

#xgb -> 0.7799
#rf -> 0.77751
#xgb_rf -> 0.78708
#xgb_rf_nn -> 77511

Saved: submission_xgb_rf.csv
Saved: submission_xgb_rf_nn.csv
Saved: submission_rf.csv
Saved: submission_xgb.csv
