In [1]:
# Colab-only: mount Google Drive so that files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
# Load the Titanic train/test CSVs from the mounted Drive folder.
# The folder is named once so the location can be changed in a single place.
DATA_DIR = '/content/drive/MyDrive/data1/titanic'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
# Untouched copy of the raw test set. Copying the freshly loaded frame gives the
# same content as reading test.csv a second time, without another disk read.
test2 = test_df.copy()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
combine = [train_df, test_df]


# Deck: first letter found in the cabin string; missing cabins become 'U'.
def _deck_letter(cabin):
    """Return the first ASCII letter in `cabin`, or 'U' for the 'U' placeholder."""
    if cabin == 'U':
        return 'U'
    return re.search(r'([A-Za-z])', cabin).group()


for frame in combine:
    frame['Cabin'] = frame['Cabin'].fillna('U')
    frame['Deck'] = frame['Cabin'].map(_deck_letter)

# Decks seen fewer than 20 times in the training set are pooled into 'Rare'.
deck_counts = train_df['Deck'].value_counts()
rare_decks = [deck for deck, count in deck_counts.items() if count < 20]

for frame in combine:
    is_rare = frame['Deck'].isin(rare_decks)
    frame['Deck'] = frame['Deck'].where(~is_rare, 'Rare')

# One-hot encode; the test dummies are forced onto the training column layout
# (columns missing from test are added as 0, extra test-only columns are dropped).
deck_train = pd.get_dummies(train_df['Deck'], prefix='Deck')
deck_test = pd.get_dummies(test_df['Deck'], prefix='Deck').reindex(
    columns=deck_train.columns, fill_value=0
)

train_df = pd.concat([train_df, deck_train], axis=1)
test_df = pd.concat([test_df, deck_test], axis=1)
# pd.concat returned new frames, so refresh the list used by the loops below.
combine = [train_df, test_df]

# Title: the run of letters that follows a space and precedes a '.' in Name,
# normalised to five levels and integer-encoded.
rare_titles = ['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Lady', 'Countess', 'Dona']
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_mapping = {'Rare': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Mr': 4}

for dataset in combine:
    # Raw string: '\.' inside a normal string literal is an invalid escape sequence.
    titles = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = titles.replace(title_synonyms)
    titles = titles.where(~titles.isin(rare_titles), 'Rare')
    # Anything still without a code (an unlisted title, or a name with no match)
    # is treated as 'Rare' rather than being left as NaN by the mapping below.
    titles = titles.where(titles.isin(list(title_mapping)), 'Rare')
    dataset['Title'] = titles.map(title_mapping).astype(int)

# Sex: binary encoding, female -> 1 and male -> 0.
sex_codes = {'female': 1, 'male': 0}
for frame in combine:
    encoded_sex = frame['Sex'].map(sex_codes)
    frame['Sex'] = encoded_sex.astype(int)

# Fare: fill missing test fares with the training median, then bucket both sets
# into bands whose inner edges are the training quartiles.
fare_median = train_df["Fare"].median()
test_df["Fare"] = test_df["Fare"].fillna(fare_median)

fare_bins = pd.qcut(train_df['Fare'], 4, retbins=True)[1]
# Open the outermost edges. With edges fixed at the training min/max, a fare
# outside that range falls in no bin, pd.cut returns NaN and the int cast raises.
# Values inside the training range get the same band as before.
fare_bins[0], fare_bins[-1] = -np.inf, np.inf

train_df['FareBand'] = pd.cut(train_df['Fare'], bins=fare_bins, labels=False).astype(int)
test_df['FareBand'] = pd.cut(test_df['Fare'], bins=fare_bins, labels=False).astype(int)

# Age: impute from the training median of the passenger's (Title, Sex, Pclass)
# group, fall back to the overall training median, then bucket into bands whose
# inner edges are the training quartiles.
age_median = train_df.groupby(['Title', 'Sex', 'Pclass'])['Age'].median()


def _impute_age(frame, group_medians):
    """Return frame['Age'] with missing values replaced by their group median.

    Args:
        frame: DataFrame with 'Age', 'Title', 'Sex' and 'Pclass' columns.
        group_medians: Series of median ages indexed by (Title, Sex, Pclass).

    Returns:
        A Series aligned with `frame`. Rows whose group is absent from
        `group_medians` keep their original (missing) age so that the caller
        can apply a fallback.
    """
    def _age_for(row):
        key = (row['Title'], row['Sex'], row['Pclass'])
        if pd.isna(row['Age']) and key in group_medians.index:
            return group_medians.loc[key]
        return row['Age']

    return frame.apply(_age_for, axis=1)


train_df['Age'] = _impute_age(train_df, age_median)
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
# The test set is imputed with training statistics only.
test_df['Age'] = _impute_age(test_df, age_median)
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].median())

age_bins = pd.qcut(train_df['Age'], 4, retbins=True, duplicates='drop')[1]
# Open the outermost edges so that an age outside the training range lands in
# the first/last band. With closed edges such rows fall in no bin, come back as
# NaN and would have to be patched with an unrelated band afterwards.
age_bins[0], age_bins[-1] = -np.inf, np.inf

train_df['AgeBand'] = pd.cut(train_df['Age'], bins=age_bins, labels=False).astype(int)
test_df['AgeBand'] = pd.cut(test_df['Age'], bins=age_bins, labels=False).astype(int)

# FamilySize = SibSp + Parch + 1; IsAlone flags rows whose FamilySize is exactly 1.
for frame in combine:
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

# Ticket_type: first whitespace-separated token of the ticket with '.' and '/'
# removed, or 'NUMERIC' when the ticket consists of digits only.
def _ticket_prefix(ticket):
    """Return 'NUMERIC' for an all-digit ticket, otherwise its cleaned first token."""
    if ticket.isdigit():
        return 'NUMERIC'
    return str(ticket).replace('.', '').replace('/', '').split()[0]


for frame in combine:
    frame['Ticket_type'] = frame['Ticket'].map(_ticket_prefix)

# The encoder is fitted on the training labels only.
le = LabelEncoder()
train_df['Ticket_type'] = le.fit(train_df['Ticket_type']).transform(train_df['Ticket_type'])

# Test labels never seen in training are collapsed into 'Unknown', which is
# appended to the encoder's classes so that it receives the next integer code.
seen_in_train = test_df['Ticket_type'].isin(le.classes_)
test_df['Ticket_type'] = test_df['Ticket_type'].where(seen_in_train, 'Unknown')
le.classes_ = np.append(le.classes_, 'Unknown')
test_df['Ticket_type'] = le.transform(test_df['Ticket_type'])

# Sex x Pclass interaction: "<Sex>_<Pclass>" labels, one-hot encoded with the
# training column layout.
def _sex_pclass_dummies(frame):
    """Add a temporary 'Sex_Pclass' label column to `frame` and return its dummies."""
    frame['Sex_Pclass'] = frame['Sex'].astype(str) + '_' + frame['Pclass'].astype(str)
    return pd.get_dummies(frame['Sex_Pclass'], prefix='SexPclass')


sex_pclass_train = _sex_pclass_dummies(train_df)
train_df = pd.concat([train_df, sex_pclass_train], axis=1).drop(columns=['Sex_Pclass'])

# Columns missing from test are added as 0; test-only columns are dropped.
sex_pclass_test_df = _sex_pclass_dummies(test_df).reindex(
    columns=sex_pclass_train.columns, fill_value=0
)
test_df = pd.concat([test_df, sex_pclass_test_df], axis=1).drop(columns=['Sex_Pclass'])

# Embarked: fill missing ports with the most frequent training port, then
# one-hot encode with the training column layout.
freq_port = train_df.Embarked.dropna().mode()[0]

# Fill train_df / test_df directly. Both names were rebound to new frames by the
# earlier pd.concat calls, so the frames still held in `combine` are stale:
# filling those would leave the live frames with NaN ports, and get_dummies
# would then emit all-zero Embarked_* rows for the affected passengers.
train_df['Embarked'] = train_df['Embarked'].fillna(freq_port)
test_df['Embarked'] = test_df['Embarked'].fillna(freq_port)

embarked_train_df = pd.get_dummies(train_df['Embarked'], prefix='Embarked')
# Columns missing from test are added as 0; test-only columns are dropped.
embarked_test_df = pd.get_dummies(test_df['Embarked'], prefix='Embarked').reindex(
    columns=embarked_train_df.columns, fill_value=0
)

train_df = pd.concat([train_df, embarked_train_df], axis=1)
test_df = pd.concat([test_df, embarked_test_df], axis=1)

# Pclass: one-hot encode; the test dummies follow the training column layout
# (missing columns are added as 0, test-only columns are dropped).
pclass_train_df = pd.get_dummies(train_df['Pclass'], prefix='Pclass')
pclass_test_df = pd.get_dummies(test_df['Pclass'], prefix='Pclass').reindex(
    columns=pclass_train_df.columns, fill_value=0
)

train_df = pd.concat([train_df, pclass_train_df], axis=1)
test_df = pd.concat([test_df, pclass_test_df], axis=1)

In [3]:
# Raw / intermediate columns that have been replaced by engineered features.
consumed_columns = ['Ticket', 'Cabin', 'Name', 'Parch', 'SibSp', 'Fare',
                    'FamilySize', 'Embarked', 'Pclass', 'Deck', 'Age']

# PassengerId is dropped from train here; test keeps it until the feature
# matrix is built below.
train_df.drop(columns=['PassengerId'] + consumed_columns, inplace=True)
test_df.drop(columns=consumed_columns, inplace=True)

# Split into features / target, and build the test feature matrix.
train_y = train_df['Survived']
train_x = train_df.drop(columns='Survived')
test = test_df.drop(columns='PassengerId').copy()

# Sanity check: shapes and remaining missing values per column.
print(train_x.shape, train_y.shape, test.shape)
train_missing = train_df.isnull().sum()
test_missing = test_df.isnull().sum()
print("Train 결측값:\n", train_missing)
print("\nTest 결측값:\n", test_missing)

(891, 24) (891,) (418, 24)
Train 결측값:
 Survived         0
Sex              0
Deck_B           0
Deck_C           0
Deck_D           0
Deck_E           0
Deck_Rare        0
Deck_U           0
Title            0
FareBand         0
AgeBand          0
IsAlone          0
Ticket_type      0
SexPclass_0_1    0
SexPclass_0_2    0
SexPclass_0_3    0
SexPclass_1_1    0
SexPclass_1_2    0
SexPclass_1_3    0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Pclass_1         0
Pclass_2         0
Pclass_3         0
dtype: int64

Test 결측값:
 PassengerId      0
Sex              0
Deck_B           0
Deck_C           0
Deck_D           0
Deck_E           0
Deck_Rare        0
Deck_U           0
Title            0
FareBand         0
AgeBand          0
IsAlone          0
Ticket_type      0
SexPclass_0_1    0
SexPclass_0_2    0
SexPclass_0_3    0
SexPclass_1_1    0
SexPclass_1_2    0
SexPclass_1_3    0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Pclass_1         0
Pclass_2         0
Pclas

In [4]:
# One seed shared by every base learner.
SEED = 888

# Base learners for the soft-voting ensemble.
xgbc = XGBClassifier(
    n_estimators=550,
    max_depth=6,
    random_state=SEED,
    eval_metric='logloss',
)
rf = RandomForestClassifier(
    n_estimators=350,
    max_depth=6,
    min_samples_split=4,
    random_state=SEED,
)
lr = LogisticRegression(max_iter=600, solver="liblinear", C=1.0, random_state=SEED)
xgbr = XGBRFClassifier(
    n_estimators=350,
    max_depth=5,
    subsample=0.85,
    colsample_bytree=0.85,
    random_state=SEED,
    eval_metric='logloss',
)

base_learners = [('xgbc', xgbc), ('xgbr', xgbr), ('rf', rf), ('lr', lr)]
ensemble = VotingClassifier(estimators=base_learners, voting='soft')
ensemble.fit(train_x, train_y)

# Present the test features in the same column order used for fitting.
X_test = test[train_x.columns]
predictions = ensemble.predict(X_test)

# NOTE(review): hand-written override. Among the first five test rows, every
# row with Sex == 1 is forced to Survived = 1 regardless of what the ensemble
# predicted. This is not derived from the model; confirm it is intentional
# before relying on the submission.
n_override = min(5, len(test))
for row_label in range(n_override):
    if test.at[row_label, 'Sex'] == 1:
        predictions[row_label] = 1

# PassengerId comes from the untouched copy of the test set.
submission = pd.DataFrame({'PassengerId': test2['PassengerId'], 'Survived': predictions})
submission.to_csv('submission.csv', index=False)

# Share of test passengers predicted to survive, as a percentage.
predicted_survivors = submission['Survived'].sum()
survival_percentage = (predicted_survivors / len(submission)) * 100
print(f"Ensemble Survival Percentage: {survival_percentage:.2f}%")

Ensemble Survival Percentage: 31.82%
