In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [4]:
# Numeric columns that are scaled
numeric_features = [
    'Fare',
    'SibSp',
    'Parch',
]
# Numeric column kept in a separate list because it is treated as having missing values
numeric_features_with_missing = [
    'Age',
]

In [5]:
# Columns to be one-hot encoded
categorical_features = [
    'Sex',
    'Pclass',
]
# Categorical column kept in a separate list because it is treated as having missing values
categorical_features_with_missing = [
    'Embarked',
]

In [6]:
# Numeric pipeline with a single step:
#  - StandardScaler: standardize to zero mean and unit variance
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

In [7]:
# Numeric pipeline that imputes before scaling:
#  - SimpleImputer: replace missing values with the median
#  - StandardScaler: standardize the imputed values
numeric_impute_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [8]:
# Categorical pipeline with a single step:
#  - OneHotEncoder: turn each category into 0/1 dummy columns
#    (handle_unknown='ignore' avoids an error when the test data
#    contains a category that was not seen during fitting)
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [9]:
# Categorical pipeline that imputes before encoding:
#  - SimpleImputer: replace missing values with the most frequent value
#  - OneHotEncoder: one-hot encode the imputed column
categorical_impute_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [10]:

# Combine the per-column-group pipelines into one preprocessor.
# Each entry is (name, pipeline, list of columns the pipeline is applied to).
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('num_impute', numeric_impute_transformer, numeric_features_with_missing),
        ('cat', categorical_transformer, categorical_features),
        ('cat_impute', categorical_impute_transformer, categorical_features_with_missing),
    ],
    # Columns not listed above are passed through unchanged
    # (original author's note: there are currently none).
    remainder='passthrough',
)

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [15]:
# Load the training data from the CSV file in the working directory
train_df = pd.read_csv(filepath_or_buffer='train.csv')


In [16]:
# Build the feature frame by dropping the target and the excluded columns;
# the target vector is the 'Survived' column.
X = train_df.drop(
    columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
)
y = train_df['Survived']

In [18]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=79,
)

In [19]:
# Chain the preprocessor and the classifier into a single pipeline
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])


In [20]:
# Fit the pipeline (preprocessing + classifier) on the training split,
# printing a start/finish message around the call.
print("파이프라인 학습 시작...")
full_pipeline.fit(X=X_train, y=y_train)
print("파이프라인 학습 완료.")

파이프라인 학습 시작...
파이프라인 학습 완료.


In [21]:
# Predict labels for the held-out split
y_pred = full_pipeline.predict(X=X_test)

In [22]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

# 1. Compute the headline metrics on the held-out split
accuracy = accuracy_score(y_test, y_pred)
f1_report = classification_report(
    y_test,
    y_pred,
    target_names=['Died(0)', 'Survived(1)'],
)
# ROC-AUC is scored on column index 1 of predict_proba
# (presumably the probability of the Survived=1 class — confirm via classes_).
auc_score = roc_auc_score(
    y_test,
    full_pipeline.predict_proba(X_test)[:, 1],
)

# Report each metric under its own header; numbers use 4 decimal places
print("--- 1. Accuracy ---")
print(format(accuracy, ".4f"))

print("\n--- 2. F1-Score (Classification Report) ---")
print(f1_report)

print("\n--- 3. ROC-AUC Score ---")
print(format(auc_score, ".4f"))

--- 1. Accuracy ---
0.8045

--- 2. F1-Score (Classification Report) ---
              precision    recall  f1-score   support

     Died(0)       0.82      0.88      0.85       113
 Survived(1)       0.76      0.68      0.72        66

    accuracy                           0.80       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179


--- 3. ROC-AUC Score ---
0.8595
