In [1]:
# Data manipulation
from seaborn import load_dataset
import numpy as np
import pandas as pd
from functions import calculate_roc_auc
# Show floats with 4 decimal places when pandas objects are displayed.
pd.options.display.precision = 4
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): presumably needed because the custom transformers assign into frame
# slices — confirm, since this also hides genuine chained-assignment mistakes.
pd.options.mode.chained_assignment = None  

# Machine learning pipeline
from sklearn.model_selection import train_test_split
from transformers import FeatureExtractor, Imputer, CardinalityReducer, Encoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load the seaborn Titanic sample, discard the columns listed below, and store
# `deck` with object dtype instead of the dtype it is loaded with.
columns = ['alive', 'class', 'embarked', 'who', 'alone', 'adult_male']
df = (
    load_dataset('titanic')
    .drop(columns=columns)
    .assign(deck=lambda frame: frame['deck'].astype('object'))
)

# Quick sanity check: dimensions first, then a preview of the first rows.
print(df.shape)
df.head()

(891, 9)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,deck,embark_town
0,0,3,male,22.0,1,0,7.25,,Southampton
1,1,1,female,38.0,1,0,71.2833,C,Cherbourg
2,1,3,female,26.0,0,0,7.925,,Southampton
3,1,1,female,35.0,1,0,53.1,C,Southampton
4,0,3,male,35.0,0,0,8.05,,Southampton


In [2]:
# Reproducibility seed and the name of the column to predict.
SEED = 42
TARGET = 'survived'

# Every column except the target is a candidate feature.
FEATURES = df.columns.drop(TARGET)

# Numeric feature names are detected from the frame's dtypes.
NUMERICAL = df[FEATURES].select_dtypes('number').columns
print(f"Numerical features: {', '.join(NUMERICAL)}")

# Whatever is not numeric is treated as categorical; np.setdiff1d returns the
# remaining names sorted, which fixes the order used downstream.
non_numeric_names = np.setdiff1d(FEATURES, NUMERICAL)
CATEGORICAL = pd.Index(non_numeric_names)
print(f"Categorical features: {', '.join(CATEGORICAL)}\n")

# Hold out 20% of the rows, stratified on the target so both partitions keep
# the same class balance.
split_options = dict(test_size=.2, random_state=SEED, stratify=df[TARGET])
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURES], df[TARGET], **split_options
)

print(f"Training features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")

Numerical features: pclass, age, sibsp, parch, fare
Categorical features: deck, embark_town, sex

Training features shape: (712, 8)
Test features shape: (179, 8)


In [3]:
# Two separate, identically configured logistic regressions: one is used by RFE
# to rank features, the other is the final classifier.
rfe_estimator = LogisticRegression(random_state=SEED, max_iter=500)
final_model = LogisticRegression(random_state=SEED, max_iter=500)

# Preprocessing -> feature selection -> model, applied in this order.
# The categorical steps run before the numerical imputer, and every step is
# fitted inside the pipeline on the training data only.
steps = [
    ('feature_extractor', FeatureExtractor()),
    ('cat_imputer', Imputer(CATEGORICAL)),
    ('cardinality_reducer', CardinalityReducer(CATEGORICAL, threshold=0.1)),
    ('encoder', Encoder(CATEGORICAL)),
    ('num_imputer', Imputer(NUMERICAL, method='mean')),
    ('feature_selector', RFE(rfe_estimator, n_features_to_select=8)),
    ('model', final_model),
]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)

# Report ROC-AUC on the data the pipeline was fitted on and on the held-out split.
train_auc = calculate_roc_auc(pipe, X_train, y_train)
print(f"Train ROC-AUC: {train_auc:.4f}")
test_auc = calculate_roc_auc(pipe, X_test, y_test)
print(f"Test ROC-AUC: {test_auc:.4f}")

Train ROC-AUC: 0.8637
Test ROC-AUC: 0.8416


In [4]:
# Look up the fitted RFE step once, then keep only the input feature names that
# its boolean support mask marks as selected.
selector = pipe.named_steps['feature_selector']
top_features = selector.feature_names_in_[selector.support_]
print(f"Top {len(top_features)} features: {', '.join(top_features)}")

Top 8 features: pclass, age, sibsp, parch, deck_other, embark_town_Southampton, embark_town_other, sex_male
