In [2]:
# Switch the working directory to this submission's folder so that modules
# living there can be imported by later cells (presumably the local
# `feature_engineering` module -- confirm against the project layout).
#
# The folder can be overridden without editing the notebook by setting the
# TITANIC_PROJECT_DIR environment variable; when it is unset or empty, the
# original hard-coded location is used, so existing behaviour is unchanged.
import os

PROJECT_DIR = os.environ.get('TITANIC_PROJECT_DIR') or (
    '/Users/xyz-ai/Developer/python3-code/kaggle-titanic/submission/01_base_dt'
)
# Raises FileNotFoundError / NotADirectoryError if PROJECT_DIR is not a valid directory.
os.chdir(PROJECT_DIR)

In [3]:
from feature_engineering import (
    FareByPclassImputer, AgeByClassSexImputer, FamilySizeAdder,
    TicketFeatureAdder, CabinDeckExtractor
)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column groups routed to each branch of the ColumnTransformer below.
numerical_features = ['Fare', 'Age', 'FamilySize', 'TicketGroupSize']
categorical_features = [
    'Pclass', 'Sex', 'Embarked',
    'TicketPrefix', 'Deck', 'FamilySizeCategory',
]

# Stage 1 -- feature engineering.
# Chains the project's custom transformers in this fixed order: the two
# imputers run first, then the family-size, ticket and cabin-deck steps.
# NOTE(review): the engineered column names listed above (e.g. 'FamilySize',
# 'TicketPrefix', 'Deck') are presumably produced by these steps -- confirm
# against feature_engineering.py.
feature_engineering_pipeline = Pipeline(steps=[
    ('fare_imputer', FareByPclassImputer()),
    ('age_imputer', AgeByClassSexImputer()),
    ('family_size_adder', FamilySizeAdder(add_family_size_category=True)),
    ('ticket_feature_adder', TicketFeatureAdder()),
    ('cabin_deck_extractor', CabinDeckExtractor()),
])

# Stage 2 -- encoding and scaling.
# Categorical columns are one-hot encoded (handle_unknown='ignore' so that
# categories not seen during fit do not raise at transform time); numerical
# columns are standardized. No `remainder` is given, so ColumnTransformer's
# default applies to every column not listed in either group.
preprocessing_pipeline = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numerical_features),
])

# Complete Titanic preprocessor: stage 1 followed by stage 2.
full_titanic_pipeline = Pipeline(steps=[
    ('feature_engineering', feature_engineering_pipeline),
    ('preprocessing', preprocessing_pipeline),
])

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

In [None]:
# End-to-end estimator: the full preprocessing pipeline followed by a
# 'classifier' step.
# DummyClassifier is only a placeholder - the model will be set by
# GridSearchCV.
final_pipeline = Pipeline(steps=[
    ('full_preprocessing', full_titanic_pipeline),
    ('classifier', DummyClassifier()),
])

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space.
# Each dict is an independent sub-grid: its 'classifier' entry is the
# estimator placed in the pipeline's 'classifier' step, and every
# 'classifier__<name>' entry lists the values tried for that estimator's
# parameter <name>.
param_grid = [
    # SVC with the RBF kernel: gamma is the kernel coefficient, so it is
    # part of the search.
    {
        'classifier': [SVC(probability=True, random_state=42)],
        'classifier__kernel': ['rbf'],
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': ['scale', 'auto']
    },
    # SVC with the linear kernel: gamma is not used by this kernel, so
    # crossing it with 'linear' would only refit identical models. It is left
    # at its default here, which removes the duplicate candidates.
    {
        'classifier': [SVC(probability=True, random_state=42)],
        'classifier__kernel': ['linear'],
        'classifier__C': [0.1, 1, 10]
    },
    # Single decision tree: depth limit (None = unlimited) and split threshold.
    {
        'classifier': [DecisionTreeClassifier(random_state=42)],
        'classifier__max_depth': [3, 5, 7, None],
        'classifier__min_samples_split': [2, 5, 10]
    },
    # Random forest: number of trees, depth limit and split threshold.
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5]
    },
    # Logistic regression: inverse regularization strength and solver.
    {
        'classifier': [LogisticRegression(max_iter=500, random_state=42)],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['liblinear', 'lbfgs']
    }
]

In [None]:
# Configure the exhaustive search over `param_grid`: every candidate is
# evaluated with 5-fold cross-validation and ranked by accuracy.
# This only builds the search object; no fitting happens here.
grid_search = GridSearchCV(
    estimator=final_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # -1 = use all cores
    verbose=2,
)