# Feature Engineering Strategies — Student Lab (Titanic)

Focus: missing data, encoding, nonlinear transforms, and interactions — **leakage-safe**.

In [None]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression

def check(name: str, cond: bool):
    """Report the outcome of a named sanity check.

    Args:
        name: Label used in the printed or raised message.
        cond: Result of the check; any falsy value counts as a failure.

    Raises:
        AssertionError: With the message ``Failed: <name>`` when ``cond`` is falsy.
    """
    if cond:
        print(f'OK: {name}')
        return
    raise AssertionError(f'Failed: {name}')

# Module-level NumPy Generator with a fixed seed, so draws from it repeat across runs.
rng = np.random.default_rng(seed=0)

## Section 0 — Load Titanic (Kaggle) with fallback
Expected: `data/titanic/train.csv` (relative to the current working directory)

If missing, a tiny synthetic dataset is used so the notebook runs.

In [None]:
def load_titanic_or_synthetic():
    """Load the Kaggle Titanic training data, or fall back to a small synthetic frame.

    Looks for ``data/titanic/train.csv`` under the current working directory.

    Returns:
        tuple: ``(mode, df)``. ``mode`` is ``'kaggle'`` when the CSV was found
        and read, otherwise ``'synthetic'``; ``df`` is the resulting
        ``pandas.DataFrame``. The synthetic frame has the columns Survived,
        Pclass, Sex, Age, SibSp, Parch, Fare, Embarked, Cabin and Name.
    """
    path = os.path.join(os.getcwd(), 'data', 'titanic', 'train.csv')
    if os.path.exists(path):
        return 'kaggle', pd.read_csv(path)

    # Hand-written fallback rows. There are 6 rows per Survived class because
    # the lab evaluates with StratifiedKFold(n_splits=5), which raises a
    # ValueError when every class has fewer members than n_splits.
    # Some Age / Embarked / Cabin values are deliberately missing so the
    # imputation and missingness-indicator tasks have something to work on.
    df = pd.DataFrame({
        'Survived': [0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0],
        'Pclass': [3, 1, 3, 3, 2, 3, 2, 1, 3, 2, 3, 3],
        'Sex': [
            'male', 'female', 'female', 'male', 'female', 'male', 'male', 'female',
            'female', 'female', 'male', 'male',
        ],
        'Age': [22, 38, np.nan, 35, 28, 2, 54, np.nan, 27, 14, 20, 39],
        'SibSp': [1, 1, 0, 1, 0, 3, 0, 0, 0, 1, 0, 1],
        'Parch': [0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 5],
        'Fare': [7.25, 71.3, 7.92, 53.1, 13.0, 21.1, 51.9, 30.0, 11.13, 30.07, 8.05, 31.28],
        'Embarked': ['S', 'C', 'S', 'S', np.nan, 'S', 'S', 'C', 'S', 'C', 'S', 'S'],
        'Cabin': [
            np.nan, 'C85', np.nan, np.nan, np.nan, np.nan, 'E46', np.nan,
            np.nan, np.nan, np.nan, np.nan,
        ],
        'Name': [
            'Braund, Mr. Owen Harris',
            'Cumings, Mrs. John Bradley',
            'Heikkinen, Miss. Laina',
            'Allen, Mr. William Henry',
            'Moran, Mr. James',
            'Palsson, Master. Gosta Leonard',
            'McCarthy, Mr. Timothy J',
            'Fortune, Miss. Mabel Helen',
            'Johnson, Mrs. Oscar W',
            'Nasser, Mrs. Nicholas',
            'Saundercock, Mr. William Henry',
            'Andersson, Mr. Anders Johan',
        ],
    })
    return 'synthetic', df

# Load the data once; `mode` records which source was used ('kaggle' or 'synthetic').
mode, df = load_titanic_or_synthetic()
print('mode', mode, 'rows', len(df))
# Kept as the last expression so a notebook displays the first rows.
df.head()

## Section 1 — Baseline pipeline

### Task 1.1: Train a baseline with leakage-safe preprocessing

Use:
- Numeric: Age, Fare, SibSp, Parch
- Categorical: Pclass, Sex, Embarked

**TODO:**
- Create X/y
- Build a ColumnTransformer (impute + scale numeric, impute + one-hot categorical)
- Evaluate LogisticRegression with StratifiedKFold CV

**Checkpoint:** Why is the pipeline necessary to avoid leakage in CV?

In [None]:
# Target is the Survived column cast to int; features are every remaining column.
y = df['Survived'].astype(int)
X = df.drop(columns=['Survived'])

# Column names intended for the numeric and categorical branches of the preprocessor.
numeric_features = ['Age','Fare','SibSp','Parch']
categorical_features = ['Pclass','Sex','Embarked']

# TODO: replace the placeholder with a ColumnTransformer
# (numeric: impute + scale; categorical: impute + one-hot), as described in Task 1.1.
preprocess = ...

model = LogisticRegression(max_iter=5000)
# Preprocessing is a pipeline step, so it is fit together with the model on whatever
# data the pipeline is fit on.
pipe = Pipeline(steps=[('preprocess', preprocess), ('model', model)])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# TODO: replace the placeholder with the cross-validated evaluation of `pipe` using `cv`.
# The print below reads scores['test_acc'], so the result must expose a 'test_acc' entry
# (e.g. cross_validate with scoring={'acc': 'accuracy'} -- confirm against the sklearn docs).
scores = ...
print('baseline acc mean', np.mean(scores['test_acc']))

## Section 2 — Missing data: indicators

### Task 2.1: Add missingness indicator features

Add: Age_is_missing, Cabin_is_missing

**TODO:**
- Create these columns
- Add them to the numeric pipeline
- Re-run CV and compare

**Checkpoint:** Why can missingness itself be predictive?

In [None]:
# Work on a copy so the baseline feature frame X stays untouched.
X2 = X.copy()
# TODO: replace each placeholder with an indicator of whether the corresponding
# source column (Age / Cabin) is missing, per Task 2.1.
X2['Age_is_missing'] = ...
X2['Cabin_is_missing'] = ...

# The two indicators are appended to the baseline numeric column list.
numeric_features2 = numeric_features + ['Age_is_missing','Cabin_is_missing']

# TODO: rebuild the preprocessor for numeric_features2 and evaluate pipe2 on X2.
# As in the baseline cell, the print reads scores2['test_acc'].
preprocess2 = ...
pipe2 = Pipeline(steps=[('preprocess', preprocess2), ('model', LogisticRegression(max_iter=5000))])
scores2 = ...
print('with indicators acc mean', np.mean(scores2['test_acc']))

## Section 3 — Encoding: One-hot vs Target encoding

### Task 3.1: Extract Title from Name and one-hot encode

Example titles: Mr, Mrs, Miss, Master.

**TODO:**
- Extract Title with a regex
- Add it as a categorical feature
- Re-run CV


In [None]:
def extract_title(name: str):
    """Extract the honorific (e.g. ``Mr``, ``Mrs``) from a ``"Surname, Title. Given"`` name.

    Args:
        name: Raw passenger name. Non-string values (such as NaN) are accepted.

    Returns:
        The text between a comma and the next period, stripped of surrounding
        whitespace. ``'Unknown'`` is returned when ``name`` is not a string,
        when it does not contain that comma-then-period shape, or when the
        captured title is blank.
    """
    if not isinstance(name, str):
        return 'Unknown'
    m = re.search(r',\s*([^\.]*)\.', name)
    if m is None:
        return 'Unknown'
    # A blank capture (e.g. "Doe, . John") would otherwise become its own
    # empty-string category; fold it into the same fallback as other failures.
    return m.group(1).strip() or 'Unknown'

# Start from the frame that already carries the missingness indicators.
X3 = X2.copy()
# Title is derived row by row from Name, so it uses no information from other rows.
X3['Title'] = X3['Name'].apply(extract_title)

# Title joins the categorical columns; the numeric columns are unchanged from Task 2.1.
categorical_features3 = categorical_features + ['Title']
numeric_features3 = numeric_features2

# TODO: build preprocess3 for the lists above and cross-validate pipe3 on X3.
# The print reads scores3['test_acc'].
preprocess3 = ...
pipe3 = Pipeline(steps=[('preprocess', preprocess3), ('model', LogisticRegression(max_iter=5000))])
scores3 = ...
print('with Title acc mean', np.mean(scores3['test_acc']))

### Task 3.2: Naive target encoding (demonstrate leakage)

We will *intentionally* do something wrong: compute mean Survived per Title using the full dataset, then map it back.

**TODO:**
- Create `Title_target_mean_leaky` using the full dataset (this is leakage)
- Compare the CV score and explain why it is inflated

**Checkpoint:** How do you fix target encoding correctly?

In [None]:
X4 = X3.copy()
# TODO (leaky): replace the placeholder with the per-Title mean of Survived computed
# on the full dataset. This is deliberately wrong, as described in Task 3.2.
title_mean = ...
# Titles without an entry in title_mean fall back to the overall mean of y.
X4['Title_target_mean_leaky'] = X4['Title'].map(title_mean).fillna(y.mean())

# The encoded column is treated as one more numeric feature.
numeric_features4 = numeric_features3 + ['Title_target_mean_leaky']
categorical_features4 = categorical_features  # Title is left out here, so it enters only via the numeric encoding

# TODO: build preprocess4 for the lists above and cross-validate pipe4 on X4.
# The print reads scores4['test_acc'].
preprocess4 = ...
pipe4 = Pipeline(steps=[('preprocess', preprocess4), ('model', LogisticRegression(max_iter=5000))])
scores4 = ...
print('LEAKY target encoding acc mean', np.mean(scores4['test_acc']))

## Section 4 — Nonlinear transforms + binning

### Task 4.1: Log-transform Fare

**TODO:** Create `Fare_log = log1p(Fare)` and evaluate.


In [None]:
# Branch from X3 (indicators + Title), not from the leaky X4 frame.
X5 = X3.copy()
# log1p of Fare; values that cannot be parsed as numbers become NaN via errors='coerce'.
X5['Fare_log'] = np.log1p(pd.to_numeric(X5['Fare'], errors='coerce'))

# Fare_log replaces the raw Fare column in the numeric feature list.
numeric_features5 = [c for c in numeric_features3 if c != 'Fare'] + ['Fare_log']
categorical_features5 = categorical_features3

# TODO: build preprocess5 for the lists above and cross-validate pipe5 on X5.
# The print reads scores5['test_acc'].
preprocess5 = ...
pipe5 = Pipeline(steps=[('preprocess', preprocess5), ('model', LogisticRegression(max_iter=5000))])
scores5 = ...
print('with Fare_log acc mean', np.mean(scores5['test_acc']))

## Section 5 — Interaction features

### Task 5.1: FamilySize and IsAlone
- `FamilySize = SibSp + Parch + 1`
- `IsAlone = (FamilySize == 1)`

### Task 5.2: Sex × Pclass cross
Create a categorical cross feature like `Sex_Pclass`.

**TODO:** Implement and evaluate.


In [None]:
X6 = X5.copy()
# FamilySize = SibSp + Parch + 1; counts that are missing or non-numeric are treated as 0.
X6['FamilySize'] = pd.to_numeric(X6['SibSp'], errors='coerce').fillna(0) + pd.to_numeric(X6['Parch'], errors='coerce').fillna(0) + 1
# 1 when FamilySize is exactly 1, else 0.
X6['IsAlone'] = (X6['FamilySize'] == 1).astype(int)
# Categorical cross of Sex and Pclass, built as the string "<Sex>_P<Pclass>".
X6['Sex_Pclass'] = X6['Sex'].astype(str) + '_P' + X6['Pclass'].astype(str)

# New numeric and categorical columns extend the Task 4.1 feature lists.
numeric_features6 = numeric_features5 + ['FamilySize','IsAlone']
categorical_features6 = categorical_features5 + ['Sex_Pclass']

# TODO: build preprocess6 for the lists above and cross-validate pipe6 on X6.
# The print reads scores6['test_acc'].
preprocess6 = ...
pipe6 = Pipeline(steps=[('preprocess', preprocess6), ('model', LogisticRegression(max_iter=5000))])
scores6 = ...
print('with interactions acc mean', np.mean(scores6['test_acc']))

---
## Submission Checklist
- Baseline pipeline + CV
- Missingness indicators compared
- Title feature added
- Leakage explanation for target encoding
- Nonlinear transform (Fare_log) evaluated
- Interaction features evaluated
