<a href="https://colab.research.google.com/github/matthewpblock/applied-ml-matthew-block/blob/main/notebooks/project04/ml04_block.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# P4 - Continuous Target Prediction Using Regression (Titanic)
Matthew Block  
15 Nov 2025  
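This notebook uses the Titanic passenger dataset bundled with seaborn to predict a continuous target, the passenger `fare`, using regression. The objectives are to inspect and prepare the data, select and justify input features, train a linear regression model, and compare it with Ridge, Elastic Net, and polynomial regression.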

In [13]:
# Imports
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Section 1: Import and Inspect the Data

In [14]:
# Fetch seaborn's bundled Titanic passenger table, then preview the first
# five rows as the cell output to confirm the load worked.
titanic = sns.load_dataset(name="titanic")
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Section 2: Data Exploration and Preparation

In [15]:
# Prepare the frame in a single chained pass; every step returns a new
# DataFrame, so nothing here relies on inplace=True (which is what raised the
# pandas FutureWarning in an earlier version of this cell).
titanic = (
    titanic
    # Impute missing ages with the median age of the full table.
    .assign(age=lambda frame: frame['age'].fillna(frame['age'].median()))
    # Rows without a fare cannot be used, since fare is the prediction target.
    .dropna(subset=['fare'])
    # family_size = sibsp + parch + 1 (the +1 presumably counts the passenger).
    .assign(family_size=lambda frame: frame['sibsp'] + frame['parch'] + 1)
)

## Section 3: Feature Selection and Justification

In [16]:
# Columns that need encoding: everything stored as object, category, or bool.
categorical_cols = titanic.select_dtypes(include=['object', 'category', 'bool']).columns

# Mode-impute the two embarkation columns (skipped when a column is not in
# the categorical set).
for port_col in ('embarked', 'embark_town'):
    if port_col not in categorical_cols:
        continue
    most_common = titanic[port_col].mode().iloc[0]
    titanic[port_col] = titanic[port_col].fillna(most_common)

# A missing 'deck' becomes its own 'Unknown' level. The column is cast to
# object first so the new label can be filled in.
if 'deck' in categorical_cols:
    deck_as_object = titanic['deck'].astype(object)
    titanic['deck'] = deck_as_object.fillna('Unknown')

# Encode on a copy so `titanic` keeps its original category labels.
titanic_encoded = titanic.copy()

# Fallback value for any category that ends up without a mapped mean.
overall_mean_fare = titanic['fare'].mean()

# Target (mean) encoding: each category label is replaced by the mean 'fare'
# observed for that label. 'fare' itself is the target and is never encoded.
# NOTE(review): the means are taken over every row of `titanic`, i.e. before
# any train/test split visible in this notebook - confirm this is intended,
# since it lets target information flow into the features.
for feature in (name for name in categorical_cols if name != 'fare'):
    # observed=False avoids the FutureWarning raised for categorical groupers.
    fare_by_level = titanic.groupby(feature, observed=False)['fare'].mean()
    # Map labels -> mean fare, force a numeric dtype, then fill any gaps.
    encoded_values = pd.to_numeric(titanic_encoded[feature].map(fare_by_level))
    titanic_encoded[feature] = encoded_values.fillna(overall_mean_fare)

for label, frame in (
    ("Shape of original DataFrame:", titanic),
    ("Shape of encoded DataFrame:", titanic_encoded),
):
    print(label, frame.shape)
print("First 5 rows of the encoded DataFrame (with target encoded categorical features):")
print(titanic_encoded.head())

Shape of original DataFrame: (891, 16)
Shape of encoded DataFrame: (891, 16)
First 5 rows of the encoded DataFrame (with target encoded categorical features):
   survived  pclass        sex   age  sibsp  parch     fare   embarked  \
0         0       3  25.523893  22.0      1      0   7.2500  27.243651   
1         1       1  44.479818  38.0      1      0  71.2833  59.954144   
2         1       3  44.479818  26.0      0      0   7.9250  27.243651   
3         1       1  44.479818  35.0      1      0  53.1000  27.243651   
4         0       3  25.523893  35.0      0      0   8.0500  27.243651   

       class        who  adult_male        deck  embark_town      alive  \
0  13.675550  24.864182   24.864182   19.181079    27.243651  22.117887   
1  84.154687  46.570711   43.338655  100.151341    59.954144  48.395408   
2  13.675550  46.570711   43.338655   19.181079    27.243651  48.395408   
3  84.154687  46.570711   43.338655  100.151341    27.243651  48.395408   
4  13.675550  24.8641

In [18]:
# Rank every numeric column of the encoded frame by its correlation with the
# target ('fare'), strongest positive first. DataFrame.corr() is called with
# its default method.
numerical_features = titanic_encoded.select_dtypes(include=['number']).columns
fare_correlations = titanic_encoded.loc[:, numerical_features].corr().loc[:, 'fare']
correlation_with_fare = fare_correlations.sort_values(ascending=False)
print("Correlation with 'fare':\n", correlation_with_fare)

Correlation with 'fare':
 fare           1.000000
class          0.594217
deck           0.576773
embarked       0.280443
embark_town    0.280443
alone          0.271832
alive          0.257307
survived       0.257307
family_size    0.217138
parch          0.216225
who            0.196536
sex            0.182333
adult_male     0.182024
sibsp          0.159651
age            0.096688
pclass        -0.549500
Name: fare, dtype: float64


In [None]:
# Feature sets for the four modelling cases. Every case predicts the same
# continuous target, 'fare'.

# Case 1. age
X1 = titanic[['age']]
y1 = titanic['fare']

# Case 2. family_size
X2 = titanic[['family_size']]
y2 = titanic['fare']

# Case 3. age, family_size
X3 = titanic[['age', 'family_size']]
y3 = titanic['fare']

# Case 4. Custom Combo
# 'class', 'deck' and 'embark_town' hold text labels in `titanic`, which the
# scikit-learn regressors imported above cannot fit on. They are taken from
# `titanic_encoded` instead, where each label was replaced by its mean fare.
# `titanic_encoded` is a copy of `titanic`, so the rows still line up with y4,
# and 'family_size' is numeric and identical in both frames.
# NOTE(review): the encoded values were computed from the fares of all rows,
# so a later train/test evaluation of Case 4 may look optimistic.
X4 = titanic_encoded[['class', 'deck', 'embark_town', 'family_size']]
y4 = titanic['fare']

### Reflection Questions:

    Why might these features affect a passenger's fare:
    -
    List all available features:
    -
    Which other features could improve predictions and why:
    -
    How many variables are in your Case 4:
    -
    Which variable(s) did you choose for Case 4 and why do you feel those could make good inputs:
    -


## Section 4: Train a Regression Model (Linear Regression)

## Section 5: Compare Alternative Models (Ridge, Elastic Net, Polynomial Regression)

## Section 6: Final Thoughts & Insights