In [2]:
# ==============================================================
# 🚢 Titanic Dataset: 2-Way and 3-Way Split Example
# ==============================================================

# Step 1: Import libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Step 2: Load real-world dataset (Titanic)
# Binds the Titanic example frame to `dataset`. On failure the error is
# reported on stdout and then propagated so execution stops here.
try:
    dataset = sns.load_dataset('titanic')
except Exception as e:
    print("Error loading Titanic dataset:", e)
    # Re-raise rather than calling exit(): a bare exit() raises SystemExit
    # with a success status (hiding the failure from any caller) and, inside
    # a notebook, can shut the kernel down. Re-raising keeps the original
    # traceback and halts a "Run All" at this cell.
    raise

# Confirm the load and show a small preview of the frame.
print("✅ Dataset successfully loaded. Here are the first 5 rows:")
print(dataset.head())

# Visual separator before the next section's output.
print("\n" + "="*60 + "\n")

# Step 3: Basic cleanup and feature selection
# Keep only the rows whose target ('survived') is present.
dataset = dataset[dataset['survived'].notna()]

# Five numeric columns form the feature matrix; 'survived' is the label.
X = dataset.loc[:, ['pclass', 'age', 'sibsp', 'parch', 'fare']].copy()
y = dataset.loc[:, 'survived']

# Impute missing 'age' entries with the column median.
# NOTE(review): the median is taken over every row, before any of the
# train/test splits below, so held-out rows contribute to it. Harmless for a
# pure splitting demo, but revisit if a model is later fit on these features.
X = X.fillna({'age': X['age'].median()})

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}\n")

# ==============================================================
# --- 1. Performing a 2-Way Split (80% Train, 20% Test) ---
# ==============================================================
# Hold out 20% of the rows for testing. stratify=y asks the splitter to keep
# the class proportions of y in both partitions; the fixed random_state makes
# the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report the resulting partition shapes, one per line.
print(
    "--- 1. Performing a 2-Way Split (80% Train, 20% Test) ---",
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

print("\n" + "=" * 60 + "\n")

# ==============================================================
# --- 2. Performing a 3-Way Split (70% Train, 15% Validation, 15% Test) ---
# ==============================================================
# Target share of the FULL dataset for each held-out partition; the training
# share is whatever remains. Every size below is derived from these two
# values, so changing a fraction here keeps the split and the report in sync.
TEST_FRACTION = 0.15
VAL_FRACTION = 0.15
TRAIN_FRACTION = 1 - TEST_FRACTION - VAL_FRACTION

# First split: carve the test partition off the full data.
X_train_val, X_test_3way, y_train_val, y_test_3way = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=42, stratify=y
)

# Second split: it only sees the remaining (1 - TEST_FRACTION) of the rows,
# so the validation share is rescaled to stay ~VAL_FRACTION of the total.
val_size_relative = VAL_FRACTION / (1 - TEST_FRACTION)
X_train_3way, X_val, y_train_3way, y_val = train_test_split(
    X_train_val, y_train_val, test_size=val_size_relative, random_state=42, stratify=y_train_val
)

# Sanity check: the three partitions together account for every row.
assert len(X_train_3way) + len(X_val) + len(X_test_3way) == len(X)

print(
    f"--- 2. Performing a 3-Way Split ({TRAIN_FRACTION:.0%} Train, "
    f"{VAL_FRACTION:.0%} Validation, {TEST_FRACTION:.0%} Test) ---"
)
print(f"Total original samples: {len(X)}")
print(f"Training set size: {len(X_train_3way)} ({len(X_train_3way)/len(X):.0%})")
print(f"Validation set size: {len(X_val)} ({len(X_val)/len(X):.0%})")
print(f"Test set size: {len(X_test_3way)} ({len(X_test_3way)/len(X):.0%})")

print("\nFinal Shapes:")
print(f"X_train shape: {X_train_3way.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test_3way.shape}")


âœ… Dataset successfully loaded. Here are the first 5 rows:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


Features (X) shape: (891, 5)
Target (y) shape: (891,)

--- 1. Performing a 2-Way Split (80% Train, 20% Test) ---
X_train shape: (712, 5)
X_test shape: (179, 