In [1]:
# ============================================================
# 🧭 Titanic - Random Forest Baseline (Optimized & Clean)
# ============================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
# === 1. Load Data ===

# Absolute local paths of the Titanic train/test CSV files.
TRAIN_CSV = r"C:\Users\antoi\Desktop\Machine_LearningProjet\data\train.csv"
TEST_CSV = r"C:\Users\antoi\Desktop\Machine_LearningProjet\data\test.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Quick sanity check: print the (rows, columns) of both frames.
print(f"Train shape: {train_df.shape}")
print(f"Test shape : {test_df.shape}")
# === 2. Feature Engineering Function ===

def add_features(df):
    """Return a copy of *df* with engineered passenger features added.

    The input frame is left untouched. It must provide the columns
    ``Name``, ``SibSp``, ``Parch``, ``Cabin``, ``Fare``, ``Age``, ``Sex``
    and ``Pclass``.

    Columns added or rewritten on the copy:
      - ``Title``: honorific parsed from ``Name`` (text between the comma
        and the first period); uncommon titles are collapsed to ``'Rare'``
        and French/alternate forms are mapped onto ``'Miss'`` / ``'Mrs'``.
      - ``FamilySize``: ``SibSp + Parch + 1``; ``IsAlone``: 1 when that is 1.
      - ``HasCabin``: 1 when ``Cabin`` is present; ``Deck``: first character
        of ``Cabin``, or ``'M'`` when it is missing.
      - ``Fare``: missing values filled with the median fare of *this*
        frame; ``LogFare``: ``log1p`` of the filled fare.
      - ``AgeBin`` / ``FamilyBin``: ``pd.cut`` buckets of ``Age`` and
        ``FamilySize`` (``AgeBin`` is missing wherever ``Age`` is).
      - ``Sex_Pclass``: ``"<Sex>_<Pclass>"`` interaction label.
    """
    df = df.copy()

    # --- Extract Title from Name ---
    title = df['Name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip()
    # A name such as "Rothes, the Countess. of (...)" is captured as
    # "the Countess"; drop the leading article so it matches 'Countess'
    # below instead of surviving as its own one-off category.
    title = title.str.replace(r'^the\s+', '', regex=True)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title = title.replace(rare_titles, 'Rare')
    title = title.replace(['Mlle', 'Ms'], 'Miss')
    title = title.replace('Mme', 'Mrs')
    df['Title'] = title

    # --- Family features ---
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # --- Cabin info ---
    df['HasCabin'] = df['Cabin'].notna().astype(int)
    df['Deck'] = df['Cabin'].fillna('M').astype(str).str[0]  # 'M' for Missing

    # --- Fare ---
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['LogFare'] = np.log1p(df['Fare'])

    # --- Age bins (NaN where Age is missing; callers may recompute) ---
    bins = [0, 12, 18, 35, 60, 120]
    labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
    df['AgeBin'] = pd.cut(df['Age'], bins=bins, labels=labels)

    # --- Family size bins ---
    bins_fam = [0, 1, 4, 100]
    labels_fam = ['Solo', 'Small', 'Large']
    df['FamilyBin'] = pd.cut(df['FamilySize'], bins=bins_fam, labels=labels_fam)

    # --- Combined feature ---
    df['Sex_Pclass'] = df['Sex'].astype(str) + "_" + df['Pclass'].astype(str)

    return df
# === 3. Impute Age using group median (Title, Pclass, Sex) ===

def impute_age(df, age_medians):
    """Return the ``Age`` column of *df* with missing values filled in.

    Parameters
    ----------
    df : DataFrame with ``Age``, ``Title``, ``Pclass`` and ``Sex`` columns.
    age_medians : Series of median ages indexed by
        ``(Title, Pclass, Sex)``, e.g. the result of
        ``groupby(['Title', 'Pclass', 'Sex'])['Age'].median()``.

    Returns
    -------
    A float Series aligned with ``df.index``. Known ages are passed through
    unchanged. A missing age is replaced by the median of its
    ``(Title, Pclass, Sex)`` group; when that group is absent from
    *age_medians* (or its median is itself NaN) the median of all group
    medians is used instead of raising ``KeyError``.
    """
    ages = df['Age'].astype(float)
    missing = ages.isna()
    if not missing.any():
        return ages

    # Plain dict keyed by (Title, Pclass, Sex) tuples: unknown groups simply
    # miss the lookup rather than raising like ``.loc`` would.
    lookup = age_medians.to_dict()
    nan = float('nan')
    rows = df.loc[missing, ['Title', 'Pclass', 'Sex']]
    group_keys = zip(rows['Title'], rows['Pclass'], rows['Sex'])
    estimates = pd.Series(
        [lookup.get(key, nan) for key in group_keys], dtype=float
    )

    # Fallback for unseen groups / all-missing groups.
    fallback = age_medians.median()
    # Assign positionally so a non-unique df index cannot misalign values.
    ages.loc[missing] = estimates.fillna(fallback).to_numpy()
    return ages


# Engineer features on both frames (each call returns a fresh copy).
train_df, test_df = add_features(train_df), add_features(test_df)

# Statistics used for imputation are taken from the training frame only.
# Median age per (Title, Pclass, Sex) group:
age_medians = train_df.groupby(['Title', 'Pclass', 'Sex'])['Age'].median()
# === 4. Fill missing Embarked with mode ===
# Most frequent embarkation port:
embarked_mode = train_df['Embarked'].mode()[0]

# Age buckets, rebuilt below once the missing ages have been filled.
bins = [0, 12, 18, 35, 60, 120]
labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']

# Apply the same three fixes to train first, then test.
for frame in (train_df, test_df):
    frame['Age'] = impute_age(frame, age_medians)
    frame['AgeBin'] = pd.cut(frame['Age'], bins=bins, labels=labels)
    frame['Embarked'] = frame['Embarked'].fillna(embarked_mode)
# === 5. Encode categorical features ===

categorical_cols = ['Sex', 'Embarked', 'Title', 'Deck',
                    'Sex_Pclass', 'AgeBin', 'FamilyBin']

# Remember where the training rows end so the stacked frame can be split
# back positionally after encoding.
n_train = len(train_df)

# Stack train on top of test so every column is encoded with one shared
# label-to-integer mapping.
full_df = pd.concat([train_df, test_df], sort=False)

for col in categorical_cols:
    le = LabelEncoder()
    # Values are stringified first, so missing entries are encoded as
    # their own label rather than being dropped.
    as_text = full_df[col].astype(str)
    full_df[col] = le.fit_transform(as_text)

train_df = full_df.iloc[:n_train, :].copy()
test_df = full_df.iloc[n_train:, :].copy()
# === 6. Select features and target ===

# Model inputs, in the column order handed to the estimator.
features = [
    'Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'Embarked',
    'Title', 'Deck', 'Sex_Pclass', 'LogFare',
    'AgeBin', 'FamilyBin', 'IsAlone', 'HasCabin'
]

X, y = train_df[features], train_df['Survived']
X_test = test_df[features]
# === 7. Impute remaining missing values ===

# Median imputation learned on the training features, then re-applied
# unchanged to the test features.
# NOTE(review): the imputer is fit on all training rows, i.e. before the
# validation rows are held out in step 8.
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
# === 8. Split Train/Validation ===

# 80/20 hold-out split, stratified on the target and seeded for
# reproducibility.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_imputed, y, **split_options)
# === 9. Train Random Forest ===

# Fixed hyper-parameters; the seed makes the fitted forest reproducible.
rf_params = {
    'n_estimators': 200,
    'max_depth': 7,
    'random_state': 42,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)
# === 10. Validation Performance ===

# Accuracy on the held-out validation rows.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"✅ Validation Accuracy: {accuracy:.4f}")
# === 11. Predict on Test Set ===

# Predictions for the imputed test features, written out in step 12.
test_pred = model.predict(X_test_imputed)
# === 12. Create Submission File ===

# Absolute local path of the output CSV.
SUBMISSION_CSV = r"C:\Users\antoi\Desktop\Machine_LearningProjet\submission_combined.csv"

# Two columns: the passenger id and the predicted label cast to int.
submission = test_df[['PassengerId']].assign(Survived=test_pred.astype(int))

# index=False keeps the frame's row index out of the file.
submission.to_csv(SUBMISSION_CSV, index=False)

print("💾 Submission saved as 'submission_combined.csv'")


Train shape: (891, 12)
Test shape : (418, 11)
✅ Validation Accuracy: 0.7989
💾 Submission saved as 'submission_combined.csv'
