# Task 5: Machine Learning Modeling Pipeline
This notebook demonstrates a complete ML workflow: preprocessing → training → evaluation → interpretation.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import matplotlib.pyplot as plt

## Step 1: Load Dataset

In [None]:
# Read the Titanic data; titanic.csv is expected in the same folder as this notebook.
csv_path = 'titanic.csv'
df = pd.read_csv(csv_path)
# Last expression in the cell: shows the first rows as the cell output.
df.head()

## Step 2: Data Cleaning & Feature Engineering

In [None]:
# Handle missing values.
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df['Age']: that form is chained assignment, which
# raises a FutureWarning in pandas 2.x and leaves df unchanged when
# Copy-on-Write is enabled (the default from pandas 3.0).
# NOTE(review): the median/mode are computed on the full dataset, i.e. before
# any train/test split, so test rows influence the imputed values.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Feature Engineering
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# IsAlone is 1 when FamilySize is exactly 1, otherwise 0.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

## Step 3: Select Features & Target

In [None]:
# Input columns used for modeling (raw columns plus the engineered ones).
features = [
    'Pclass', 'Sex', 'Age', 'Fare',
    'Embarked', 'FamilySize', 'IsAlone',
]

# Target column, then the feature matrix restricted to the selected columns.
y = df['Survived']
X = df[features]

## Step 4: Preprocessing Pipeline

In [None]:
# Columns grouped by the transformation they receive.
numerical_features = ['Pclass', 'Age', 'Fare', 'FamilySize', 'IsAlone']
categorical_features = ['Sex', 'Embarked']

# (name, transformer, columns) entries: numeric columns are standardized,
# categorical columns are one-hot encoded with the first level dropped.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(drop='first'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

## Step 5: Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratify on y and use a fixed seed
# so the split is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Step 6: Train Multiple Models

In [None]:
# Imported here so this cell is self-contained; sklearn is already used by this notebook.
from sklearn.base import clone

# Candidate models, keyed by display name. Each pipeline gets its own unfitted
# copy of the preprocessor: Pipeline.fit fits its steps in place without
# cloning them, so passing the same ColumnTransformer object to both pipelines
# would make them share (and overwrite) one fitted transformer.
models = {
    'Logistic Regression': Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', LogisticRegression())
    ]),
    'Random Forest': Pipeline([
        ('preprocessor', clone(preprocessor)),
        # Fixed seed keeps the forest reproducible across runs.
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
}

## Step 7: Evaluate Models

In [None]:
# Fit every candidate on the training split and report test-set metrics.
for name in models:
    model = models[name]
    # fit() returns the fitted pipeline, so prediction is chained onto it.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

    print(f"\n{name}")
    print("Accuracy:", round(accuracy, 4))
    print("F1 Score:", round(f1, 4))
    print(classification_report(y_test, y_pred))

## Step 8: Feature Importance (Random Forest Interpretation)

In [None]:
# Fit the random-forest pipeline on the training split before inspecting it.
rf_model = models['Random Forest']
rf_model.fit(X_train, y_train)

# Get feature names after encoding
fitted_steps = rf_model.named_steps
cat_encoder = fitted_steps['preprocessor'].named_transformers_['cat']
encoded_features = cat_encoder.get_feature_names_out(categorical_features)

# Numeric names first, then the one-hot names.
# NOTE(review): assumes the preprocessor outputs the 'num' columns before the
# 'cat' columns, so the labels line up with the importances; confirm.
all_features = [*numerical_features, *encoded_features]

importances = fitted_steps['classifier'].feature_importances_

# Horizontal bar chart: one bar per transformed feature.
plt.figure(figsize=(8, 5))
plt.barh(all_features, importances)
plt.title('Feature Importance')
plt.xlabel('Importance Score')
plt.show()