# Titanic ML Assignment
### Submitted by: Maheen Touqeer

This notebook demonstrates an end-to-end ML pipeline on the **Titanic dataset** from Kaggle.

## 1. Setup & Data Loading

In [None]:
import os, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Notebook-wide configuration: quiet library warnings and set plot defaults.
# NOTE(review): the blanket 'ignore' filter also hides deprecation warnings
# from pandas/scikit-learn — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
sns.set()
plt.rcParams['figure.figsize'] = (8,5)

# Load data
data_path = 'data/train.csv'
if not os.path.exists(data_path):
    # No alternative dataset is loaded anywhere in this cell, so announcing a
    # fallback and then reading the same missing path would only end in an
    # opaque pandas error. Fail here with an actionable message instead.
    raise FileNotFoundError(
        f"{data_path} not found. Download train.csv from the Kaggle Titanic "
        "competition, place it at this path, and re-run this cell."
    )
df = pd.read_csv(data_path)
df.head()

## 2. Data Cleaning

In [None]:
# Report missing values, impute Embarked/Age, and drop columns that are not
# used as features. The cell is safe to re-run: every step is a no-op once the
# frame is already clean.
print("Missing values before cleaning:\n", df.isna().sum())

# Fill missing values.
# Assign the filled Series back to the column instead of calling
# fillna(inplace=True) on df['col']: the in-place form operates on an
# intermediate object and, under pandas Copy-on-Write, leaves df unchanged.
if 'Embarked' in df.columns and df['Embarked'].isna().any():
    # Categorical column: impute with the most frequent value.
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
if 'Age' in df.columns and df['Age'].isna().any():
    # Numeric column: impute with the median.
    df['Age'] = df['Age'].fillna(df['Age'].median())

# Drop high-missing or irrelevant columns (columns already absent are ignored).
df = df.drop(columns=['Cabin','Ticket','Name'], errors='ignore')

print("\nAfter cleaning:\n", df.isna().sum())
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Exploratory plots. Each plot is drawn on its own explicitly created figure
# and saved under images/. Creating the figure explicitly (rather than relying
# on the implicit "current axes") keeps the plots from being drawn on top of
# each other when plt.show() does not close the previous figure.
os.makedirs('images', exist_ok=True)

# Age distribution
fig, ax = plt.subplots()
sns.histplot(df['Age'], kde=True, ax=ax)
ax.set_title('Age Distribution')
fig.tight_layout()
fig.savefig('images/eda_distribution.png')
plt.show()

# Survival by Class
fig, ax = plt.subplots()
sns.barplot(x='Pclass', y='Survived', data=df, ax=ax)
ax.set_title('Survival Rate by Class')
fig.tight_layout()
fig.savefig('images/survival_by_class.png')
plt.show()

# Correlation Heatmap (numeric columns only)
corr = df.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
fig.savefig('images/heatmap.png')
plt.show()

## 4. Feature Engineering & Encoding

In [None]:
# Create new features
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself); a missing source column contributes 0 via df.get's default.
df['FamilySize'] = df.get('SibSp',0) + df.get('Parch',0) + 1
# IsAlone flags rows whose FamilySize is exactly 1.
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Encode categorical variables.
# The guards test "not numeric yet" rather than dtype == 'object' so the
# encoding still happens when pandas stores text in a dedicated string dtype,
# and so re-running the cell does not re-map an already encoded column.
if 'Sex' in df.columns and not pd.api.types.is_numeric_dtype(df['Sex']):
    # Values other than 'male'/'female' become NaN.
    df['Sex'] = df['Sex'].map({'male':0,'female':1})
if 'Embarked' in df.columns and not pd.api.types.is_numeric_dtype(df['Embarked']):
    # dtype=int: recent pandas emits boolean dummy columns by default, which a
    # later numeric-only column selection would silently discard.
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True, dtype=int)

df.head()

## 5. Train/Test Split & Scaling

In [None]:
# Build the feature matrix / target vector, hold out 20% for testing, and
# standardise the features for the scale-sensitive models.
target='Survived'

# PassengerId (if present) is a row identifier, not a predictor; leaving it in
# lets the models fit on an arbitrary index.
features = df.drop(columns=[target]).drop(columns=['PassengerId'], errors='ignore')

# Keep numeric AND boolean columns: np.number alone excludes bool, which would
# drop one-hot encoded columns when they are stored as booleans.
X = features.select_dtypes(include=[np.number, 'bool'])
y = df[target]

# Stratify on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the scaler on the training split only, then apply it to the test split,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6. Model Training

In [None]:
# Candidate classifiers, keyed by the label used when reporting results.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'SVM-RBF': SVC(kernel='rbf', probability=True, random_state=42),
}

# These models are trained on the standardised matrices; the tree-based ones
# use the unscaled features.
scaled_models = ('LogisticRegression', 'SVM-RBF')

# For every model: fit, predict labels and positive-class probabilities on the
# test split, and record the accuracy alongside the fitted estimator.
results = {}
for name, clf in models.items():
    use_scaled = name in scaled_models
    train_matrix = X_train_scaled if use_scaled else X_train
    test_matrix = X_test_scaled if use_scaled else X_test

    clf.fit(train_matrix, y_train)
    preds = clf.predict(test_matrix)
    proba = clf.predict_proba(test_matrix)[:,1]  # column 1 = probability of class 1

    results[name] = {
        'acc': accuracy_score(y_test, preds),
        'preds': preds,
        'proba': proba,
        'model': clf,
    }

results

## 7. Model Evaluation

In [None]:
# Pick the model with the highest test accuracy (on a tie, max() keeps the
# first one in insertion order) and report its confusion matrix, ROC curve and
# classification report.
best_name = max(results, key=lambda k: results[k]['acc'])
best = results[best_name]

# Outer double quotes: reusing the same quote character inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"Best model: {best_name} (Accuracy={best['acc']:.3f})")

# Confusion Matrix
cm = confusion_matrix(y_test, best['preds'])
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(f'Confusion Matrix - {best_name}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.tight_layout()
plt.savefig('images/confusion_matrix.png')
plt.show()

# ROC Curve (built from the positive-class probabilities stored at training time)
fpr, tpr, _ = roc_curve(y_test, best['proba'])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f'{best_name} (AUC={roc_auc:.2f})')
plt.plot([0,1],[0,1],'--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('images/roc_curve.png')
plt.show()

# Classification Report
print(classification_report(y_test, best['preds']))

## 8. Conclusion

- The pipeline successfully trained multiple ML models.  
- Random Forest often performs best on the Titanic dataset, but exact results depend on the train/test split.  
- Replace the sample data with Kaggle's full `train.csv` to get robust results.  
- Plots and metrics are saved into the **images/** folder.  
