# End-to-End Machine Learning Pipeline
### Dataset: Titanic Survival Prediction
---
This notebook covers the complete pipeline:
- Data Handling (loading and cleaning)
- Exploratory Data Analysis (EDA)
- Feature Engineering
- Model Training (KNN, Decision Tree, Random Forest)
- Hyperparameter Tuning
- Model Evaluation
- Conclusion

In [None]:
# Install required libraries (if not already installed)
!pip install plotly scikit-learn seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc, classification_report

In [None]:
# Upload Titanic dataset CSV
# NOTE: `google.colab` exists only on Google Colab. When running elsewhere,
# replace this cell with a plain `pd.read_csv(<path to the CSV>)`.
from google.colab import files

uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when nothing
# was uploaded (e.g. the file picker was cancelled).
if not uploaded:
    raise ValueError("No file was uploaded - re-run this cell and select the Titanic CSV.")

# The keys of `uploaded` are used as file names; only the first one is read.
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)
df.head()

In [None]:
# Data Cleaning
# Identifier-like / free-text columns of the standard Kaggle Titanic schema.
# One-hot encoding them adds roughly one dummy column per passenger, which
# lets the models memorise rows and swamps distance-based models such as KNN.
# NOTE(review): assumes the uploaded CSV follows that schema - confirm.
# Columns that are not present are simply skipped.
id_like_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# De-duplicate on the full rows first: once the identifiers are gone, two
# different passengers could look identical and must not be collapsed.
df = df.drop_duplicates()
df = df.drop(columns=id_like_cols, errors='ignore')

# Impute: numeric gaps get the column mean, anything left gets "Unknown".
# NOTE(review): the means are computed over all rows, including the ones that
# later end up in the test split; consider imputing after the split instead.
df = df.fillna(df.mean(numeric_only=True))
df = df.fillna("Unknown")

# Convert categorical to numeric (drop_first avoids a redundant dummy column)
df = pd.get_dummies(df, drop_first=True)
df.head()

In [None]:
# Feature/Target split
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split BEFORE scaling so that the scaler statistics come from the training
# rows only; fitting on the full data would leak test-set information into
# the preprocessing and make the reported scores optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std on train only
X_test = scaler.transform(X_test_raw)        # reuse the train statistics

# Full feature matrix scaled with the train-fitted scaler; kept so that any
# cell still referring to `X_scaled` continues to work.
X_scaled = scaler.transform(X)

In [None]:
# Train Models
# Baseline comparison with default hyperparameters. The tree-based models are
# randomised, so they are seeded to make the printed accuracies reproducible
# across runs (KNN has no random component).
models = {
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Prints "<model name> <test accuracy>" for each model
    print(name, accuracy_score(y_test, preds))

In [None]:
# Hyperparameter Tuning Example (Random Forest)
# Randomised search: 5 sampled combinations of the grid below, each scored
# with 3-fold cross-validation on the training split.
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}
search = RandomizedSearchCV(
    # Seed the forest as well: the search's random_state only fixes which
    # parameter combinations are sampled, not the randomness inside each forest.
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=5,
    cv=3,
    random_state=42,
)
search.fit(X_train, y_train)
print('Best params:', search.best_params_)

# best_estimator_ is the winning configuration refitted on all of X_train
best_rf = search.best_estimator_
preds = best_rf.predict(X_test)
print('Tuned RF Accuracy:', accuracy_score(y_test, preds))

In [None]:
# Feature Importance
# Impurity-based importances of the tuned forest, sorted from most to least
# important.
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Show only the strongest features: after one-hot encoding there can be far
# more columns than fit legibly on one chart.
top_n = 20
top_indices = indices[:top_n]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances[top_indices], y=X.columns[top_indices], ax=ax)
ax.set(
    title='Feature Importance (Random Forest)',
    xlabel='Importance',
    ylabel='Feature',
)
plt.show()

In [None]:
# Evaluation Metrics
# Recompute the predictions of the tuned forest here rather than relying on
# whichever cell last assigned `preds` (the baseline loop writes it too), so
# the report always describes `best_rf` regardless of execution order.
preds = best_rf.predict(X_test)
print(classification_report(y_test, preds))

# confusion_matrix puts true labels on rows and predicted labels on columns
cm = confusion_matrix(y_test, preds)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set(
    title='Confusion Matrix (Tuned Random Forest)',
    xlabel='Predicted label',
    ylabel='True label',
)
plt.show()