<a href="https://colab.research.google.com/github/maheshsmc2/POWERBI/blob/main/day1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===== Day 2 · Track 1 · Titanic pipeline (Colab starter) =====
!pip -q install scikit-learn pandas numpy matplotlib

import pandas as pd, numpy as np, time
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.inspection import permutation_importance

# 1) Load
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# 2) Feature engineering (leak-safe)
def add_features(df):
    out = df.copy()
    out["FamilySize"] = out["SibSp"].fillna(0) + out["Parch"].fillna(0) + 1
    out["IsAlone"] = (out["FamilySize"] == 1).astype(int)
    out["Title"] = out["Name"].str.extract(r',\s*([^\.]+)\.', expand=False).fillna("Unknown")
    title_map = {"Mlle":"Miss","Ms":"Miss","Mme":"Mrs",
                 "Lady":"Royalty","Countess":"Royalty","Sir":"Royalty","Don":"Royalty","Jonkheer":"Royalty",
                 "Capt":"Officer","Col":"Officer","Major":"Officer","Dr":"Officer","Rev":"Officer"}
    out["Title"] = out["Title"].replace(title_map)
    out["CabinDeck"] = out["Cabin"].astype(str).str[0].where(out["Cabin"].notna(), "U")
    # Drop noisy IDs
    drop_cols = [c for c in ["PassengerId","Ticket","Name","Cabin"] if c in out.columns]
    return out.drop(columns=drop_cols, errors="ignore")

y = df["Survived"]
X = add_features(df.drop(columns=["Survived"]))

num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

# 3) Pipelines
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")),
                     ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                     ("onehot", OneHotEncoder(handle_unknown="ignore"))])

pre = ColumnTransformer([
    ("num", num_pipe, num_features),
    ("cat", cat_pipe, cat_features)
])

model = Pipeline([
    ("pre", pre),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42))
])

# 4) Split, fit, evaluate
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
model.fit(X_tr, y_tr)
y_pred = model.predict(X_te)
proba = model.predict_proba(X_te)[:,1]

print(classification_report(y_te, y_pred))
print("ROC-AUC:", round(roc_auc_score(y_te, proba), 4))
print("Confusion matrix:\n", confusion_matrix(y_te, y_pred))

# 5) (Optional) Permutation importance — top 12 features
pi = permutation_importance(model, X_te, y_te, n_repeats=10, random_state=42)
importances = pd.DataFrame({
    "feature": X_te.columns,  # Use the columns from the test set
    "importance": pi.importances_mean
}).sort_values("importance", ascending=False).head(12)
display(importances)

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.70      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

ROC-AUC: 0.8219
Confusion matrix:
 [[94 16]
 [21 48]]


Unnamed: 0,feature,importance
1,Sex,0.100559
9,Title,0.05419
2,Age,0.032961
5,Fare,0.024022
0,Pclass,0.010056
8,IsAlone,0.006704
3,SibSp,0.005028
6,Embarked,-0.002793
7,FamilySize,-0.003911
4,Parch,-0.006145


In [None]:
import matplotlib.pyplot as plt
importances = importances.sort_values("importance", ascending=True) # Sort ascending for horizontal bar chart

plt.figure(figsize=(10, 6))
plt.barh(importances["feature"], importances["importance"])
plt.xlabel("Permutation Importance")
plt.ylabel("Feature")
plt.title("Top 12 Permutation Importances")
plt.show()