In [1]:


import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Directory holding the Titanic CSV files. Set the TITANIC_DATA_DIR environment
# variable to point somewhere else; the default is the original hard-coded path.
DATA_DIR = os.environ.get("TITANIC_DATA_DIR", r"C:\Users\punna\Downloads\titanic")

# Each frame is read from its own file. Previously all three calls read
# train.csv, so `test` was just a second copy of the training data.
# NOTE(review): assumes the standard Kaggle file names test.csv and
# gender_submission.csv exist in DATA_DIR - confirm.
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
gender = pd.read_csv(os.path.join(DATA_DIR, "gender_submission.csv"))  # only used for Kaggle submission, not training

def add_engineered_features(df):
    """Add the engineered Titanic columns to `df` in place.

    Reads the "Name", "SibSp", "Parch", "Fare" and "Cabin" columns and writes:

    - "Title": the word preceding the first "." in "Name", with uncommon
      titles collapsed to "Rare" and French/alternate spellings mapped to
      "Miss" / "Mrs".
    - "FamilySize": SibSp + Parch + 1 (the passenger themself).
    - "IsAlone": 1 when FamilySize == 1, else 0.
    - "FarePerPerson": Fare divided by FamilySize.
    - "Deck": first character of "Cabin", or "U" when the cabin is missing.

    Returns None; the frame passed in is modified.
    """
    # Raw string: `\.` must reach the regex engine as an escaped dot rather
    # than being parsed as an (invalid) Python string escape.
    df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

    rare_titles = ["Lady", "Countess", "Capt", "Col", "Don", "Dr",
                   "Major", "Rev", "Sir", "Jonkheer", "Dona"]
    df["Title"] = df["Title"].replace(rare_titles, "Rare")
    df["Title"] = df["Title"].replace({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})

    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    df["FarePerPerson"] = df["Fare"] / df["FamilySize"]

    # Missing cabins get an explicit "U" (unknown) deck. Without the fillna the
    # NaN was stringified to "nan" and the deck silently became "n".
    df["Deck"] = df["Cabin"].fillna("U").astype(str).str[0]


for df in [train, test]:
    add_engineered_features(df)


# Identifier and free-text columns that are not fed to the models.
unused_cols = ["PassengerId", "Name", "Ticket", "Cabin"]

# Keep the PassengerId column of `test` before it is dropped.
test_passenger_ids = test["PassengerId"]
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

# Target vector and feature matrix.
y = train["Survived"]
X = train.drop(columns="Survived")

# Every feature that is not listed as categorical is treated as numeric.
categorical_cols = ["Sex", "Embarked", "Title", "Deck"]
numeric_cols = [name for name in X.columns if name not in categorical_cols]


# Numeric branch: KNN imputation (5 neighbours) followed by standardisation.
numeric_steps = [
    ("imputer", KNNImputer(n_neighbors=5)),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: one-hot encoding; handle_unknown="ignore" keeps
# transform from raising on categories that were not seen during fit.
categorical_steps = [
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

# Candidate classifiers, keyed by the display name used in the results printout.
# Estimators that use randomness get random_state=42 so that re-running the
# cell reproduces the same cross-validation scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Perceptron": Perceptron(random_state=42),
    "SGD": SGDClassifier(max_iter=1000, tol=1e-3, random_state=42),
    "Linear SVC": LinearSVC(max_iter=2000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

# Hyper-parameter grids, keyed by the same display names as `models`.
# The "classifier__" prefix addresses the pipeline step named "classifier".
# Models without an entry here are scored with their default settings.
param_grids = {
    "Logistic Regression": {
        "classifier__C": [0.1, 1, 10],
    },
    "SVM": {
        "classifier__C": [0.5, 1, 10],
        "classifier__kernel": ["rbf", "linear"],
    },
    "KNN": {
        "classifier__n_neighbors": [3, 5, 7, 9],
    },
    "Random Forest": {
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [4, 6, 8],
    },
    "Decision Tree": {
        "classifier__max_depth": [3, 5, 7, 9],
    },
}

print("\n=== Improved Accuracies (Optimized) ===")
for name, model in models.items():
    # Preprocessing is part of the pipeline, so it is fitted together with
    # the classifier each time the pipeline is fitted.
    pipe = Pipeline([("preprocessor", preprocessor), ("classifier", model)])

    # No grid for this model: report plain 5-fold CV accuracy and move on.
    if name not in param_grids:
        scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
        print(f"{name}: {scores.mean():.4f}")
        continue

    # Otherwise tune over the model's grid and report the best mean CV accuracy.
    grid = GridSearchCV(
        pipe,
        param_grids[name],
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
    )
    grid.fit(X, y)
    print(f"{name}: {grid.best_score_:.4f} (best params: {grid.best_params_})")



=== Improved Accuracies (Optimized) ===
Logistic Regression: 0.8260 (best params: {'classifier__C': 1})
SVM: 0.8294 (best params: {'classifier__C': 1, 'classifier__kernel': 'rbf'})
KNN: 0.8137 (best params: {'classifier__n_neighbors': 5})
Random Forest: 0.8350 (best params: {'classifier__max_depth': 6, 'classifier__n_estimators': 100})
Naive Bayes: 0.7677
Perceptron: 0.7396
SGD: 0.7667
Linear SVC: 0.8260
Decision Tree: 0.8148 (best params: {'classifier__max_depth': 3})


In [2]:
# =====================
# === Extension Cell (Full) ===
# =====================

import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Folder containing the Titanic CSV files; the tree plot below is saved here too.
csv_dir = r"C:\Users\punna\Downloads\titanic"

# Categorical branch for this cell: fill gaps with the most frequent category,
# then one-hot encode into a dense array.
categorical_imputer = SimpleImputer(strategy="most_frequent")
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_transformer = Pipeline([
    ("imputer", categorical_imputer),
    ("encoder", categorical_encoder),
])

# numeric_transformer, numeric_cols and categorical_cols come from the first cell.
column_branches = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)


# Decision tree on top of the shared preprocessing, seeded for reproducibility.
dt_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=42))
])

# "log_loss" is intentionally not listed: scikit-learn documents it as the same
# Shannon information gain criterion as "entropy", so it would only repeat the
# "entropy" fits (a third of the grid) without being able to change the winner.
param_grid_dt = {
    "classifier__criterion": ["gini", "entropy"],
    "classifier__max_depth": [3, 5, 7, 9, None],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
}

# Shuffled, seeded stratified folds; this splitter is reused for the random forest search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# refit=True retrains the best configuration on all of X, y so that
# grid_dt.best_estimator_ is available afterwards.
grid_dt = GridSearchCV(dt_pipeline, param_grid_dt, cv=cv,
                       scoring="accuracy", n_jobs=-1, refit=True)
grid_dt.fit(X, y)

print("\nBest Decision Tree CV accuracy: ", grid_dt.best_score_)
print("Best DT parameters:", grid_dt.best_params_)


best_dt = grid_dt.best_estimator_

# Rebuild the column names of the transformed matrix: numeric columns first,
# then the one-hot columns, matching the order of the ColumnTransformer branches.
ohe = best_dt.named_steps["preprocessor"].named_transformers_["cat"].named_steps["encoder"]
# Passed as a plain Python list rather than a NumPy array.
# NOTE(review): some scikit-learn releases appear to accept only a list for
# plot_tree's feature_names - confirm against the installed version.
feature_names = list(numeric_cols) + ohe.get_feature_names_out(categorical_cols).tolist()

# Save plot in the same directory as Titanic CSV files
plot_path = os.path.join(csv_dir, "decision_tree_preview.png")

# Explicit figure/axes handles instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(22, 12))
plot_tree(best_dt.named_steps["classifier"],
          feature_names=feature_names,
          class_names=["Not Survived", "Survived"],
          filled=True, rounded=True, max_depth=3,  # draw only the top 3 levels
          ax=ax)
fig.tight_layout()
fig.savefig(plot_path, dpi=200)
plt.close(fig)  # release the figure; the plot is written to disk, not shown inline
print(f"Decision tree plot saved to: {plot_path}")


# Random forest on top of the shared preprocessing, seeded for reproducibility.
# The forest itself is left at its default n_jobs: GridSearchCV below already
# runs candidates in parallel with n_jobs=-1, and asking every worker to also
# use all cores would oversubscribe the CPU. The fitted model does not depend
# on n_jobs, so the scores are unaffected.
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

param_grid_rf = {
    "classifier__n_estimators": [200, 400],
    "classifier__max_depth": [None, 6, 10],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
    "classifier__max_features": ["sqrt", None],
}

# Same `cv` splitter as the decision tree search, so both models are scored on identical folds.
grid_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=cv,
                       scoring="accuracy", n_jobs=-1, refit=True)
grid_rf.fit(X, y)

print("\nBest Random Forest CV accuracy: ", grid_rf.best_score_)
print("Best RF parameters:", grid_rf.best_params_)


# Side-by-side summary of the two tuned models' best mean CV accuracy.
print("\n=== Model Comparison (5-Fold CV Accuracy) ===")
for label, search in (("Decision Tree", grid_dt), ("Random Forest", grid_rf)):
    print(f"{label}: {search.best_score_:.4f}")

# The forest message is printed only when it is strictly better; ties fall to the tree message.
verdict = (
    "→ Random Forest performs better due to reduced variance and ensemble averaging."
    if grid_rf.best_score_ > grid_dt.best_score_
    else "→ Decision Tree performs similarly or slightly better; check overfitting or data size."
)
print(verdict)



Best Decision Tree CV accuracy:  0.8361245370660976
Best DT parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 5, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10}
Decision tree plot saved to: C:\Users\punna\Downloads\titanic\decision_tree_preview.png

Best Random Forest CV accuracy:  0.855200552382148
Best RF parameters: {'classifier__max_depth': 10, 'classifier__max_features': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 400}

=== Model Comparison (5-Fold CV Accuracy) ===
Decision Tree: 0.8361
Random Forest: 0.8552
→ Random Forest performs better due to reduced variance and ensemble averaging.
