In [3]:
# Titanic Data Preprocessing and Modeling (Improved)

import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# =====================
# Load Data
# =====================
train = pd.read_csv(r"C:\Users\punna\Downloads\titanic\train.csv")
test = pd.read_csv(r"C:\Users\punna\Downloads\titanic\train.csv")
gender = pd.read_csv(r"C:\Users\punna\Downloads\titanic\train.csv")  # only used for Kaggle submission, not training

# =====================
# Feature Engineering
# =====================
for df in [train, test]:
    # Extract Title
    df["Title"] = df["Name"].str.extract(" ([A-Za-z]+)\.", expand=False)
    df["Title"] = df["Title"].replace(["Lady","Countess","Capt","Col","Don","Dr",
                                       "Major","Rev","Sir","Jonkheer","Dona"], "Rare")
    df["Title"] = df["Title"].replace({"Mlle":"Miss","Ms":"Miss","Mme":"Mrs"})
    
    # Family-related features
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)
    
    # Fare per person
    df["FarePerPerson"] = df["Fare"] / df["FamilySize"]
    
    # Deck from Cabin
    df["Deck"] = df["Cabin"].astype(str).str[0]

# Drop unused
train = train.drop(["PassengerId","Name","Ticket","Cabin"], axis=1)
test_passenger_ids = test["PassengerId"]
test = test.drop(["PassengerId","Name","Ticket","Cabin"], axis=1)

# Separate X and y
X = train.drop("Survived", axis=1)
y = train["Survived"]

# Define categorical & numeric columns
categorical_cols = ["Sex", "Embarked", "Title", "Deck"]
numeric_cols = [col for col in X.columns if col not in categorical_cols]

# =====================
# Preprocessing
# =====================
numeric_transformer = Pipeline(steps=[
    ("imputer", KNNImputer(n_neighbors=5)),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ])

# =====================
# Models
# =====================
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "Perceptron": Perceptron(),
    "SGD": SGDClassifier(max_iter=1000, tol=1e-3),
    "Linear SVC": LinearSVC(max_iter=2000),
    "Decision Tree": DecisionTreeClassifier()
}

# Hyperparameter grids
param_grids = {
    "Logistic Regression": {"classifier__C":[0.1,1,10]},
    "SVM": {"classifier__C":[0.5,1,10], "classifier__kernel":["rbf","linear"]},
    "KNN": {"classifier__n_neighbors":[3,5,7,9]},
    "Random Forest": {"classifier__n_estimators":[100,200], "classifier__max_depth":[4,6,8]},
    "Decision Tree": {"classifier__max_depth":[3,5,7,9]}
}

print("\n=== Improved Accuracies (Optimized) ===")
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("classifier", model)])
    
    if name in param_grids:
        grid = GridSearchCV(pipe, param_grids[name], cv=5,
                            scoring="accuracy", n_jobs=-1)
        grid.fit(X, y)
        print(f"{name}: {grid.best_score_:.4f} (best params: {grid.best_params_})")
    else:
        scores = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
        print(f"{name}: {scores.mean():.4f}")



=== Improved Accuracies (Optimized) ===
Logistic Regression: 0.8260 (best params: {'classifier__C': 1})
SVM: 0.8294 (best params: {'classifier__C': 1, 'classifier__kernel': 'rbf'})
KNN: 0.8103 (best params: {'classifier__n_neighbors': 5})
Random Forest: 0.8316 (best params: {'classifier__max_depth': 6, 'classifier__n_estimators': 100})
Naive Bayes: 0.7677
Perceptron: 0.6801
SGD: 0.7453
Linear SVC: 0.8249
Decision Tree: 0.8148 (best params: {'classifier__max_depth': 3})
