In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

In [3]:
# Read the Kaggle Titanic train and test CSVs into raw DataFrames.
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Report (rows, columns) of each split.
print("Train shape:", train_raw.shape)
print("Test shape:", test_raw.shape)

Train shape: (891, 12)
Test shape: (418, 11)


In [4]:
# Preview the first rows of the raw training frame.
train_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Summary statistics (count, mean, std, quartiles, extremes) of the raw training frame.
train_raw.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Number of training rows for each value of the Survived target.
train_raw["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [7]:
# Share of training rows for each Survived value (normalize=True yields proportions).
train_raw["Survived"].value_counts(normalize=True)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [8]:
# Missing-value count per training column, most incomplete first.
train_raw.isna().sum().sort_values(ascending=False)

Cabin          687
Age            177
Embarked         2
PassengerId      0
Name             0
Pclass           0
Survived         0
Sex              0
Parch            0
SibSp            0
Fare             0
Ticket           0
dtype: int64

In [9]:
# Missing-value count per test column, most incomplete first.
test_raw.isna().sum().sort_values(ascending=False)


Cabin          327
Age             86
Fare             1
Name             0
Pclass           0
PassengerId      0
Sex              0
Parch            0
SibSp            0
Ticket           0
Embarked         0
dtype: int64

In [10]:
def preprocess(df):
    """Turn a raw Titanic passenger frame into a numeric feature frame.

    Steps: extract a grouped ``Title`` from ``Name``, impute ``Age``/``Fare``
    (median) and ``Embarked`` (mode), add ``FamilySize``/``IsAlone``, drop
    ``Name``/``Ticket``/``Cabin``, encode ``Sex`` as 0/1 and one-hot encode
    ``Embarked`` and ``Title`` (first level dropped).

    The one-hot columns are built from fixed category lists, so every call
    returns the same dummy columns (``Embarked_Q``, ``Embarked_S``,
    ``Title_Miss``, ``Title_Mr``, ``Title_Mrs``, ``Title_Rare``) no matter
    which categories happen to be present in ``df``. This keeps separately
    processed train and test frames column-compatible.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least ``Name``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Ticket``, ``Fare``, ``Cabin`` and ``Embarked``. It is
        not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        The processed copy. Any other input columns are passed through.

    Notes
    -----
    Imputation statistics are computed from ``df`` itself, so train and test
    frames processed separately are filled with their own medians/mode.
    """
    # Fixed levels, in the alphabetical order get_dummies would produce when
    # every level is present; the first level is the dropped baseline.
    embarked_levels = ["C", "Q", "S"]
    title_levels = ["Master", "Miss", "Mr", "Mrs", "Rare"]
    rare_titles = [
        "Lady", "Countess", "Capt", "Col", "Don", "Dr",
        "Major", "Rev", "Sir", "Jonkheer", "Dona",
    ]
    title_aliases = {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}

    df = df.copy()

    # Title from Name: the word immediately before the first "<word>." match
    title = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
    title = title.replace(rare_titles, "Rare").replace(title_aliases)
    # Unlisted or unparsable titles are grouped under "Rare" instead of
    # creating frame-specific columns or silently landing in the baseline.
    df["Title"] = title.where(title.isin(title_levels), "Rare")

    # Fill missing values
    df["Age"] = df["Age"].fillna(df["Age"].median())
    embarked_mode = df["Embarked"].mode()
    if not embarked_mode.empty:
        # mode() is empty when the column is entirely missing; indexing it
        # unconditionally would raise.
        df["Embarked"] = df["Embarked"].fillna(embarked_mode.iloc[0])
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

    # Family features
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Drop noisy columns
    df = df.drop(["Name", "Ticket", "Cabin"], axis=1)

    # Encode Sex
    df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

    # One-hot encoding over the fixed levels; values outside the levels
    # (including still-missing Embarked) become all-zero dummy rows.
    df["Embarked"] = pd.Categorical(df["Embarked"], categories=embarked_levels)
    df["Title"] = pd.Categorical(df["Title"], categories=title_levels)
    df = pd.get_dummies(df, columns=["Embarked", "Title"], drop_first=True)

    return df


In [11]:
# Run both raw splits through the same feature pipeline.
train, test = (preprocess(raw_frame) for raw_frame in (train_raw, test_raw))

# Compare the resulting (rows, columns) of the two processed frames.
(train.shape, test.shape)

((891, 16), (418, 15))

In [12]:
# Largest per-column missing counts left in the processed training frame.
train.isna().sum().sort_values(ascending=False).head()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [13]:
# Total remaining missing cells in the processed train and test frames.
tuple(frame.isna().sum().sum() for frame in (train, test))

(np.int64(0), np.int64(0))

In [14]:
# Target vector, and feature matrix without the target or the row identifier.
y = train["Survived"]
X = train.drop(columns=["Survived", "PassengerId"])

# Keep the test identifiers for the submission; they are not a feature.
test_passenger_id = test["PassengerId"]
test_features = test.drop(columns="PassengerId")

# Give the test features exactly the training columns, zero-filling any
# column the test frame lacks.
X, test_features = X.align(test_features, join="left", axis=1, fill_value=0)

In [15]:
# Stratified 75/25 hold-out split of the labelled data, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [30]:

# Histogram gradient boosting with shallow trees, large leaves and L2
# regularisation; the seed is fixed for reproducibility.
model = HistGradientBoostingClassifier(
    max_iter=600,
    learning_rate=0.05,
    max_depth=2,
    min_samples_leaf=40,
    l2_regularization=2.0,
    random_state=42,
)

# Fit on the training part of the hold-out split.
model.fit(X_train, y_train)

In [31]:

# Predict on both parts of the hold-out split.
train_pred, test_pred = model.predict(X_train), model.predict(X_test)

# Accuracy on the fitted rows versus the held-out rows.
train_acc, test_acc = (
    accuracy_score(y_train, train_pred),
    accuracy_score(y_test, test_pred),
)

print("Train Accuracy:", train_acc)
print("Test  Accuracy:", test_acc)
print("Gap:", train_acc - test_acc)

Train Accuracy: 0.8847305389221557
Test  Accuracy: 0.7982062780269058
Gap: 0.0865242608952499


In [32]:
gap = train_acc - test_acc

# Rule-of-thumb diagnosis: a train/hold-out gap above 0.06 is reported as
# overfitting; both accuracies below 0.7 as underfitting.
print(
    "Overfitting detected"
    if gap > 0.06
    else "Underfitting detected"
    if (train_acc < 0.7 and test_acc < 0.7)
    else "Model fit looks reasonable"
)


Overfitting detected


In [33]:
# Confusion matrix of the hold-out predictions against the true labels.
confusion_matrix(y_true=y_test, y_pred=test_pred)

array([[121,  16],
       [ 29,  57]])

In [34]:
# Shuffled, stratified 5-fold cross-validation over all labelled rows.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=cv)

# Mean and spread of the per-fold accuracies.
(cv_scores.mean(), cv_scores.std())

(np.float64(0.8406251961584331), np.float64(0.020309546604370163))

In [35]:
# Train the submission model on every labelled row. A fresh estimator with
# the same parameters is used so that `model` remains the one fitted on
# X_train only; refitting `model` in place would make the hold-out
# evaluation cells score on rows the model had seen if they are re-run.
final_model = HistGradientBoostingClassifier(**model.get_params())
final_model.fit(X, y)
final_test_pred = final_model.predict(test_features)

In [37]:
# Two-column submission frame: passenger id and the integer survival prediction.
submission = test_passenger_id.to_frame().assign(
    Survived=final_test_pred.astype(int)
)

# Write without the index column, then preview the first rows.
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
