# Import Libraries

In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

In [2]:
# Print the full path of every file available under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Load and Explore the Data

In [3]:
# Competition CSVs as listed under /kaggle/input/titanic
train_path = "/kaggle/input/titanic/train.csv"
test_path = "/kaggle/input/titanic/test.csv"
df, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Preview of the test split (first three rows)
print("---------- TEST ----------")
print("test shape", test.shape)
print(test.head(3))

# Preview of the training split (first five rows)
print("---------- TRAIN ----------")
print("train shape", df.shape)
print(df.head())

---------- TEST ----------
test shape (418, 11)
   PassengerId  Pclass                              Name     Sex   Age  SibSp  \
0          892       3                  Kelly, Mr. James    male  34.5      0   
1          893       3  Wilkes, Mrs. James (Ellen Needs)  female  47.0      1   
2          894       2         Myles, Mr. Thomas Francis    male  62.0      0   

   Parch  Ticket    Fare Cabin Embarked  
0      0  330911  7.8292   NaN        Q  
1      0  363272  7.0000   NaN        S  
2      0  240276  9.6875   NaN        Q  
---------- TRAIN ----------
train shape (891, 12)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florenc

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric training columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Dtype and non-null count per column; a non-null count below the row count marks missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# EDA

In [6]:
# Number of missing values in each training column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Feature Engineering

In [7]:
# The Ticket column is ignored for now; it will be added as a feature in a later version.
# Regexes for cabin codes such as "C85": a run of ASCII letters, and a run of digits.
pattern_deck, pattern_num = re.compile(r"[a-zA-Z]+"), re.compile(r"\d+")
def handle_null(df):
    """Return a copy of ``df`` with missing values filled or dropped.

    * ``Age``   -> filled with the median ``Age`` of the frame passed in.
    * ``Cabin`` -> filled with the placeholder string ``"unknown"``.
    * ``Fare``  -> filled with the mean ``Fare`` of the frame passed in.
    * Rows with a missing ``Embarked`` are dropped, so the result can have
      fewer rows than the input (the original index labels are kept).

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Age``, ``Cabin``, ``Fare`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
    """
    # The fill statistics are computed before any row is dropped, so rows that
    # are about to be removed for a missing Embarked still contribute to them.
    filled = df.assign(
        Age=df["Age"].fillna(df["Age"].median()),
        Cabin=df["Cabin"].fillna("unknown"),
        Fare=df["Fare"].fillna(df["Fare"].mean()),
    )
    # dropna without inplace returns a new frame instead of editing the caller's.
    return filled.dropna(subset=["Embarked"])

def handle_textual_categories(df):
    """Return a copy of ``df`` with ``Sex`` and ``Cabin`` turned into model-ready columns.

    * ``Sex`` becomes 1 for ``"male"`` and 0 for anything else.
    * ``multiple_cabin`` is 1 when the cabin entry lists more than one
      whitespace-separated cabin, else 0.
    * ``deck`` is the first run of ASCII letters in the first listed cabin
      (``"C85 C87"`` -> ``"C"``); ``"unknown"`` when there are no letters.
      The placeholder ``"unknown"`` therefore maps to deck ``"unknown"``.
    * ``cabin_number`` is the first run of digits in the first listed cabin as
      an int (``"C85"`` -> ``85``); ``-1`` when there are no digits.
    * ``Cabin`` itself is removed.

    A missing (NaN) cabin is treated like the ``"unknown"`` placeholder rather
    than raising. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Sex`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
    """
    out = df.copy()
    out["Sex"] = (out["Sex"] == "male").astype("int")  # 1 for male, 0 for female

    # Normalise to plain strings so the .str accessor is always valid
    cabin = out["Cabin"].astype("object").fillna("unknown").astype(str)
    out["multiple_cabin"] = (cabin.str.split().str.len() > 1).astype(int)

    # When several cabins are listed only the first one is used
    first_cabin = cabin.str.extract(r"(\S+)", expand=False)
    out["deck"] = first_cabin.str.extract(r"([a-zA-Z]+)", expand=False).fillna("unknown")
    digits = first_cabin.str.extract(r"(\d+)", expand=False)
    out["cabin_number"] = pd.to_numeric(digits).fillna(-1).astype(int)

    return out.drop(columns="Cabin")

def add_family_features(df):
    """Add family-based columns to ``df`` in place and return the same frame.

    ``FamilySize`` is ``SibSp + Parch + 1``; ``IsAlone`` is 1 when
    ``FamilySize`` equals 1 and 0 otherwise.
    """
    family_size = df['SibSp'].add(df['Parch']).add(1)
    df['FamilySize'] = family_size
    df['IsAlone'] = df['FamilySize'].eq(1).astype(int)
    return df


def encoding(df, encoder=None, fit=True):
    """One-hot encode the ``Embarked`` and ``deck`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Embarked`` and ``deck``.
    encoder : optional
        Previously fitted encoder to reuse. Ignored when ``fit`` is true.
    fit : bool, default True
        When true, or when no encoder is supplied, a new ``OneHotEncoder``
        (unknown categories ignored, dense output) is fitted on ``df``.

    Returns
    -------
    (pandas.DataFrame, encoder)
        A new frame in which the two categorical columns are replaced by their
        one-hot columns (appended on the right, same index as ``df``), and the
        encoder that produced them.
    """
    categorical_features = ["Embarked", "deck"]

    needs_new_encoder = fit or encoder is None
    if needs_new_encoder:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        encoder.fit(df[categorical_features])

    one_hot = pd.DataFrame(
        encoder.transform(df[categorical_features]),
        columns=encoder.get_feature_names_out(categorical_features),
        index=df.index,
    )
    remaining = df.drop(categorical_features, axis=1)
    return pd.concat([remaining, one_hot], axis=1), encoder


def add_age_group(df):
    """Add a categorical ``AgeGroup`` column to ``df`` in place and return the same frame.

    Ages are cut into right-inclusive bands, e.g. an age of exactly 12 is a
    ``Child`` and 13 is a ``Teen``.
    """
    # label -> inclusive upper bound of the band; the lowest band opens at -1
    upper_bounds = {
        'Child': 12,
        'Teen': 19,
        'YoungAdult': 35,
        'Adult': 50,
        'Senior': 65,
        'Elderly': 120,
    }
    edges = [-1, *upper_bounds.values()]
    df['AgeGroup'] = pd.cut(df['Age'], bins=edges, labels=list(upper_bounds))
    return df


def scaling(df, scaler=None, fit=True):
    """Scale the ``Age`` and ``Fare`` columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Age`` and ``Fare``; these two columns are overwritten.
    scaler : optional
        Previously fitted scaler to reuse. Ignored when ``fit`` is true.
    fit : bool, default True
        When true, or when no scaler is supplied, a new ``StandardScaler`` is
        fitted on ``df``.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The same frame object that was passed in, and the scaler that was applied.
    """
    features = ["Age", "Fare"]

    if scaler is None or fit:
        scaler = StandardScaler()
        scaler.fit(df[features])

    # Rebuild a frame on the original index so the assignment aligns row by row
    df[features] = pd.DataFrame(
        scaler.transform(df[features]),
        index=df.index,
        columns=features,
    )
    return df, scaler

In [8]:
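# Feature ideas (FamilySize and IsAlone are implemented in add_family_features; the rest are still to do):
# FamilySize    = SibSp + Parch + 1
# IsAlone       = 1 for a solo traveller (FamilySize == 1)
# Title         = extracted from Name; captures social status and gender role
# HasCabin      = Cabin != "unknown"
# FarePerPerson = Fare / FamilySize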

In [9]:
# Clean -> derive cabin columns -> derive family columns, working on a copy of the raw frame
dfx = (
    df.copy()
    .pipe(handle_null)
    .pipe(handle_textual_categories)
    .pipe(add_family_features)
)

# Columns fed to the models; Embarked and deck are one-hot encoded, Age and Fare are scaled
features = [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare",
    "Embarked", "deck", "FamilySize", "IsAlone",
]
dfy, encoder = encoding(dfx.loc[:, features].copy())
X, scaler = scaling(dfy.copy())

# Target taken from the cleaned frame so it shares X's index (rows dropped by handle_null are gone from both)
y = dfx["Survived"].copy()

# Model Training

In [10]:
# Hold out 20% of the labelled rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [11]:
# NOTE: despite its name, `lr` holds the XGBoost classifier, not a logistic regression.
lr = XGBClassifier(
    # boosting schedule
    n_estimators=400,
    learning_rate=0.05,
    # tree shape
    max_depth=3,
    min_child_weight=3,
    gamma=0.2,                # minimum loss reduction
    # row / column subsampling
    subsample=0.85,
    colsample_bytree=0.9,
    # penalties
    reg_alpha=0.4,            # L1 regularization
    reg_lambda=1.2,           # L2 regularization
    random_state=42,
    eval_metric='logloss',
)

lr.fit(X_train, y_train)

In [12]:
# Score the XGBoost model (bound to `lr`) on the training split and on the held-out split;
# for a classifier, .score() reports mean accuracy.
print(f"train score: {lr.score(X_train, y_train)}")
print(f"Test score: {lr.score(X_test, y_test)}")

train score: 0.8987341772151899
Test score: 0.8314606741573034


In [13]:
# Hard-voting ensemble of three different model families: each fitted model
# predicts a class per passenger and the majority class is returned.
# VotingClassifier / RandomForestClassifier / LogisticRegression are imported
# in the notebook's import cell.
ensemble = VotingClassifier(
    estimators=[
        # `use_label_encoder` is no longer passed: it is deprecated in XGBoost
        # and only triggers a warning; the 0/1 Survived labels need no encoding.
        ('xgb', XGBClassifier(
            n_estimators=200, learning_rate=0.05, max_depth=4,
            subsample=0.8, colsample_bytree=0.8, random_state=42, eval_metric='logloss'
        )),
        ('rf', RandomForestClassifier(
            n_estimators=200, max_depth=6, random_state=42
        )),
        ('lr', LogisticRegression(max_iter=1000, C=0.8, solver='lbfgs'))
    ],
    voting='hard'
)

ensemble.fit(X_train, y_train)

# Model Evaluation

In [14]:
# Same two scores for the voting ensemble, for comparison with the single XGBoost model
print(f"train score: {ensemble.score(X_train, y_train)}")
print(f"Test score: {ensemble.score(X_test, y_test)}")

train score: 0.8818565400843882
Test score: 0.8258426966292135


In [15]:
# Run the Kaggle test split through the same cleaning / feature steps as the training data
test_processed = (
    test.copy()
    .pipe(handle_null)
    .pipe(handle_textual_categories)
    .pipe(add_family_features)
)

# handle_null drops rows without an Embarked value. Every test passenger needs a
# prediction, so fail loudly here instead of producing a misaligned submission later.
assert len(test_processed) == len(test), (
    f"preprocessing dropped {len(test) - len(test_processed)} test row(s); "
    "predictions would no longer line up with PassengerId"
)

features= ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "deck", "FamilySize", "IsAlone"]
# fit=False: reuse the encoder and scaler fitted on the training data
test_processed, encoder = encoding(test_processed[features].copy(), fit=False, encoder=encoder)
test_processed, scaler = scaling(test_processed.copy(), scaler=scaler, fit=False)
test_processed.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,deck_unknown
0,3,1,0.399522,0,0,-0.488579,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,0,1.362718,1,0,-0.505273,2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,1,2.518553,0,0,-0.451165,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,1,-0.178396,0,0,-0.471802,1,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3,0,-0.563674,1,1,-0.398819,3,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Predictions for the submission come from `lr` (the single XGBoost model), not from `ensemble`
prediction= lr.predict(test_processed)

In [17]:
# Attach the predictions to the test frame and write the two columns of the submission file
test = test.assign(Survived=prediction)
test.loc[:, ["PassengerId", "Survived"]].to_csv("submission.csv", index=False)