In [45]:
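# One-time environment setup (uncomment to run): install ipykernel and register a kernel named "titanic-venv".
# %pip install ipykernel
# !python -m ipykernel install --user --name titanic-venv --display-name "Python (titanic-venv)"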

In [None]:
# One-time dependency install (uncomment to run); %pip targets the running kernel's environment.
# %pip install numpy pandas scikit-learn

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Locations of the Titanic train/test CSVs, relative to the notebook's working directory.
train_path = "../data/raw/titanic/train.csv"
test_path = "../data/raw/titanic/test.csv"

# Read both splits in one pass; the order of the tuple fixes which frame is which.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Sanity check on (rows, columns) of each split.
print(df_train.shape, df_test.shape)

(891, 12) (418, 11)


In [49]:
# Exploratory check: capture the text between the comma and the first period
# of each name (the honorific) and tabulate how often each one occurs.
df_train['Title'] = df_train['Name'].str.extract(r",\s*([^\.]+)\.", expand=False)
df_train['Title'].value_counts()

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Major             2
Mlle              2
Col               2
Don               1
Mme               1
Ms                1
Lady              1
Sir               1
Capt              1
the Countess      1
Jonkheer          1
Name: count, dtype: int64

In [50]:
def create_features(df):
    """Derive the engineered passenger features and drop the raw source columns.

    The input frame is copied first, so the caller's frame is not modified.

    Columns added:
        Title         Honorific captured from ``Name`` (text between the comma
                      and the first period). ``Mlle``/``Ms`` become ``Miss``,
                      ``Mme`` becomes ``Mrs`` and the listed uncommon titles
                      become ``Rare``; anything else is kept as extracted.
        Deck          First character of ``Cabin`` bucketed into ``ABC``
                      (``T`` is counted with ``A``), ``DE`` or ``FG``; a missing
                      cabin becomes ``Unknown``.
        Family_Size   ``SibSp + Parch + 1``.
        Ticket_Group  ``Solo`` (1 row), ``Small_Group`` (2-4 rows) or
                      ``Large_Group`` (otherwise), based on how many rows of
                      *this* frame share the same ticket string.

    Columns removed: ``Name``, ``Ticket``, ``Cabin``, ``PassengerId`` and the
    intermediate ``Ticket_Frequency``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name``, ``Cabin``, ``SibSp``, ``Parch``, ``Ticket`` and
        ``PassengerId``; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.
    """
    # Source title -> replacement. None of the replacement values is itself a
    # key, so a single dict-based replace gives the same result as replacing
    # group by group.
    title_map = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    title_map.update(dict.fromkeys(
        ['Dr', 'Rev', 'Col', 'Major', 'Don', 'Lady', 'Sir',
         'Capt', 'the Countess', 'Jonkheer', 'Dona'],
        'Rare',
    ))

    # Deck letter -> bucket ('T' is treated as 'A', hence lands in 'ABC').
    deck_map = {
        'T': 'ABC', 'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
        'D': 'DE', 'E': 'DE',
        'F': 'FG', 'G': 'FG',
    }

    def _ticket_bucket(n_sharing):
        """Label a ticket by the number of rows that share it."""
        if n_sharing == 1:
            return 'Solo'
        return 'Small_Group' if 2 <= n_sharing <= 4 else 'Large_Group'

    out = df.copy()

    # Title
    out['Title'] = (
        out['Name']
        .str.extract(r",\s*([^\.]+)\.", expand=False)
        .replace(title_map)
    )

    # Deck
    out['Deck'] = out['Cabin'].str[0].fillna('Unknown').replace(deck_map)

    # Family size (the passenger counts as one member)
    out['Family_Size'] = out['SibSp'] + out['Parch'] + 1

    # Ticket group: tickets are compared as strings, counted within this frame only
    out['Ticket'] = out['Ticket'].astype(str)
    out['Ticket_Frequency'] = out.groupby('Ticket')['Ticket'].transform('count')
    out['Ticket_Group'] = out['Ticket_Frequency'].apply(_ticket_bucket)

    # Drop the raw inputs and the intermediate frequency column
    return out.drop(
        columns=['Name', 'Ticket', 'Ticket_Frequency', 'Cabin', 'PassengerId']
    )

In [51]:
# Run the same feature engineering over both splits.
# NOTE: this rebinds df_train/df_test to the engineered frames, so the cell
# cannot be re-run without reloading the CSVs (create_features needs 'Name',
# which it drops).
df_train, df_test = (create_features(frame) for frame in (df_train, df_test))

# Separate the predictors from the target column.
X = df_train.drop(columns='Survived')
y = df_train['Survived']


In [52]:
# Show the training predictors' and the test frame's columns one after the
# other so they can be compared by eye.
print(X.columns, df_test.columns, sep="\n")


Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title',
       'Deck', 'Family_Size', 'Ticket_Group'],
      dtype='str')
Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title',
       'Deck', 'Family_Size', 'Ticket_Group'],
      dtype='str')


In [57]:
# Inspect the engineered training frame (a bare last expression is rendered by the notebook).
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Deck,Family_Size,Ticket_Group
0,0,3,male,22.0,1,0,7.2500,S,Mr,Unknown,2,Solo
1,1,1,female,38.0,1,0,71.2833,C,Mrs,ABC,2,Solo
2,1,3,female,26.0,0,0,7.9250,S,Miss,Unknown,1,Solo
3,1,1,female,35.0,1,0,53.1000,S,Mrs,ABC,2,Small_Group
4,0,3,male,35.0,0,0,8.0500,S,Mr,Unknown,1,Solo
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Rare,Unknown,1,Solo
887,1,1,female,19.0,0,0,30.0000,S,Miss,ABC,1,Solo
888,0,3,female,,1,2,23.4500,S,Miss,Unknown,4,Small_Group
889,1,1,male,26.0,0,0,30.0000,C,Mr,ABC,1,Solo


In [None]:
# Columns routed through the numeric branch.
num_cols = ['Age', 'Fare', 'Family_Size', 'SibSp', 'Parch']

# Columns routed through the categorical branch (Pclass is one-hot encoded
# rather than treated as a number).
cat_cols = ['Sex', 'Embarked', 'Pclass', 'Title', 'Deck', 'Ticket_Group']

# Numeric branch: fill missing values with the column median, then standardise.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on a
# category that was not present when the encoder was fitted.
cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column list and concatenate the results.
preprocess = ColumnTransformer(transformers=[
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])


In [59]:
# Random forest: 800 trees, each limited to depth 7, with a fixed seed.
model = RandomForestClassifier(n_estimators=800, max_depth=7, random_state=42)

# Full estimator: preprocessing followed by the classifier, so the
# preprocessing is fitted together with the model on whatever data is passed.
pipe = Pipeline(steps=[('prep', preprocess), ('model', model)])

In [60]:
# 5-fold stratified split, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of the whole pipeline on each fold; report the mean across folds.
scores = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=skf)
print("CV Accuracy:", scores.mean())

CV Accuracy: 0.831617600903898


In [61]:
# Fit on the full training set, then predict the test set
# (fit returns the fitted pipeline, so the calls can be chained).
test_pred = pipe.fit(X, y).predict(df_test)

In [62]:
# PassengerId was dropped by create_features, so it is read back from the
# test CSV and paired with the integer predictions.
submission = (
    pd.read_csv(test_path)[["PassengerId"]]
    .assign(Survived=test_pred.astype(int))
)

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
