In [6]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [9]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename == 'train.csv':
            train_path = os.path.join(dirname, filename)
        if filename == 'test.csv':
            test_path = os.path.join(dirname, filename)

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

print(df_train.shape, df_test.shape)


(891, 12) (418, 11)


In [10]:
df_train["Title"] = df_train["Name"].str.extract(r",\s*([^\.]+)\.")
df_train['Title'].value_counts()

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Mlle              2
Major             2
Ms                1
Mme               1
Don               1
Lady              1
Sir               1
Capt              1
the Countess      1
Jonkheer          1
Name: count, dtype: int64

In [11]:
rare_titles = [
    'Dr', 'Rev', 'Col', 'Major', 'Don', 'Lady', 'Sir',
    'Capt', 'the Countess', 'Jonkheer', 'Dona'
]

df_train['Title'] = df_train['Title'].replace(['Mlle', 'Ms'], 'Miss')
df_train['Title'] = df_train['Title'].replace('Mme', 'Mrs')
df_train['Title'] = df_train['Title'].replace(rare_titles, 'Rare')
df_train.drop(columns=['Name'], inplace=True)
df_train['Title'].value_counts()

Title
Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: count, dtype: int64

In [12]:
def create_features(df):
    df = df.copy()
    
    # # Title
    # df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    # df['Title'] = df['Title'].replace(
    #     ['Miss','Mrs','Ms','Mlle','Lady','Mme','the Countess','Dona'],
    #     'Miss/Mrs/Ms'
    # )
    # df['Title'] = df['Title'].replace(
    #     ['Dr','Col','Major','Jonkheer','Capt','Sir','Don','Rev'],
    #     'Rare'
    # )

    # Family size
    df['Family_Size'] = df['SibSp'] + df['Parch'] + 1

    # Ticket frequency
    df['Ticket_Frequency'] = df.groupby('Ticket')['Ticket'].transform('count')

    # Drop useless
    df.drop(['Name','Ticket','Cabin','PassengerId'], axis=1, inplace=True)

    return df


In [4]:
df_train = create_features(df_train)
df_test  = create_features(df_test)

y = df_train['Survived']
X = df_train.drop('Survived', axis=1)


In [5]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Family_Size,Ticket_Frequency
0,0,3,male,22.0,1,0,7.2500,S,Mr,2,1
1,1,1,female,38.0,1,0,71.2833,C,Miss/Mrs/Ms,2,1
2,1,3,female,26.0,0,0,7.9250,S,Miss/Mrs/Ms,1,1
3,1,1,female,35.0,1,0,53.1000,S,Miss/Mrs/Ms,2,2
4,0,3,male,35.0,0,0,8.0500,S,Mr,1,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Rare,1,1
887,1,1,female,19.0,0,0,30.0000,S,Miss/Mrs/Ms,1,1
888,0,3,female,,1,2,23.4500,S,Miss/Mrs/Ms,4,2
889,1,1,male,26.0,0,0,30.0000,C,Mr,1,1


In [5]:
num_cols = ['Age','Fare','Family_Size','Ticket_Frequency','SibSp','Parch']
cat_cols = ['Sex','Embarked','Pclass','Title']

num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocess = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])


In [6]:
model = RandomForestClassifier(
    n_estimators=800,
    max_depth=7,
    random_state=42
)

pipe = Pipeline([
    ('prep', preprocess),
    ('model', model)
])


In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(pipe, X, y, cv=skf, scoring='accuracy')
print("CV Accuracy:", scores.mean())


CV Accuracy: 0.8338773460548616


In [8]:
pipe.fit(X, y)

test_pred = pipe.predict(df_test)


In [9]:
submission = pd.DataFrame({
    "PassengerId": pd.read_csv(test_path)['PassengerId'],
    "Survived": test_pred.astype(int)
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
