In [7]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [8]:
# Locate the competition CSVs anywhere below the Kaggle input directory.
# Both paths start as None so that a missing file is reported explicitly
# instead of surfacing as a NameError (or silently reusing a stale path left
# in the kernel by an earlier run).
INPUT_ROOT = '/kaggle/input'
train_path = None
test_path = None

for dirname, _, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        # If several files share a name, the last one visited wins.
        if filename == 'train.csv':
            train_path = os.path.join(dirname, filename)
        elif filename == 'test.csv':
            test_path = os.path.join(dirname, filename)

missing = [name for name, path in (('train.csv', train_path), ('test.csv', test_path))
           if path is None]
if missing:
    raise FileNotFoundError(
        f"Could not find {', '.join(missing)} under {INPUT_ROOT!r}")

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

print(df_train.shape, df_test.shape)


(891, 12) (418, 11)


In [9]:
def create_features(df, ticket_counts=None):
    """Return a new frame with engineered passenger features.

    Adds four columns and then drops the raw ones they were derived from:

    * ``Title`` - honorific parsed from ``Name`` (text between the comma and
      the first period); ``Mlle``/``Ms`` become ``Miss``, ``Mme`` becomes
      ``Mrs`` and uncommon titles are collapsed into ``Rare``.
    * ``Deck`` - first character of ``Cabin`` bucketed into ``ABC``
      (including ``T``), ``DE`` or ``FG``; missing cabins become ``Unknown``.
    * ``Family_Size`` - ``SibSp + Parch + 1``.
    * ``Ticket_Group`` - ``Solo`` (ticket seen once), ``Small_Group``
      (2 to 4 times) or ``Large_Group`` (anything else).

    The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name``, ``Cabin``, ``SibSp``, ``Parch``, ``Ticket``
        and ``PassengerId``.
    ticket_counts : mapping or pandas.Series, optional
        Maps a ticket (as ``str``) to the number of passengers sharing it.
        When omitted, tickets are counted within ``df`` only, so calling
        this function separately on two frames buckets each from its own
        counts. Passing shared counts (e.g. computed over both frames)
        keeps ``Ticket_Group`` consistent between them. Tickets absent from
        ``ticket_counts`` fall back to their count within ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId``,
        followed by ``Title``, ``Deck``, ``Family_Size`` and
        ``Ticket_Group``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    rare_titles = [
        'Dr', 'Rev', 'Col', 'Major', 'Don', 'Lady', 'Sir',
        'Capt', 'the Countess', 'Jonkheer', 'Dona']

    title_map = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    title_map.update(dict.fromkeys(rare_titles, 'Rare'))

    deck_map = {
        'T': 'ABC', 'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
        'D': 'DE', 'E': 'DE',
        'F': 'FG', 'G': 'FG',
    }

    # expand=False yields a Series; the default (expand=True) would return a
    # one-column DataFrame even though there is a single capture group.
    title = (df["Name"].str.extract(r",\s*([^\.]+)\.", expand=False)
             .replace(title_map))

    deck = df["Cabin"].str[0].fillna("Unknown").replace(deck_map)

    # Tickets are compared as strings so mixed int/str values group together.
    tickets = df["Ticket"].astype(str)
    in_frame_counts = tickets.map(tickets.value_counts())
    if ticket_counts is None:
        ticket_freq = in_frame_counts
    else:
        ticket_freq = tickets.map(ticket_counts).fillna(in_frame_counts)

    # Vectorised bucketing; the first matching condition wins.
    ticket_group = np.select(
        [(ticket_freq == 1).to_numpy(), ticket_freq.between(2, 4).to_numpy()],
        ['Solo', 'Small_Group'],
        default='Large_Group',
    )

    # assign() returns a new frame, so the caller's frame is never mutated.
    return (
        df.assign(
            Title=title,
            Deck=deck,
            Family_Size=df["SibSp"] + df["Parch"] + 1,
            Ticket_Group=ticket_group,
        )
        .drop(columns=["Name", "Ticket", "Cabin", "PassengerId"])
    )


In [10]:
# Swap both raw frames for their feature-engineered versions.
# NOTE: create_features drops "Name", "Ticket", "Cabin" and "PassengerId", so
# running this cell again on the already-transformed frames raises a KeyError;
# re-run the loading cell first.
df_train, df_test = (create_features(frame) for frame in (df_train, df_test))

# Separate the predictors from the target column.
X = df_train.drop(columns='Survived')
y = df_train['Survived']

In [16]:
# Show the predictor columns of the training and test frames, one after the
# other, so they can be compared by eye.
for frame in (X, df_test):
    print(frame.columns)

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title',
       'Deck', 'Family_Size', 'Ticket_Group'],
      dtype='object')
Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title',
       'Deck', 'Family_Size', 'Ticket_Group'],
      dtype='object')


In [17]:
# Display the feature-engineered training frame (target plus predictors).
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Deck,Family_Size,Ticket_Group
0,0,3,male,22.0,1,0,7.2500,S,Mr,Unknown,2,Solo
1,1,1,female,38.0,1,0,71.2833,C,Mrs,ABC,2,Solo
2,1,3,female,26.0,0,0,7.9250,S,Miss,Unknown,1,Solo
3,1,1,female,35.0,1,0,53.1000,S,Mrs,ABC,2,Small_Group
4,0,3,male,35.0,0,0,8.0500,S,Mr,Unknown,1,Solo
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Rare,Unknown,1,Solo
887,1,1,female,19.0,0,0,30.0000,S,Miss,ABC,1,Solo
888,0,3,female,,1,2,23.4500,S,Miss,Unknown,4,Small_Group
889,1,1,male,26.0,0,0,30.0000,C,Mr,ABC,1,Solo


In [18]:
# Columns routed to each preprocessing branch.
num_cols = ['Age', 'Fare', 'Family_Size', 'SibSp', 'Parch']
cat_cols = ['Sex', 'Embarked', 'Pclass', 'Title', 'Deck', 'Ticket_Group']

# Numeric branch: fill gaps with the column median, then standardise.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric features come first, followed by the encoded categoricals.
preprocess = ColumnTransformer(transformers=[
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])

In [19]:
# Random-forest settings; the fixed random_state seeds the forest.
rf_params = {
    'n_estimators': 800,
    'max_depth': 7,
    'random_state': 42,
}
model = RandomForestClassifier(**rf_params)

# Full estimator: preprocessing followed by the classifier.
pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('model', model),
])


In [20]:
# Five shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The whole pipeline is passed to cross_val_score, so it is refit per fold.
scores = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=skf)
mean_accuracy = scores.mean()
print("CV Accuracy:", mean_accuracy)


CV Accuracy: 0.831617600903898


In [None]:
# Fit on the full training set, then predict for the test frame
# (Pipeline.fit returns the fitted pipeline itself).
test_pred = pipe.fit(X, y).predict(df_test)

In [None]:
# PassengerId was dropped during feature engineering, so it is read back from
# the original test CSV to label the predictions.
passenger_ids = pd.read_csv(test_path)['PassengerId']
survived = test_pred.astype(int)

submission = pd.DataFrame({"PassengerId": passenger_ids, "Survived": survived})
submission.to_csv("submission.csv", index=False)

submission.head()


In [None]:
#