In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

train = pd.read_csv('/kaggle/input/titanic/train.csv', index_col='PassengerId')
test = pd.read_csv('/kaggle/input/titanic/test.csv', index_col='PassengerId')
full_data = pd.concat([train.drop('Survived', axis=1), test])
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
def preprocess(df):
    # imputation
    df['Embarked'].fillna('S', inplace=True)
    df['Age'].fillna(full_data.Age.mean(), inplace=True)
    # feature creation
    df['high_class_lady'] = df.Pclass.isin([1,2]).bool and df.Sex == 'female'
    df['low_class_man'] = df.Pclass.isin([2,3]).bool and df.Sex == 'male'
    df['child'] = df.Age < 6
    df['free_ride'] = df.Fare == 0
    df['embarked_c'] = df.Embarked == 'C'
    df['travel_alone'] = df.SibSp.eq(0).bool and df.Parch.eq(0)
    df['has_cabin'] = pd.isna(df.Cabin)
    return df

train = preprocess(train)
test = preprocess(test)

In [3]:
y = train['Survived']

features = ['high_class_lady', 'low_class_man', 'child', 'free_ride', 'embarked_c', 'travel_alone', 'has_cabin']
X = pd.get_dummies(train[features])
X_test = pd.get_dummies(test[features])

model = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=5, random_state=42)
model.fit(X, y)

scores = cross_val_score(model, X, y, cv=3)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
print("TN {} FP {} FN {} TP {}".format(*confusion_matrix(y, model.predict(X)).ravel()))

predictions = model.predict(X_test)
output = pd.DataFrame({'PassengerId': test.index, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Accuracy: 0.780 (+/- 0.026)
TN 470 FP 79 FN 104 TP 238
Your submission was successfully saved!
