In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [27]:
# Read train.csv and test.csv from ../data into two DataFrames.
df_train, df_test = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

### Data cleanup

In [28]:
# Median 'Age' for every (Sex, Pclass) combination, computed from the training
# frame only and reused below for both frames.
df_median_age = (
    df_train.groupby(['Sex', 'Pclass'])['Age']
    .median()
    .reset_index()
    .rename(columns={'Age': 'Median_Age'})
)

# Attach the group median as a temporary column, use it to fill missing ages
# (fillna aligns on the row index), then remove the temporary column again.
df_train = (
    df_train.merge(df_median_age, on=['Sex', 'Pclass'], how='left')
    .assign(Age=lambda frame: frame['Age'].fillna(frame['Median_Age']))
    .drop(columns='Median_Age')
)

df_test = (
    df_test.merge(df_median_age, on=['Sex', 'Pclass'], how='left')
    .assign(Age=lambda frame: frame['Age'].fillna(frame['Median_Age']))
    .drop(columns='Median_Age')
)

In [29]:
# Derive two features from the raw 'Cabin' string in both frames.
# Rows with a missing 'Cabin' are excluded by dropna(); because column
# assignment aligns on the index, those rows end up as NaN in the new columns.
for frame in (df_train, df_test):
    cabin_present = frame['Cabin'].dropna()
    # First character of the cabin string.
    frame['Cabin_Type'] = cabin_present.map(lambda cabin: cabin[0])
    # Number of whitespace-separated entries in the cabin string.
    frame['Cabin_Nos'] = cabin_present.map(lambda cabin: len(cabin.split()))

# The raw column is no longer needed once the two features exist.
df_train, df_test = df_train.drop(columns='Cabin'), df_test.drop(columns='Cabin')

In [30]:
# One-hot encode 'Sex'. The indicator columns keep the raw category values as
# their names (no prefix); the original column is removed afterwards.
df_train = pd.concat(
    [df_train, pd.get_dummies(df_train['Sex'])], axis=1
).drop(columns=['Sex'])

df_test = pd.concat(
    [df_test, pd.get_dummies(df_test['Sex'])], axis=1
).drop(columns=['Sex'])

In [31]:
# One-hot encode 'Embarked'. Missing values are replaced by 'S' before
# encoding, and every indicator column is named 'Embarked_<value>'.
train_embarked = pd.get_dummies(df_train['Embarked'].fillna('S')).add_prefix('Embarked_')
test_embarked = pd.get_dummies(df_test['Embarked'].fillna('S')).add_prefix('Embarked_')

# Swap the raw column for its indicator columns (appended at the end).
df_train = pd.concat([df_train.drop(columns=['Embarked']), train_embarked], axis=1)
df_test = pd.concat([df_test.drop(columns=['Embarked']), test_embarked], axis=1)

In [32]:
# One-hot encode 'Cabin_Type' into 'Cabin_Type_<value>' indicator columns.
# get_dummies creates no column for NaN by default, so rows without a cabin
# type get 0 in every indicator.
train_cabin_type = pd.get_dummies(df_train['Cabin_Type']).add_prefix('Cabin_Type_')
test_cabin_type = pd.get_dummies(df_test['Cabin_Type']).add_prefix('Cabin_Type_')

# Each frame is encoded independently, so the two frames only share the
# indicator columns whose value occurs in both.
df_train = pd.concat([df_train.drop(columns=['Cabin_Type']), train_cabin_type], axis=1)
df_test = pd.concat([df_test.drop(columns=['Cabin_Type']), test_cabin_type], axis=1)

In [33]:
# Median 'Fare' per 'Pclass', taken from the training frame and applied to
# both frames.
class_fare_dict = df_train.groupby('Pclass')['Fare'].median().to_dict()

for frame in (df_train, df_test):
    # Look up the class median for every row; an unknown class raises KeyError.
    class_median_fare = frame['Pclass'].map(lambda pclass: class_fare_dict[pclass])
    # Only rows whose 'Fare' is missing receive the class median.
    frame['Fare'] = frame['Fare'].fillna(class_median_fare)

In [34]:
# One-hot encode 'Pclass' into 'Pclass_<value>' indicator columns and drop
# the original column.
train_pclass = pd.get_dummies(df_train['Pclass']).add_prefix('Pclass_')
test_pclass = pd.get_dummies(df_test['Pclass']).add_prefix('Pclass_')

df_train = pd.concat([df_train.drop(columns=['Pclass']), train_pclass], axis=1)
df_test = pd.concat([df_test.drop(columns=['Pclass']), test_pclass], axis=1)

In [35]:
# Rows that had no 'Cabin' value have NaN in 'Cabin_Nos'; replace it with 1.
for frame in (df_train, df_test):
    frame['Cabin_Nos'] = frame['Cabin_Nos'].fillna(1)

In [36]:
# Align the test frame with the training frame: every column present in
# df_train but absent from df_test (other than the 'Survived' target) is added
# to df_test as an all-zero column. This replaces a hard-coded
# `df_test['Cabin_Type_T'] = 0`, so the same cell also covers any other
# one-hot level that occurs only in the training data.
# NOTE(review): zero is the right fill only for indicator columns; presumably
# that is the only kind of column that can be missing here -- confirm if new
# features are added upstream.
missing_in_test = [
    col for col in df_train.columns
    if col != 'Survived' and col not in df_test.columns
]
for col in missing_in_test:
    df_test[col] = 0

In [51]:
# Model inputs: every training column except the identifier, the target, the
# free-text columns and 'Age'. list.remove raises ValueError if a name is absent.
inp_col_list = df_train.columns.values.tolist()
for excluded in ('PassengerId', 'Survived', 'Name', 'Ticket', 'Age'):
    inp_col_list.remove(excluded)

In [52]:
# Feature matrix and target taken from the training frame.
X = df_train[inp_col_list]
Y = df_train['Survived']

# Split BEFORE scaling so the held-out rows do not contribute to the scaler's
# min/max statistics (fitting the scaler on all rows first leaks information
# from the validation split into preprocessing).
X_fit, X_holdout, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Fit the scaler on the training part only, then apply the same mapping to the
# held-out part. Held-out values may fall slightly outside [0, 1].
# The frames keep the row index of the split and use positional column labels.
min_max_scaler = preprocessing.MinMaxScaler()
x_train = pd.DataFrame(min_max_scaler.fit_transform(X_fit.values), index=X_fit.index)
x_test = pd.DataFrame(min_max_scaler.transform(X_holdout.values), index=X_holdout.index)

# Full training matrix under the same fitted scaling, kept under the
# original names for any later use.
x = X.values
x_scaled = min_max_scaler.transform(x)
X_scaled = pd.DataFrame(x_scaled)

In [59]:
# Random forest hyper-parameters; random_state pins the forest's randomness.
forest_params = dict(max_depth=6, min_samples_split=5, random_state=42)
rfc = RandomForestClassifier(**forest_params)
# Last expression of the cell: fit() returns the estimator, whose repr is shown.
rfc.fit(x_train, y_train)

RandomForestClassifier(max_depth=6, min_samples_split=5, random_state=42)

In [60]:
# Mean accuracy of the fitted forest on the held-out split.
score = rfc.score(X=x_test, y=y_test)
print(score)

0.8156424581005587


In [61]:
# Build the test feature matrix with the same columns, in the same order, as
# the training features.
X = df_test[inp_col_list]
x = X.values
# Reuse the scaler that was already fitted on the training data. Fitting a
# fresh MinMaxScaler on the test set would map test features with different
# min/max values than the ones the model was trained on, so identical raw
# values would reach the forest as different numbers.
x_scaled = min_max_scaler.transform(x)
X_scaled = pd.DataFrame(x_scaled)

In [62]:
# Class predictions of the fitted forest for the scaled test features.
predictions = rfc.predict(X=X_scaled)

In [63]:
# Store the predictions next to the test rows (positional, one per row).
df_test = df_test.assign(Survived=predictions)

In [64]:
# Write the two submission columns to CSV without the row index.
submission = df_test[['PassengerId', 'Survived']]
submission.to_csv('../submissions/random_forest_4_submission.csv', index=False)