In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the Titanic training CSV (path is relative to the working directory)
# and preview the first rows.
titanic_df = pd.read_csv('data/titanic_train.csv')
titanic_df.head()

In [None]:
# Load the "syntanic" CSV, which is used further down as the validation frame,
# and preview the first rows.
syntanic_df = pd.read_csv('data/syntanic_train.csv')
syntanic_df.head()

In [None]:
# Count missing values per column of titanic_df.
titanic_df.isnull().sum()

In [None]:
def _deck_letter(cabin_series):
    """Reduce each cabin string to its upper-cased first character; anything else becomes NaN."""
    return cabin_series.apply(
        lambda cabin: cabin[0].upper() if isinstance(cabin, str) and cabin else np.nan
    )


def _one_hot(values, prefix, reference):
    """One-hot encode ``values`` with one column per distinct non-null value of ``reference``."""
    categories = sorted(reference.dropna().unique())
    # A fixed categorical dtype makes get_dummies emit every reference category,
    # including ones absent from ``values``; unseen values become all-zero rows.
    encoded = values.astype(pd.CategoricalDtype(categories))
    return pd.get_dummies(encoded, prefix=prefix, prefix_sep='_')


def transform(df, train_df=None):
    """Turn a raw passenger frame into a numeric feature frame.

    Neither ``df`` nor ``train_df`` is modified, so the function can be called
    repeatedly on the same raw frames and always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns PassengerId, Name, Ticket, SibSp,
        Parch, Pclass, Sex, Age, Cabin, Embarked and Fare.
    train_df : pandas.DataFrame, optional
        Raw reference frame. When given, the imputation values (mean Age,
        mean Fare, most common cabin letter, most common Embarked) and the set
        of one-hot columns are taken from it instead of from ``df``, so the
        result has the same dummy columns as ``transform(train_df)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``family_members``, integer ``Sex`` (1 = male) and
        ``Age``, imputed ``Fare``, one-hot ``Pclass_*``, ``Cabin_*`` and
        ``Embarked_*`` columns, and the raw identifier/categorical columns
        dropped. Any other input column is passed through unchanged.
    """
    # Work on a copy: assigning columns on the argument would alter the caller's frame.
    df = df.copy()
    reference = df if train_df is None else train_df

    df['family_members'] = df['SibSp'] + df['Parch']

    # Pclass
    pclass_dummies = _one_hot(df['Pclass'], 'Pclass', reference['Pclass'])

    # Sex
    df['Sex'] = df['Sex'].apply(lambda sex: sex.lower() == 'male').astype('int')

    # Age: impute with the reference mean, then truncate to whole years.
    df['Age'] = df['Age'].fillna(reference['Age'].mean()).astype('int')

    # Cabin: keep only the first letter; the reference letters are computed on a
    # separate Series so the reference frame itself is left untouched.
    reference_decks = _deck_letter(reference['Cabin'])
    most_common_cabin = reference_decks.value_counts().index[0]
    df['Cabin'] = _deck_letter(df['Cabin']).fillna(most_common_cabin)
    cabin_dummies = _one_hot(df['Cabin'], 'Cabin', reference_decks)

    # Embarked
    most_common_embarkment = reference['Embarked'].value_counts().index[0]
    df['Embarked'] = df['Embarked'].fillna(most_common_embarkment)
    embarked_dummies = _one_hot(df['Embarked'], 'Embarked', reference['Embarked'])

    # Fare
    df['Fare'] = df['Fare'].fillna(reference['Fare'].mean())

    # Append the dummy blocks in a fixed order, then drop the raw columns they replace.
    df = pd.concat([df, pclass_dummies, cabin_dummies, embarked_dummies], axis='columns')
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Pclass', 'Cabin', 'Embarked'], axis='columns')
    return df

In [None]:
# Build the training feature frame. No reference frame is passed, so the
# imputation values are derived from titanic_df itself.
train_df = transform(titanic_df)
train_df.head()

In [None]:
# Build the validation feature frame, passing titanic_df as the reference so
# the imputation values come from the training data rather than from syntanic_df.
validate_df = transform(syntanic_df, titanic_df)
validate_df.head()

In [None]:
# Count remaining missing values per column of the transformed validation frame.
validate_df.isna().sum()

In [None]:
# Fit a random forest on the training features and store its positive-class
# probability for every validation row in Prediction_Survived.
# random_state is pinned so a fresh "Restart & Run All" rebuilds the same forest.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(train_df.drop('Survived', axis=1), train_df['Survived'])
# Besides the label, drop a Prediction_Survived column left over from an earlier
# run of this cell (or of the logistic-regression cell); otherwise a re-run would
# pass the previous predictions to the model as an extra feature.
validate_df['Prediction_Survived'] = rf_clf.predict_proba(
    validate_df.drop(columns='Survived').drop(columns='Prediction_Survived', errors='ignore')
)[:, 1]
validate_df.head()

In [None]:
# Persist the validation frame, including the Prediction_Survived column written
# by the random-forest cell, as a pickle file.
validate_df.to_pickle('data/rf_predictions.pkl')

In [None]:
# Fit a logistic regression on the same training features and overwrite
# Prediction_Survived with its positive-class probability.
# max_iter is raised above the default of 100 because the features are not
# scaled in this notebook, which can stop the solver before it converges.
# NOTE(review): confirm on this data that the extra iterations are needed.
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(train_df.drop('Survived', axis=1), train_df['Survived'])
# errors='ignore' keeps this cell runnable when Prediction_Survived is not present
# yet (e.g. the random-forest cell was skipped); the label column must still exist.
validate_df['Prediction_Survived'] = lr_clf.predict_proba(
    validate_df.drop(columns='Survived').drop(columns='Prediction_Survived', errors='ignore')
)[:, 1]
validate_df.head()

In [None]:
# Persist the validation frame, whose Prediction_Survived column now holds the
# logistic-regression probabilities, as a pickle file.
validate_df.to_pickle('data/lr_predictions.pkl')

In [None]:
# np.percentile(validate_df['Prediction_Survived'], np.array([i*5 for i in range(20)]))

In [None]:
# def calculate_ks_scores(df, threshold=0.5):
#     df = df.copy()
#     df['Ventile'] = pd.qcut(df['Prediction_Survived'], 20, labels=False)
#     df['Prediction_Survived'] = (df['Prediction_Survived'] > threshold).astype('int')
#     df = df.groupby('Ventile')['Prediction_Survived'].agg(['count', 'sum'])
#     df.columns = ['total', 'target']
#     df['non_target'] = df['total'] - df['target']
#     df['perc_total_cum'] = df['total'].cumsum() / df['total'].sum()
#     df['perc_target_cum'] = df['target'].cumsum() / df['target'].sum()
#     df['perc_non_target_cum'] = df['non_target'].cumsum() / df['non_target'].sum()
#     df['ks'] = (df['perc_target_cum'] - df['perc_non_target_cum']).apply(abs)
#     return df

In [None]:
# calculate_ks_scores(validate_df)