In [None]:
import matplotlib.pyplot as plt, numpy as np, pandas as pd, seaborn as sns

# train-test split and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

# model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score, roc_curve

In [None]:
""" acquire training and testing data """
# Read the training CSV first, then the testing CSV, each into its own DataFrame.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Summary statistics for the training data; include='all' makes describe()
# report on every column rather than only the numeric ones.
train_df.describe(include='all')

In [None]:
# Summary statistics for the testing data; include='all' makes describe()
# report on every column rather than only the numeric ones.
test_df.describe(include='all')

In [None]:
# Peek at the first rows of the training data (head() shows 5 rows by default).
train_df.head()

In [None]:
# Peek at the last rows of the testing data (tail() shows 5 rows by default).
test_df.tail()

In [None]:
""" wrangle, prepare, cleanse the data """
# Honorific -> integer code: 0 Mr, 1 Miss/Mlle/Ms, 2 Mrs/Mme, 3 Master,
# 4 for every rarer title.
title_codes = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Dr': 4, 'Rev': 4, 'Mlle': 1, 'Col': 4, 'Major': 4, 'Countess': 4, 'Don': 4, 'Lady': 4, 'Jonkheer': 4, 'Mme': 2, 'Sir': 4, 'Ms': 1, 'Capt': 4, 'Dona': 4}
rare_title_code = 4

# Imputation values are computed once, from the training set only, and applied
# to both frames so the test data is preprocessed exactly like the data the
# models are fitted on (no statistics are taken from the test set).
age_fill, fare_fill = train_df['Age'].mean(), train_df['Fare'].mean()

# NOTE: this loop overwrites columns of train_df / test_df in place, so it is
# meant to run once per freshly loaded frame.
for df in (train_df, test_df):
    # Title: the alphabetic word that sits between a space and a '.' in Name.
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # A title missing from the mapping (or a name with no title) would map to
    # NaN and make astype(int) raise; treat it as a rare title instead.
    df['Title'] = titles.map(title_codes).fillna(rare_title_code).astype(int)
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)
    df['Age'] = df['Age'].fillna(age_fill)
    # SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
    df['Family'] = df['SibSp'] + df['Parch'] + 1
    # 1 when the ticket string contains at least one letter, else 0.
    df['Ticket'] = df['Ticket'].str.extract(r'([A-Za-z]+)', expand=False).notna().astype(int)
    df['Fare'] = df['Fare'].fillna(fare_fill)
    # 1 when a cabin value is recorded, 0 when it is missing.
    df['Cabin'] = df['Cabin'].notna().astype(int)
    # Missing ports are filled with 'S' before encoding S/C/Q as 0/1/2.
    df['Embarked'] = df['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [None]:
# Frequency of each Embarked code in the training data. By this point the
# column is integer-coded by the wrangling cell (0 = S, 1 = C, 2 = Q).
train_df['Embarked'].value_counts()

In [None]:
# Frequency of each Embarked code in the testing data. By this point the
# column is integer-coded by the wrangling cell (0 = S, 1 = C, 2 = Q).
test_df['Embarked'].value_counts()

In [None]:
""" analyze, identify patterns, and explore the data """
# Feature matrix: drop the identifier, the label, the raw name, and the two
# counts that were already combined into 'Family'. The label is kept apart.
x = train_df.drop(columns=['PassengerId', 'Survived', 'Name', 'SibSp', 'Parch'])
y = train_df['Survived']

# Scatter plot for every pair of features, coloured by the Survived label.
grd = (
    sns.PairGrid(x.assign(Survived=y), hue='Survived', palette="Oranges")
    .map(plt.scatter)
    .add_legend()
)

In [None]:
""" model, predict and solve the problem """

# train-test split (75% train / 25% validation, fixed seed for reproducibility)
x_train, x_validate, y_train, y_validate = train_test_split(x, y, random_state=0, test_size=0.25)


def _fit_and_score(label, estimator):
    """Fit one classifier on the training split and score it on the validation split.

    Parameters
    ----------
    label : str
        Human-readable model name used as the prefix of the printed report line.
    estimator : scikit-learn classifier
        Unfitted estimator exposing ``fit``, ``predict`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(fitted_estimator, validation_predictions, f1, auc)``.
    """
    fitted = estimator.fit(x_train, y_train)
    ypredict = fitted.predict(x_validate)
    # ROC AUC measures ranking quality, so it must be given a continuous score
    # (probability of the positive class), not the thresholded 0/1 labels.
    yscore = fitted.predict_proba(x_validate)[:, 1]
    f1score, auc = f1_score(y_validate, ypredict), roc_auc_score(y_validate, yscore)
    print('%s\t f1 score: %f, auc: %f' % (label, f1score, auc))
    return fitted, ypredict, f1score, auc


# model
logreg, logreg_ypredict, logreg_f1score, logreg_auc = _fit_and_score(
    'logistic regression',
    LogisticRegression(),
)

# random_state is fixed like the other seeded models so the fitted tree is
# reproducible from run to run.
treeclf, treeclf_ypredict, treeclf_f1score, treeclf_auc = _fit_and_score(
    'decision tree classifier',
    DecisionTreeClassifier(max_depth=8, min_samples_leaf=5, random_state=0, splitter='best'),
)

forestclf, forestclf_ypredict, forestclf_f1score, forestclf_auc = _fit_and_score(
    'random forest classifier',
    RandomForestClassifier(max_depth=20, min_samples_leaf=2, n_estimators=250, random_state=0),
)

nnclf, nnclf_ypredict, nnclf_f1score, nnclf_auc = _fit_and_score(
    'neural network classifier',
    MLPClassifier(activation='logistic', hidden_layer_sizes=[20, 10], max_iter=9999, random_state=0),
)

In [None]:
""" visualize, report, and present the problem solving steps and final solution """
# Alias the fitted random forest as `model`, the estimator used for the final
# predictions on the test set.
model = forestclf

In [None]:
""" supply or submit the results """
# Build the test feature matrix by removing the identifier, the raw name and
# the two counts already combined into 'Family', then predict with `model`.
x_test = test_df.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch'])
y_test = model.predict(x_test)

# Two-column submission frame (PassengerId, Survived), written without the index.
out = test_df[['PassengerId']].assign(Survived=y_test)
out.to_csv('submission.csv', index=False)