**Index**
* [Tutorial by Alexis Cook](#Tutorial-by-Alexis-Cook)
* [Data exploration](#Data-Exploration)
* [Bayesian approach](#Bayesian-approach)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Read the two CSV files in one go: train.csv ends up in df and test.csv
# (the data without class, used for the submission) in testdf.
df, testdf = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## Tutorial by Alexis Cook
[Titanic tutorial on Kaggle](https://www.kaggle.com/alexisbcook/titanic-tutorial)

Just a RandomForest classifier trained on passenger class (Pclass), sex, number of siblings/spouses aboard (SibSp) and number of parents/children aboard (Parch).

After submission it scored 76% (I've killed/revived the other 24%)

In [None]:
# Target column and the four columns used as predictors
y = df.Survived
features = ['Pclass', 'Sex', 'SibSp', 'Parch']

# Convert categories into numbers
X = pd.get_dummies(df[features])
# The test set is encoded on its own, so force it onto the training columns:
# a category absent from test.csv becomes an all-zero column and a category
# never seen during training is dropped, keeping predict() in line with fit().
X_test = pd.get_dummies(testdf[features]).reindex(columns=X.columns, fill_value=0)

# Instantiate the classifier, train and predict.
# A fixed random_state makes the forest, and therefore the submission file,
# reproducible across "Restart & Run All".
model = RandomForestClassifier(random_state=0)
model.fit(X, y)
predictions = model.predict(X_test)

# Write the submission: one row per test passenger
submission = pd.DataFrame({'PassengerId': testdf.PassengerId, 'Survived': predictions})
submission.to_csv('my_submission.csv', index=False)

## Data Exploration

Visualize the rough distribution of the main columns.

In [None]:
sns.set(style='ticks')

# Work on a copy so the training frame itself stays untouched.
# Missing ages are flagged with -1 so they show up as their own bar
# instead of being silently left out of the histogram.
explore_df = df.copy()
explore_df['Age'] = explore_df['Age'].fillna(-1)

# Keep the figure handle under its own name (it must not be reused as a
# scratch variable for the rows with a missing age).
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Column names are passed through the x= / data= keywords: recent seaborn
# releases no longer accept the series as a positional argument.
sns.countplot(x='Survived', data=explore_df, ax=axes[0, 0])
sns.countplot(x='Sex', data=explore_df, ax=axes[0, 1])

# Plain count histogram of the ages (Freedman-Diaconis bin width), drawn with
# matplotlib directly so it does not rely on the deprecated sns.distplot.
axes[0, 2].hist(explore_df.Age, bins='fd')
axes[0, 2].set(xlabel='Age', ylabel='count')

sns.countplot(x='Pclass', data=explore_df, ax=axes[1, 0])
sns.countplot(x='SibSp', data=explore_df, ax=axes[1, 1])
sns.countplot(x='Parch', data=explore_df, ax=axes[1, 2])

plt.show()

## Bayesian approach

Predict that passengers with less than a 50% chance of surviving died. After submitting, this approach gave 69% accuracy.

Compute the chance of surviving given the passenger's sex.

In [None]:
# Head count per sex, taken separately over the dead and over the survivors
sex_status = df[['Sex', 'Survived']]
is_alive = sex_status.Survived != 0
dead_by_sex = sex_status[~is_alive].groupby('Sex').count()
alive_by_sex = sex_status[is_alive].groupby('Sex').count()

# One row per sex; the merge names the columns Survived_x (dead) and
# Survived_y (alive)
counts = pd.merge(dead_by_sex, alive_by_sex, left_index=True, right_index=True)

# Dead/alive percents over all passengers
da = counts.sum() / counts.values.sum()

# The unpacking expects exactly two rows, female first and male second
# (groupby sorts its keys), each holding (dead, alive) counts.
(female_dead, female_alive), (male_dead, male_alive) = counts.values
female_total = female_dead + female_alive
male_total = male_dead + male_alive

# Live chances given sex: one column per sex, one row per outcome
L_S = pd.DataFrame(
    {
        'Female': [female_alive / female_total, female_dead / female_total],
        'Male': [male_alive / male_total, male_dead / male_total],
    },
    index=['Alive', 'Dead'],
)
L_S

Knowing the survival chances given sex, calculate the survival chances given the passenger class (Pclass) within each sex.

In [None]:
def survival_chance_by_class(passengers, sex):
    """Return the observed survival chance per passenger class for one sex.

    Parameters
    ----------
    passengers : pandas.DataFrame
        Frame with at least the columns 'Sex', 'Pclass' and 'Survived'.
        A 'Survived' value of 0 means dead; anything else counts as alive.
    sex : str
        Value of the 'Sex' column to keep, e.g. 'female' or 'male'.

    Returns
    -------
    pandas.Series
        Indexed by 'Pclass' (ascending); each value is
        alive / (alive + dead) among the passengers of that sex and class.
    """
    of_sex = passengers.loc[passengers.Sex == sex, ['Survived', 'Pclass']]
    alive = of_sex.Survived != 0
    # The mean of the alive flag is alive / (alive + dead) taken on the raw
    # counts. The dead and alive counts must NOT be rescaled to sum to one
    # each beforehand: that discards how likely survival is for this sex and
    # yields a likelihood ratio rather than a survival chance.
    return alive.groupby(of_sex.Pclass).mean()


female_chances = survival_chance_by_class(df, 'female')
male_chances = survival_chance_by_class(df, 'male')

pf0 = female_chances.tolist()  # Surviving chances by class being female
pm0 = male_chances.tolist()  # Surviving chances by class being male

# Rows are labelled by passenger class
pd.DataFrame({'female': female_chances, 'male': male_chances})

In [None]:
# Predict the test passengers from the per-class survival chances held in
# pf0 (female) and pm0 (male): every (sex, class) group whose chance is at
# least 50% is set as survived, everybody else as dead.
bayes_submission = testdf.copy()
bayes_submission['Survived'] = 0

# pf0 / pm0 hold one chance per passenger class, in the order 1, 2, 3
for sex, chances in (('female', pf0), ('male', pm0)):
    for pclass, chance in zip((1, 2, 3), chances):
        if chance >= 0.5:
            revived = (bayes_submission.Sex == sex) & (bayes_submission.Pclass == pclass)
            bayes_submission.loc[revived, 'Survived'] = 1

# Only this approach's own file is written here
bayes_submission[['PassengerId', 'Survived']].to_csv('bayes.csv', index=False)