**Index**
* [Tutorial by Alexis Cook](#Tutorial-by-Alexis-Cook)
* [Data exploration](#Data-Exploration)
* [Bayesian approach](#Bayesian-approach)
* [Tutorial by Jeffd23](#Tutorial-approach)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled training data and the unlabelled data used for submissions
df, testdf = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Tutorial by Alexis Cook
(https://www.kaggle.com/alexisbcook/titanic-tutorial)

A plain RandomForest classifier trained on passenger class, sex, siblings/spouses (`SibSp`) and parents/children (`Parch`).

After submission it scored 76% (I've killed/revived the other 24%)

In [None]:
# Target column and the feature columns used by the tutorial model
y = df.Survived
features = ['Pclass', 'Sex', 'SibSp', 'Parch']

# Convert categories into numbers
X = pd.get_dummies(df[features])
X_test = pd.get_dummies(testdf[features])

# get_dummies runs on each frame separately, so a category that is absent
# from one of them would leave the two matrices with different columns.
# Force the test matrix onto the training layout so that predict() always
# receives exactly the columns fit() was given.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Instantiate the classifier, train and predict
model = RandomForestClassifier()
model.fit(X, y)
predictions = model.predict(X_test)

# Write the submission file with PassengerId and Survived columns
pd.DataFrame({
    'PassengerId': testdf.PassengerId,
    'Survived': predictions,
}).to_csv('my_submission.csv', index=False)

# Data Exploration

Plot the raw counts of the main columns to get a rough feel for the data.

In [None]:
sns.set(style='ticks')

# Work on a copy; missing ages are set to -1 so they are still drawn in the
# age histogram instead of being silently left out
d0 = df.copy()
d0['Age'] = d0.Age.fillna(-1)

# The figure handle gets its own name; it used to be overwritten by the
# missing-age mask right after being created
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Columns are passed as `x=` explicitly: newer seaborn releases interpret the
# first positional argument as `data`, not as the variable to count
sns.countplot(x=d0.Survived, ax=axes[0, 0])
sns.countplot(x=d0.Sex, ax=axes[0, 1])
# NOTE(review): distplot is deprecated in recent seaborn versions; it is kept
# here so the histogram binning stays the same - consider sns.histplot
sns.distplot(d0.Age, kde=False, ax=axes[0, 2])
sns.countplot(x=d0.Pclass, ax=axes[1, 0])
sns.countplot(x=d0.SibSp, ax=axes[1, 1])
sns.countplot(x=d0.Parch, ax=axes[1, 2])

plt.show()

# Bayesian approach
Try to predict the survivors using simple rules derived from Bayes' theorem.
Passengers with less than a 50% chance of surviving are marked as dead. After submission, this approach scored 69% accuracy.

First, compute the chances of survival given sex.

In [None]:
# Count passengers per sex, separately for those who died and those who lived
sex_outcome = df[['Sex', 'Survived']]
died = sex_outcome.Survived == 0
dead_counts = sex_outcome[died].groupby('Sex').count()
alive_counts = sex_outcome[~died].groupby('Sex').count()

# One row per sex, columns are (dead count, alive count)
d1 = dead_counts.merge(alive_counts, left_index=True, right_index=True)

# Overall share of dead and of alive passengers
da = d1.sum(axis=0) / d1.sum().sum()

# Unpack the 2x2 count table; the unpacking assumes the rows come out as
# female first, then male
counts = d1.values
(fd, fa), (md, ma) = counts
f, m = counts.sum(axis=1)  # totals by sex

# Survival chances given sex
L_S = pd.DataFrame(
    data={
        'Female': [fa/f, fd/f],
        'Male': [ma/m, md/m],
    },
    index=['Alive', 'Dead'])
L_S

Knowing the survival chances given sex, compute the survival chances given the passenger class (`Pclass`) within each sex.

In [None]:
def _alive_share_by_class(passengers, sex):
    """Return the alive share for the first three Pclass rows of one sex.

    The rows of `passengers` whose Sex equals `sex` are split into dead and
    alive, counted per Pclass, and each count column is divided by its own
    total. For every class row the alive value is then divided by the sum of
    that row.

    NOTE(review): because each column is normalised by its own total first,
    the returned value equals P(class|alive) / (P(class|alive) + P(class|dead))
    within the sex. It is not weighted by how many passengers of that sex
    survived overall - confirm this is the intended quantity.
    """
    subset = passengers.loc[passengers.Sex == sex, ['Survived', 'Pclass']]
    died = subset.Survived == 0
    dead_counts, alive_counts = (
        part.groupby('Pclass').count() for part in (subset[died], subset[~died]))

    # Column 0 holds the dead counts, column 1 the alive counts
    table = pd.merge(dead_counts, alive_counts, left_index=True, right_index=True,
                     suffixes=('_0', '_1'))
    table = table / table.sum()

    return [table.iat[row, 1] / table.iloc[row, :].sum() for row in range(3)]


pf0 = _alive_share_by_class(df, 'female')  # Surviving chances by class being female
pm0 = _alive_share_by_class(df, 'male')  # Surviving chances by class being male

pd.DataFrame({'female': pf0, 'male': pm0})

In [None]:
# Start from "nobody survives", then mark the chosen groups as survivors:
# first-class men, and first- and second-class women
d0 = testdf.copy()
d0['Survived'] = 0

is_first_class_man = (d0.Sex == 'male') & (d0.Pclass == 1)
is_upper_class_woman = (d0.Sex == 'female') & d0.Pclass.isin((1, 2))
f = is_first_class_man | is_upper_class_woman
d0.loc[f, 'Survived'] = 1

# Only the Bayes result is saved here. This cell used to also re-write
# 'my_submission.csv' from the random-forest `predictions`, a copy-paste
# leftover that had nothing to do with this approach.
d0[['PassengerId', 'Survived']].to_csv('bayes.csv', index=False)

# Tutorial approach

Now we'll work through a tutorial on [kaggle](https://www.kaggle.com/jeffd23/scikit-learn-ml-from-start-to-finish) made by jeffd23

After submission it scored 74% (less than the first tutorial, but better than the Bayesian approach). Note that cross-validation gave 83% accuracy.

## First plot some visualizations

In [None]:
# Mean of Survived per Embarked value, split by Sex (';' hides the cell output)
sns.barplot(data=df, x='Embarked', y='Survived', hue='Sex');

In [None]:
# Mean of Survived per Pclass value, split by Sex
sns.barplot(data=df, x='Pclass', y='Survived', hue='Sex')

In [None]:
d0 = df.copy()
t0 = testdf.copy()

# Bin edges and labels are identical for both frames, so define them once
# instead of on every loop iteration
age_bins = [-1, 0, 5, 12, 18, 25, 35, 60, 120]
age_labels = ['unknown', 'baby', 'child', 'teenager', 'student',
              'young adult', 'adult', 'senior']
fare_bins = [-1, 0, 8, 14, 31, 520]
fare_labels = ['unknown', '1st', '2nd', '3rd', '4th']

for dataframe in (d0, t0):
    # Build age bins; missing ages become -0.5 and fall into 'unknown'.
    # The column is replaced outright rather than written through
    # `.loc[:, 'Age']`: it changes from numbers to labels, and writing values
    # of a different type through .loc triggers warnings in newer pandas.
    # `astype(object)` keeps plain string labels in the column.
    ages = dataframe.Age.fillna(-.5)
    dataframe['Age'] = pd.cut(ages, age_bins, labels=age_labels).astype(object)

    # Simplify cabin names to their first character ('N' when missing)
    dataframe['Cabin'] = dataframe.Cabin.fillna('N').str[0]

    # Make fares categorical; missing fares become -0.5 and fall into 'unknown'
    fares = dataframe.Fare.fillna(-.5)
    dataframe['Fare'] = pd.cut(fares, fare_bins, labels=fare_labels).astype(object)

    # Normalize names: keep the text between the comma and the first period,
    # with spaces removed
    titles = dataframe.Name.apply(lambda x: x.split(',')[1].split('.')[0])
    dataframe['Title'] = titles.str.replace(' ', '')

    # A couple of irregular ones (both masks are taken before either rewrite)
    is_jonkheer = dataframe.Title.str.contains('Jonkheer')
    is_countess = dataframe.Title.str.contains('Countess')
    dataframe.loc[is_jonkheer, 'Title'] = 'Mr'
    dataframe.loc[is_countess, 'Title'] = 'Mrs'  # In her Age group are majority

    # Finally, drop some columns
    dataframe.drop(columns=['Ticket', 'Embarked', 'Name', 'PassengerId'], inplace=True)

In [None]:
# Show one randomly chosen row of the engineered training frame
d0.sample(n=1)

## Encode the data
In this step we'll encode the labels as numbers that an ML algorithm can work with. We build a dataframe with all the training and testing features so that the encoder sees every possible value.

In [None]:
from sklearn.preprocessing import LabelEncoder

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Title']

# Fit every encoder on the training and test values together, so transform()
# never meets a label it has not seen
combined_df = pd.concat((d0[features], t0[features]))  # Join all possible values
for feat in features:
    le = LabelEncoder().fit(combined_df[feat])
    # Replace the column outright instead of writing through `.loc[:, feat]`:
    # the integer codes have a different type than the labels they replace,
    # and plain assignment lets the column take the integer dtype
    d0[feat] = le.transform(d0[feat])
    t0[feat] = le.transform(t0[feat])

## Splitting up data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors
y_all = d0['Survived']
X_all = d0.drop('Survived', axis=1)

# 80% of the rows go to the training part of the split
split = train_test_split(X_all, y_all, train_size=.8)
x_train, x_test, y_train, y_test = split

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier.
clf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {
    'n_estimators': [4, 6, 9],
    # 'auto' is no longer in the grid: for a random-forest classifier it was
    # only another name for 'sqrt' (so it tripled nothing but the run time),
    # and newer scikit-learn releases reject it
    'max_features': ['log2', 'sqrt'],
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(x_train, y_train)

# Set the clf to the best combination of parameters. With the default
# refit behaviour GridSearchCV has already fitted this estimator on
# x_train / y_train, so it is not fitted a second time here.
clf = grid_obj.best_estimator_
clf

In [None]:
# Accuracy of the tuned model on the test part of the split
predictions = clf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
from sklearn.model_selection import KFold

# 10-fold validation of the tuned classifier over the full training frame.
# After the loop `clf` remains fitted on the training rows of the last fold.
kf = KFold(10)
outcomes = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_all), start=1):
    # Slice with .iloc rather than .values so the folds keep their column
    # names; fitting on bare arrays drops them, which makes scikit-learn
    # complain when the model is later asked to predict on a DataFrame
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_test, y_test = X_all.iloc[test_idx], y_all.iloc[test_idx]
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_hat)
    outcomes.append(accuracy)
    print('Fold {} accuracy: {}'.format(fold, accuracy))
np.mean(outcomes)

In [None]:
# Prepare the submission: run the classifier over the encoded test frame
y_hat = clf.predict(X=t0)

In [None]:
# Write the submission the same way as the other submission files in this
# notebook: an explicit two-column frame saved without the index. Whether
# Series.to_csv emits a header row depends on the pandas version, while this
# form always produces the 'PassengerId,Survived' header.
pd.DataFrame({
    'PassengerId': testdf.PassengerId,
    'Survived': y_hat,
}).to_csv('jd-go.csv', index=False)