In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [None]:
# train.csv is the labelled data; test.csv is the data without class,
# used only for the submission.
df, testdf = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Bayesian age inference

This approach has two main stages:
* **Age inference:** using all the available data, estimate the age bin of every passenger with an unknown age by applying Bayes' rule over the titles.
* **Survival inference:** once all NaN ages are filled in, apply the JeffD approach so the results are comparable.

---
[Place here the uploads]


### 1st attempt
`RandomForestClassifier(
    max_depth=10, max_features='sqrt', min_samples_leaf=5, n_estimators=9)`  

Mean accuracy: 0.829450686641698  
No upload

---

### 2nd attempt
`RandomForestClassifier(
    criterion='entropy', max_depth=10, max_features='log2',
    min_samples_split=5, n_estimators=9)`
    
Mean accuracy (10 fold): 0.8305617977528088  
After these trainings, the submission scored **.76076**

---

### 3rd attempt

This time I've reduced the `max_depth` options to `[2, 3, 5]` because I suspected overfitting:  
`RandomForestClassifier(
    criterion='entropy', max_depth=5, max_features='log2',
    min_samples_leaf=8, min_samples_split=5, n_estimators=4)`  

This upload scored the best outcome to date **.77751**

---

### 4th attempt

Finally, I opted to give it one final go, just in case.

The overfitting intuition gains weight. Also, it seems that when the notebook is run fresh (for this attempt I restarted the kernel) fewer hyperparameters end up being set.    
`RandomForestClassifier(max_depth=5, max_features='log2', n_estimators=6)`  

Mean accuracy was 0.8193133583021224

It scored better than the previous attempt: **.77990**




# Filling up unknown ages stage

- Join test and train datasets since we are going to use all the available features 
- Get rid of unknown ages
- Clean in a JeffD way.
- Train/test split the data so we can check the accuracy of the predictor
- Build a linear model using a Bayesian approach that predicts the ages
- Measure the accuracy of age inference

## Clean data
We'll write the cleaning steps as functions so we can reuse them for the survival stage.

In [None]:
def age_bins(df):
    """Classify ages in logical bins.

    Missing ages are replaced by the sentinel ``-0.5`` so that they fall in
    the ``'unknown'`` bin ``(-1, 0]``; every other age is mapped to the label
    of the right-closed interval that contains it. Ages outside ``(-1, 120]``
    come out of ``pd.cut`` as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Age`` column (NaN allowed). It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Age`` replaced by a categorical
        column holding the bin labels.
    """
    bins = [-1, 0, 5, 12, 18, 25, 35, 60, 120]
    labels = [
        'unknown', 'baby', 'child', 'teenager', 'student',
        'young adult', 'adult', 'senior']
    # Replace the whole column instead of assigning through
    # ``df.loc[:, 'Age']``: the latter tries to write the string labels
    # *into* the existing float column, which newer pandas versions either
    # upcast to object (with a warning) or reject, so the column would not
    # end up categorical.
    df['Age'] = pd.cut(df['Age'].fillna(-.5), bins, labels=labels)
    return df


def cabins(df):
    """Reduce every cabin code to its first character.

    Missing cabins are filled with ``'N'`` first, so they end up as ``'N'``.
    The ``Cabin`` column of ``df`` is overwritten and ``df`` is returned.
    """
    filled = df.Cabin.fillna('N')
    df.loc[:, 'Cabin'] = filled.map(lambda code: code[0])
    return df


def fares(df):
    """Make the fares categorical.

    Missing fares are replaced by the sentinel ``-0.5`` so that they fall in
    the ``'unknown'`` bin ``(-1, 0]`` (a fare of exactly 0 lands there too).
    Every other fare is mapped to the label of the right-closed interval
    that contains it; fares above 520 come out of ``pd.cut`` as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``Fare`` column (NaN allowed). It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Fare`` replaced by a categorical
        column holding the bin labels.
    """
    bins = [-1, 0, 8, 14, 31, 520]
    labels = ['unknown', '1st', '2nd', '3rd', '4th']
    # Replace the whole column instead of assigning through
    # ``df.loc[:, 'Fare']``: the latter tries to write the string labels
    # *into* the existing float column, which newer pandas versions either
    # upcast to object (with a warning) or reject.
    df['Fare'] = pd.cut(df['Fare'].fillna(-.5), bins, labels=labels)
    return df


def names(df):
    """Normalize names.

    Adds a ``Title`` column holding the text between the first comma and the
    following period of ``Name`` with all spaces removed
    (``'Braund, Mr. Owen Harris'`` -> ``'Mr'``). Two irregular titles are
    folded into regular ones: anything containing ``'Jonkheer'`` becomes
    ``'Mr'`` and anything containing ``'Countess'`` becomes ``'Mrs'``.
    Every other title is kept as extracted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column in ``'Surname, Title. Rest'``
        form. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new ``Title`` column.
    """
    raw_title = df.Name.apply(lambda x: x.split(',')[1].split('.')[0])
    df['Title'] = raw_title.str.replace(' ', '', regex=False)

    # A couple of irregular ones. Boolean masks are used instead of index
    # labels so that only the matching rows are rewritten even when the
    # index contains repeated labels (e.g. after concatenating train and
    # test without resetting the index).
    is_jonkheer = df.Title.str.contains('Jonkheer')
    is_countess = df.Title.str.contains('Countess')
    df.loc[is_jonkheer, 'Title'] = 'Mr'
    df.loc[is_countess, 'Title'] = 'Mrs'  # In her Age group are majority
    return df


def drop_useless_cols(df):
    """Return ``df`` without the columns the models do not use.

    Drops ``Ticket``, ``Embarked``, ``Name`` and ``PassengerId``; the input
    frame itself is left untouched.
    """
    unused = ['Ticket', 'Embarked', 'Name', 'PassengerId']
    return df.drop(columns=unused)

## Visualize people per title
Once the functions are defined, we'll prepare the data to apply Bayes' theorem.

In [None]:
# Join train and test original datasets
# We'll use all the available known ages.
# ignore_index=True gives every passenger a unique row label; without it the
# train and test labels overlap (both start at 0), which makes any later
# label-based assignment ambiguous.
d0 = pd.concat((df.drop(columns='Survived'), testdf), ignore_index=True)

# get rid of nan ages
nan_ages = d0.Age.isna()
# .copy() so the cleaning functions below, which assign columns in place,
# work on an independent frame instead of a filtered slice.
d0 = d0[~nan_ages].copy()

# now transform the data using the methods
d0 = age_bins(d0)
assert (d0.Age == 'unknown').sum() == 0  # Ensure there are no unknown ages
d0 = cabins(d0)
d0 = fares(d0)
d0 = names(d0)
d0 = drop_useless_cols(d0)


# count people within each group and title
# (rows: titles, columns: age bins; 'Pclass' is only used as a column to count)
count_people = d0.pivot_table(
    index='Title', columns='Age', values='Pclass', aggfunc='count').fillna(0)
count_people.columns = count_people.columns.astype(str)

# Get a df with the totals for visualization purposes
visual_df = count_people.copy()
visual_df['total'] = visual_df.sum(axis=1)
visual_df.loc['total'] = visual_df.sum()
visual_df

## Bayes' rule
Update the probability of having certain age given a title. 

In [None]:
# Total number of passengers with a known age
passage = count_people.sum().sum()

# Ages prior: share of all passengers that falls in each age bin
priors = count_people.sum() / passage

# Title likelihoods (age bins as rows, titles as columns). Each title column
# is divided by that title's total, so every column adds up to one.
# NOTE(review): count(title, age) / count(title) is already P(age | title);
# multiplying it by the age prior below weights the prior a second time.
# The arithmetic is kept as is so the results discussed in the text stay
# valid -- confirm whether P(title | age) was intended here.
likelihood = count_people.T / count_people.sum(axis=1)

# Get the posterior
# Combinations that never occur become NaN rather than 0. `.mask` is used
# because a boolean DataFrame is not a valid `.iloc` indexer in recent pandas.
likelihood = likelihood.mask(likelihood == 0)
posterior = priors * likelihood.T

# Since we're not considering all the evidence in the model when calculating
# Bayes, we divide by the sum so all the titles' probabilities add up to one.
posterior = (posterior.T / posterior.sum(axis=1))
posterior.T

*Col* has the same *adult* posterior as *Don*, since there's only one *Col* among all the passengers and therefore its likelihood is 1 (i.e. 100%). This means that its prior remains intact in the posterior. And that makes sense: you're not adding any new information to what you already knew.

Conversely, *Dr* has its likelihood distributed among *student*, *young adult* and *adult* (14% + 14% + 72% = 100%), and this is reflected in its posterior (13%, 14% & 73%). Notice how the different group sizes of *student* and *young adult* (250 vs 281) have a slight effect on the posterior, despite the fact that their likelihoods were the same.

### Plot posteriors

In [None]:
# One bar chart of the age-bin posterior per title, filled row by row on a
# 3x2 grid (the sixth panel stays empty). Dr is drawn semi-transparent.
sns.set()
x = posterior.index
fig, ax = plt.subplots(3, 2, figsize=(15, 9))
bar_kwargs = {'Dr': {'alpha': .5}}
for axis, title in zip(ax.flat, ['Dr', 'Master', 'Miss', 'Mr', 'Mrs']):
    sns.barplot(x=x, y=posterior[title], ax=axis, **bar_kwargs.get(title, {}))

## Classify unknown ages
Once the classification rule for unknown ages is known, let's fill in the predicted age.

In [None]:
# Classification rule: for every title (column of `posterior`), keep the age
# bin (row label) with the highest posterior. The Series is named 'clf'.
clf = posterior.idxmax(axis=0).rename('clf')

def classify_ages(df, clf=clf):
    """
    Classify unknown ages.

    Every row whose ``Age`` is ``'unknown'`` receives the age bin that
    ``clf`` associates with the row's ``Title``; all other rows are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with binned ``Age`` and ``Title`` columns. Modified in place.
    clf : pandas.Series
        Lookup indexed by title whose values are age-bin labels. The default
        is bound when this function is defined.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the unknown ages filled in.

    Raises
    ------
    AssertionError
        If any ``Age`` is missing afterwards, e.g. because a title has no
        entry in ``clf``.
    """
    is_unknown = df.Age == 'unknown'
    if is_unknown.any():
        inferred = df.loc[is_unknown, 'Title'].map(clf)
        # Assign the raw values (no index alignment) through the boolean
        # mask, so the result does not depend on the row labels being unique.
        df.loc[is_unknown, 'Age'] = inferred.values
    missing = df.Age.isna()
    assert not missing.any(), (
        'No age bin could be inferred for titles: {}'.format(
            sorted(df.loc[missing, 'Title'].astype(str).unique())))
    return df

# Apply transformations: run the labelled data through the cleaning steps,
# then fill in its unknown ages. A copy is used so `df` stays as loaded.
train = (
    df.copy()
    .pipe(age_bins)
    .pipe(cabins)
    .pipe(fares)
    .pipe(names)
    .pipe(drop_useless_cols)
    .pipe(classify_ages)
)

# Apply transformations to test data: same steps, in the same order.
test = (
    testdf.copy()
    .pipe(age_bins)
    .pipe(cabins)
    .pipe(fares)
    .pipe(names)
    .pipe(drop_useless_cols)
    .pipe(classify_ages)
)

## Encode JeffD likewise

In [None]:
features = ['Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Title']
combined_df = pd.concat((train[features], test[features]))
for feat in features:
    # Fit on train and test together so that a value present in only one of
    # them still gets a code.
    le = LabelEncoder().fit(combined_df[feat])
    # Replace the column instead of assigning through `.loc[:, feat]`: the
    # latter tries to write the integer codes into the existing
    # string/categorical column, which newer pandas versions either keep as
    # object dtype or reject.
    train[feat] = le.transform(train[feat])
    test[feat] = le.transform(test[feat])

## Splitting up the data

In [None]:
# Features and target for the survival model
X_all = train.drop(columns=['Survived'])
y_all = train.Survived

# 80/20 hold-out split. random_state is fixed so that a fresh
# "Restart & Run All" reproduces the same split (and therefore comparable
# grid-search results and accuracies).
x_train, x_test, y_train, y_test = train_test_split(
    X_all, y_all, train_size=.8, random_state=42)

## Build a Random forest to classify the ages
I tried a bayesian algorithm with normal distribution but it didn't work out.

Now, with the ages filled in, let's predict survival with a random forest classifier, as in JeffD's notebook.

In [None]:
# Choose the type of classifier.
# NOTE: from here on `clf` is the survival model; it rebinds the name that
# previously held the title -> age-bin Series, so the age cells should not be
# re-run after this one without restarting the kernel.
# random_state is fixed so the search picks the same model on every run.
clf = RandomForestClassifier(random_state=42)

# Choose some parameter combinations to try.
# 'auto' is not listed for max_features: for classifiers it was an alias of
# 'sqrt' and it is no longer accepted by recent scikit-learn releases.
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(x_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data.
clf.fit(x_train, y_train)

## Test a single prediction
Use our trained model on the test split to see its performance

In [None]:
# Accuracy of the tuned model on the hold-out split
predictions = clf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=predictions)

## Accuracy test

In [None]:
# 10-fold cross-validation of the tuned model over all the labelled data
kf = KFold(10)
outcomes = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X_all), start=1):
    # Fold-local names on purpose: reusing y_train / y_test here would
    # overwrite the hold-out split, so re-running the hold-out accuracy cell
    # afterwards would score against the wrong labels.
    X_fold_train, y_fold_train = X_all.values[train_idx], y_all.values[train_idx]
    X_fold_test, y_fold_test = X_all.values[test_idx], y_all.values[test_idx]
    clf.fit(X_fold_train, y_fold_train)
    y_hat = clf.predict(X_fold_test)
    accuracy = accuracy_score(y_fold_test, y_hat)
    outcomes.append(accuracy)
    print('Fold {} accuracy: {}'.format(fold, accuracy))

# The loop leaves `clf` trained on the last fold's training rows only. Refit
# it on all the labelled data so later predictions use every sample.
clf.fit(X_all, y_all)
np.mean(outcomes)

In [None]:
# Predict on exactly the feature columns (and column order) used for training
y_hat = clf.predict(test[X_all.columns])

# One row per test passenger: PassengerId as index, predicted class as value.
submission = pd.Series(index=testdf.PassengerId, data=y_hat, name='Survived')
# header=True is explicit so the file always starts with the
# "PassengerId,Survived" header row; older pandas releases did not write a
# header for Series by default.
submission.to_csv('bayesian_age-3rd pass.csv', header=True)