In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import pymc3 as pm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

%matplotlib inline

In [None]:
# Load both CSVs in one pass:
#   df     -> rows from 'train.csv'
#   testdf -> rows from 'test.csv' (the data without class, used for submission)
df, testdf = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Bayesian logistic regression v2

In this approach we'll make a dual model: one for the known ages and the other for the unknown.

The train/test splits use the same ratio as the provided train and test datasets.

---

### 1st attempt
On the submitted data the unknown-age model performed better than the known-age one. In later runs of this model it was the other way around (which also matches my intuition from previous tests).

| mean | known model | unknown model |
|------|-------------|---------------|
| 0.7852028639618138 | 0.7791044776119403 | 0.8095238095238095 |

After submission, it scored **0.76555**, the second best record to date after jd second attempt

---

### 2nd attempt
Increased tuning steps from 3500 to 4000 in the unknown model due to the target acceptance warning

| mean | known model | unknown model |
|------|-------------|---------------|
| 0.7852028639618138 | 0.7940298507462686 | 0.75 |

**What!** the same mean as the first attempt, let's give it another go before submitting:

| mean | known model | unknown model |
|------|-------------|---------------|
| 0.8042959427207638 |  0.8059701492537313 | 0.7976190476190477 |

This second attempt performed well on the training data, but it only scored **0.64593** on Kaggle. Maybe it's overfitting?

---

### 3rd attempt

| mean | known model | unknown model |
|------|-------------|---------------|
| 0.766109785202864 |  0.7791044776119403 | 0.7142857142857143 |

This third attempt performed worse on the training data, but improved on the previous score, reaching **0.68421** on Kaggle. Still overfitting?

---

### 4th attempt
The following day I submitted the last prediction, scoring **0.73684**.

| mean | known model | unknown model |
|------|-------------|---------------|
| 0.7995226730310262 |  0.8 | 0.7976190476190477 |


## Convenience cleaning method

In [None]:
def _survival_odds(df, key):
    """Count outcomes per category of `key` and derive smoothed survival odds.

    Arguments:
        df: dataframe holding `key`, 'Survived' (0/1) and 'Sex' columns.
        key: name of the categorical column to group by.
    returns: a dataframe indexed by category with the count columns 0 and 1
    plus a 'ratio' column equal to (survivors + 1) / (deaths + 1).
    """
    counts = df.pivot_table(
        index=key, columns='Survived', values='Sex', aggfunc='count')
    # Guarantee both outcome columns exist even if the subset has only one.
    counts = counts.reindex(columns=[0, 1]).fillna(0)
    # +1 smoothing keeps the odds finite when a category has no deaths.
    counts['ratio'] = (counts[1] + 1) / (counts[0] + 1)
    return counts


def clean(df, cabins=False, titles=False):
    """Convenience method to clean and prepare the data.

    Works on a copy, so the caller's dataframe is left untouched and the
    function can safely be called again on the same input.

    Arguments:
        df: a pandas dataframe with (at least) the columns Fare, Sex, Cabin,
            Name, Ticket, Embarked and PassengerId. 'Survived' is required
            only when a lookup table has to be computed.
        cabins: lookup dataframe with a 'ratio' column indexed by cabin
            letter, as returned by a previous call. Anything that is not a
            dataframe (the default False) means "compute it from df".
        titles: same as `cabins`, indexed by name title.
    returns: the cleaned dataframe and the cabins & titles lookup tables, so
    they can be reused to encode another dataframe (e.g. the test one).
    """
    df = df.copy()

    # Missing fares get the sentinel value -0.5
    df['Fare'] = df['Fare'].fillna(-.5)

    # Let's make sex a number: female -> 1, anything else -> 0
    df['Sex'] = (df['Sex'] == 'female').astype(int)

    # Simplify cabin names to their first letter ('N' when missing) and
    # encode them by smoothed survival odds
    df['Cabin'] = df['Cabin'].fillna('N').str[0]
    if not isinstance(cabins, pd.DataFrame):
        cabins = _survival_odds(df, 'Cabin')
    # Categories missing from the table get 1.0, which is exactly what the
    # +1 smoothing yields for a category with zero observations.
    df['Cabin'] = df.pop('Cabin').map(cabins['ratio']).fillna(1.0)

    # Get the name titles ("Surname, Title. Given names") and encode them
    # by smoothed survival odds
    raw_titles = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0])
    df['Title'] = raw_titles.str.replace(' ', '')

    # A couple of irregular ones
    df.loc[df['Title'].str.contains('Jonkheer'), 'Title'] = 'Mr'
    # In her Age group 'Mrs' are the majority
    df.loc[df['Title'].str.contains('Countess'), 'Title'] = 'Mrs'

    if not isinstance(titles, pd.DataFrame):
        titles = _survival_odds(df, 'Title')
    df['Title'] = df.pop('Title').map(titles['ratio']).fillna(1.0)

    # Finally, drop some columns
    df = df.drop(columns=['Ticket', 'Embarked', 'Name', 'PassengerId'])

    return df, cabins, titles

## Prepare data and train
We'll build both models in parallel.

In [None]:
### Reproducibility
# One seed for both the train/test split and the MCMC chains, so that
# Restart & Run All reproduces the reported scores instead of a new draw.
SEED = 42

### Prepare the data
nan_ages = df.Age.isna()
kdf, udf = df[~nan_ages].copy(), df[nan_ages].copy()
kdf, cabins, titles = clean(kdf)
# Encode the unknown-age rows with the SAME cabin and title tables as the
# known-age rows. The submission step encodes the test set with these tables,
# so the unknown-age model must be trained on the same encoding.
udf, _, _ = clean(udf, cabins=cabins, titles=titles)
udf = udf.drop(columns='Age')

### Split the data
# Hold out the same fraction as the provided test set represents
test_size = testdf.shape[0] / df.shape[0]
k_train, k_test = train_test_split(
    kdf, test_size=test_size, random_state=SEED)
u_train, u_test = train_test_split(
    udf, test_size=test_size, random_state=SEED)

### Training
# First entry is the response, the rest are predictors
k_feat = [
    'Survived', 'Sex', 'Pclass', 'Fare', 'Cabin', 'Title', 'Age' ]
u_feat = k_feat[:-1]  # same predictors without Age

# Build the formulas from the feature lists so they cannot drift apart
k_model = '{} ~ {}'.format(k_feat[0], ' + '.join(k_feat[1:]))
u_model = '{} ~ {}'.format(u_feat[0], ' + '.join(u_feat[1:]))

with pm.Model() as k_logit:
    pm.glm.GLM.from_formula(
        k_model, k_train[k_feat],
        family=pm.glm.families.Binomial())
    k_trace = pm.sample(
        3000, tune=3500, init='adapt_diag', random_seed=SEED)

with pm.Model() as u_logit:
    pm.glm.GLM.from_formula(
        u_model, u_train[u_feat],
        family=pm.glm.families.Binomial())
    # More tuning steps here due to the target acceptance warning
    u_trace = pm.sample(
        3000, tune=4000, init='adapt_diag', random_seed=SEED)

## Build some convenience dataframes
We'll make three dataframes:
1. Bayesian point estimates, to compare between known and unknown models
2. Linear model, logit, prediction, ground truth for known ages
3. Linear model, logit, prediction, ground truth for unknown ages

In [None]:
k_idx = ['Intercept', ] + k_feat[1:]
u_idx = ['Intercept', ] + u_feat[1:]

# Come up with simple bayesian points for coefficients and intercept:
# the posterior mean of every term in the trace
k_bp = [k_trace[f].mean() for f in k_idx]
u_bp = [u_trace[f].mean() for f in u_idx]

data = {
    'k_data': k_bp,  # coefficients known age
    'u_data': u_bp + [np.nan, ],  # coefficients unknown age (=> as nan)
}

bayesian_points = pd.DataFrame(data=data, index=k_idx)


# logistic function
def logit(x):
    """Map a linear predictor to a probability: 1 / (1 + exp(-x)).

    Despite its name this is the logistic (inverse-logit) function; the name
    is kept because later cells call it.
    """
    return 1 / (1 + np.exp(-x))


def _linear_predictor(frame, terms, coefs):
    """Return the per-row sum of coefs[i] * frame[terms[i]].

    Terms missing from `frame` (the 'Intercept') and missing values are set
    to 1. Columns are reindexed directly rather than through a double
    transpose, and cast to float, so a mixed-dtype frame is never upcast to
    object before reaching np.exp.
    """
    design = frame.reindex(columns=terms).fillna(1).astype(float)
    return (design * np.asarray(coefs)).sum(axis=1)


def _prediction_frame(lm, truth, known_age):
    """Assemble linear model, probability, ground truth and prediction."""
    out = pd.concat((lm, logit(lm)), axis=1, keys=('lm', 'logit',))
    out['y'] = truth
    out['y_hat'] = (out['logit'] > .5).astype(int)  # 1 = predicted survivor
    out['matched'] = out['y'] == out['y_hat']
    out['known_age'] = known_age
    return out


# linear model
k_lm = _linear_predictor(k_test, k_idx, k_bp)
u_lm = _linear_predictor(u_test, u_idx, u_bp)

k_logit, u_logit = [
    logit(x) for x in (k_lm, u_lm)]

# Build the predictions df for known and unknown ages
k_model = _prediction_frame(k_lm, k_test.Survived, True)
u_model = _prediction_frame(u_lm, u_test.Survived, False)

model = pd.concat((k_model, u_model))

## Measure the accuracy

In [None]:
# Boolean mask selecting the rows scored by the known-age model
f = model['known_age'] == True

# Accuracy over every held-out row, then over each sub-model's rows
main, known, unknown = (
    accuracy_score(subset['y'], subset['y_hat'])
    for subset in (model, model[f], model[~f])
)

for label, score in (
        ('Overall accuracy:', main),
        ('Known model:     ', known),
        ('Unknown model:   ', unknown)):
    print(label, score)

## Generate the upload

In [None]:
# Encode the submission data with the lookup tables learnt on training data.
# A copy is passed so that re-running this cell never re-encodes a `testdf`
# that a previous run already modified.
t0, _, _ = clean(testdf.copy(), cabins=cabins, titles=titles)

# Rows WITH an age go to the known-age model and rows WITHOUT one to the
# unknown-age model. `f` flags the missing ages, so known rows are `~f`.
f = t0.Age.isna()
known, unknown = t0[~f], t0[f]

# Linear predictors. Reindexing adds the absent 'Intercept' column, which
# fillna(1) turns into the constant term.
k_lm = (
    known.reindex(columns=k_idx).fillna(1).astype(float) *
    bayesian_points.k_data.values).sum(axis=1)
u_lm = (
    unknown.reindex(columns=u_idx).fillna(1).astype(float) *
    bayesian_points.u_data.values[:-1]).sum(axis=1)  # drop the NaN Age slot

lm = pd.concat((k_lm, u_lm))
survived = logit(lm) > .5
log = survived.astype(int)
log.name = 'Survived'

# Every passenger must receive exactly one prediction
assert len(log) == len(testdf) and log.index.is_unique

# concat aligns on the index, so predictions line up with their PassengerId
pd.concat((testdf.PassengerId, log), axis=1).to_csv(
    'Submissions/05-bayesian-logistic-regression-splitted-4th-pass.csv',
    index=False)