In [1]:
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

In [2]:
titanic = pd.read_csv("titanic.csv")
titanic_train, titanic_test = train_test_split(titanic, test_size=0.1)

def table_lookup_predictor(x, table, age):
    """Implements the table-lookup algorithm with ages after cufoff"""
    
    # Get most common label
    default = table.Survived.value_counts().argmax()
    # Get similar individuals
    similar_tab = table.loc[(table["Pclass"] == x["Pclass"]) &\
                            (table["Sex"] == x["Sex"]) &\
                            (table["Siblings/Spouses Aboard"] == x["Siblings/Spouses Aboard"]) &\
                            (table["Parents/Children Aboard"] == x["Parents/Children Aboard"]) &\
                            ((table["Age"] < age) == (x["Age"] < age)) , "Survived"]
    if len(similar_tab) == 0:
        # If table is empty (no "similar" individuals), guess the most common label
        return default
    else:
        return similar_tab.value_counts().argmax()

actual = titanic_test.Survived
predicted = titanic_test.apply(table_lookup_predictor, 1, table=titanic_train, age=10)

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.


In [3]:
DataFrame({"actual": actual, "predicted": predicted})

Unnamed: 0,actual,predicted
717,0,0
861,1,1
283,0,0
609,1,1
665,0,0
458,0,0
30,0,0
496,0,0
468,0,0
623,0,0


In [4]:
from sklearn.metrics import accuracy_score

In [5]:
accuracy_score(y_true=actual, y_pred=predicted)

0.8089887640449438

In [6]:
from sklearn.metrics import precision_score

In [7]:
precision_score(y_true=actual, y_pred=predicted)

0.8571428571428571

In [8]:
precision_score(y_true=actual, y_pred=predicted, pos_label=0)

0.7941176470588235

In [9]:
from sklearn.metrics import recall_score

In [10]:
recall_score(y_true=actual, y_pred=predicted)

0.5625

In [11]:
recall_score(y_true=actual, y_pred=predicted, pos_label=0)

0.9473684210526315

In [12]:
from sklearn.metrics import f1_score

In [13]:
f1_score(y_true=actual, y_pred=predicted)

0.6792452830188678

In [14]:
# Again, depends on the label of interest
f1_score(y_true=actual, y_pred=predicted, pos_label=0)

0.864

In [15]:
from sklearn.metrics  import classification_report

In [16]:
print(classification_report(y_true=actual, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.79      0.95      0.86        57
           1       0.86      0.56      0.68        32

    accuracy                           0.81        89
   macro avg       0.83      0.75      0.77        89
weighted avg       0.82      0.81      0.80        89



### Bayes Factor

In [17]:
N = len(actual)    # Total sample size
M = (actual == predicted).sum()    # A shorthand for computing the number of "successes
(M, N)

(72, 89)

In [18]:
post_params_lookup = (3 + M, 3 + N - M)
post_params_lookup

(75, 20)

In [19]:
ds = pd.Series(actual).value_counts()
ds

0    57
1    32
Name: Survived, dtype: int64

In [20]:
from scipy.stats import beta

In [21]:
N = 10000    # Number of simulations
p_1 = beta.rvs(67, 28, size=N)
p_2 = beta.rvs(64, 31, size=N)
trial = p_1 > p_2

pm1 = trial.mean()
pm2 = 1 - pm1
(pm1, pm2)

(0.6786, 0.3214)

In [22]:
K = pm1 / pm2
K

2.111387678904791