## Model selection

### Topics

Out of scope: [feature engineering](https://en.wikipedia.org/wiki/Feature_engineering); we assume we already have good enough features.

1. Separate out training and validation sets from available data.

1. For each of the classification model families of logistic regression, decision trees, random forests and causal random forests:

* fit the training data 

In [None]:
import os
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score

from scipy.stats import pearsonr

from fake_data_for_learning.contingency_tables import calculate_contingency_table

from risk_learning.simpson import compute_margin

In [None]:
# Input files live in a `data` folder below the current working directory.
data_dir = Path.cwd() / 'data'

file_read_params = {'delimiter': ','}
target_col = 'default'
non_target_cols = ['gender', 'occupation', 'activity']
# For a single-feature experiment, use instead: non_target_cols = ['gender']

# Load only the feature columns and the target column.
default = pd.read_csv(
    data_dir / 'default.csv',
    usecols=[*non_target_cols, target_col],
    **file_read_params,
)
print(f'Shape of entire dataset: {default.shape}')
default.head()

## Split data for model fitting

In [None]:
# One seed is shared by both splits so that the partitions are reproducible.
seed = 42
# Fraction of all rows used for training.
train_ratio = 0.6
# Fraction of the non-training remainder used for validation.
validate_ratio_after_train_split = 0.5

# First split: training rows versus everything else.
X_train, X_validate_test, y_train, y_validate_test = train_test_split(
    default[non_target_cols],
    default[target_col],
    train_size=train_ratio,
    random_state=seed,
)
print(f'Shape of training fields: {X_train.shape},\nShape of target: {y_train.shape}')

# Second split: keep the validation part of the remainder; the other part
# is discarded here (bound to `_`).
X_validate, _, y_validate, _ = train_test_split(
    X_validate_test,
    y_validate_test,
    train_size=validate_ratio_after_train_split,
    random_state=seed,
)
print(f'\nShape of validation fields: {X_validate.shape},\nShape of target: {y_validate.shape}')

### A little Exploratory Data Analysis

Very little.

In [None]:
# Admissible values per column; these can be guessed from the data or
# extracted from the IT system.
default_values = [0, 1]
gender_values = [0, 1]
occupation_values = [0, 1]
activity_values = [0, 1]

# One ordered categorical dtype per column, built from the value ranges above.
data_categories = {
    name: CategoricalDtype(categories=values, ordered=True)
    for name, values in (
        ('default', default_values),
        ('gender', gender_values),
        ('occupation', occupation_values),
        ('activity', activity_values),
    )
}

# Recombine training features and target into one frame and cast every
# column to its categorical dtype in a single step.
X_y_train = X_train.assign(default=y_train.values).astype(data_categories)

contingency_table = calculate_contingency_table(X_y_train[list(data_categories)])

In [None]:
def _default_rate(gender):
    """Return the (default=1, gender) margin divided by the gender margin.

    Both margins are taken from the module-level ``contingency_table``.
    """
    defaulted = compute_margin(contingency_table, dict(default=1, gender=gender))
    group_total = compute_margin(contingency_table, dict(gender=gender))
    return defaulted / group_total


# Gender code 0 is reported as female, code 1 as male.
female_default_rate = _default_rate(0)
print('\nfemale default rate', female_default_rate)
male_default_rate = _default_rate(1)
print('male default rate', male_default_rate)

In [None]:
# Column pairs whose Pearson correlation is reported on the training data.
pairs = [('gender', 'default'), ('activity', 'default'), ('occupation', 'activity')]
for pair in pairs:
    first, second = pair
    correlation = pearsonr(X_y_train[first], X_y_train[second])
    print(f'Correlation of {pair}: {correlation}')

## Fit models to training data and spot-check on validation data

In [None]:
# Small sample of the validation set for eyeballing predictions.
first_n = 5
idxes_first_n = X_validate.index[:first_n]
X_validate_first_n = X_validate.loc[idxes_first_n]
# Positional slice: the first `first_n` targets, matching the rows above.
y_validate_first_n = y_validate.iloc[:first_n]
# Features and target side by side (aligned on the shared index).
sample_validate = X_validate_first_n.assign(default=y_validate_first_n)
sample_validate

## Logistic regression

In [None]:
# Fit a logistic regression with the chosen regularisation parameter C,
# then show class probabilities for the validation sample.
hyperparams = dict(C=0.5)
clf = LogisticRegression(**hyperparams).fit(X_train, y_train)
clf.predict_proba(X_validate_first_n)

In [None]:
# The decision threshold is the mean predicted probability of default=1
# rather than the usual 0.5.
# NOTE: picking a good decision threshold deserves more care than it gets here.
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()
# Predict default wherever the score is strictly above the threshold.
default_predict = np.greater(default_scores, threshold)
default_predict

In [None]:
# Precision of the thresholded logistic-regression predictions on validation data.
precision_score(y_true=y_validate, y_pred=default_predict)

In [None]:
# All eight combinations of the three binary features, given column-wise.
# Encodings: gender 0 = female, 1 = male; occupation 0 = education,
# 1 = health; activity 0 = no-activity, 1 = activity.
# Rows 0-3 are the female combinations, rows 4-7 the male ones.
feature_combinations = pd.DataFrame(
    {
        'gender':     [0, 0, 0, 0, 1, 1, 1, 1],
        'occupation': [0, 0, 1, 1, 1, 1, 0, 0],
        'activity':   [0, 1, 0, 1, 0, 1, 0, 1],
    }
)

clf.predict_proba(feature_combinations)

In [None]:
# Predicted probability of default=1 per feature combination; the first
# four combinations are the female ones, so this is their average.
prob_default = clf.predict_proba(feature_combinations)[:, 1]
prob_default[:4].mean()

In [None]:
# Average predicted default probability over the male combinations (rows 4 onwards).
prob_default[4:].mean()

## Decision tree

In [None]:
# Fit a decision tree on the training data and show class probabilities
# for the validation sample.
# random_state is fixed to the notebook-wide `seed`: scikit-learn permutes
# the features at every split, so without it ties between equally good
# splits can be broken differently from run to run.
hyperparams = {"criterion": "gini", "random_state": seed}
clf = DecisionTreeClassifier(**hyperparams)
clf.fit(X_train, y_train)
clf.predict_proba(X_validate_first_n)

In [None]:
# Same thresholding scheme as for the other models: predict default where
# the default=1 score is strictly above the mean score on validation data.
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()
default_predict = np.greater(default_scores, threshold)
default_predict

In [None]:
# Precision of the thresholded decision-tree predictions on validation data.
precision_score(y_true=y_validate, y_pred=default_predict)

In [None]:
# Class probabilities from the decision tree for each row of
# `feature_combinations` (one row per gender/occupation/activity combination).
clf.predict_proba(feature_combinations)

In [None]:
# Decision-tree probability of default=1 per feature combination, averaged
# over the first four (female) combinations.
prob_default = clf.predict_proba(feature_combinations)[:, 1]
prob_default[:4].mean()

In [None]:
# Average over the remaining (male) combinations.
prob_default[4:].mean()

## Random forest

In [None]:
# Fit a small random forest on the training data and show class
# probabilities for the validation sample.
# random_state is fixed to the notebook-wide `seed`: the forest draws
# bootstrap samples and feature subsets at random, so without it every
# run yields a different model and different scores below.
hyperparams = {'n_estimators': 5, 'random_state': seed}
clf = RandomForestClassifier(**hyperparams)
clf.fit(X_train, y_train)
clf.predict_proba(X_validate_first_n)

In [None]:
# Threshold the random-forest scores for default=1 at their mean over the
# validation set; strictly larger scores are predicted as default.
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()
default_predict = np.greater(default_scores, threshold)
default_predict

In [None]:
# Precision of the thresholded random-forest predictions on validation data.
precision_score(y_true=y_validate, y_pred=default_predict)

In [None]:
# Class probabilities from the random forest for each row of
# `feature_combinations` (one row per gender/occupation/activity combination).
clf.predict_proba(feature_combinations)

In [None]:
# Random-forest probability of default=1 per feature combination, averaged
# over the first four (female) combinations.
prob_default = clf.predict_proba(feature_combinations)[:, 1]
prob_default[:4].mean()

In [None]:
# Average over the remaining (male) combinations.
prob_default[4:].mean()