## Model selection

### Topics

Out of scope: [feature engineering](https://en.wikipedia.org/wiki/Feature_engineering); we assume we already have good enough features.

1. Separate out training and validation sets from available data.

1. For each of the classification model families (logistic regression, decision trees, random forests and causal random forests):

    * fit the training data

In [1]:
import os
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import xarray as xr

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score

from scipy.stats import pearsonr

from fake_data_for_learning.contingency_tables import calculate_contingency_table

from risk_learning.simpson import (
    get_flat_combinations, 
    make_feature_combination_score_array,
    compute_margin
)

In [2]:
# Input data lives in a `data` folder below the current working directory.
data_dir = Path.cwd() / 'data'

# Column roles: one binary target, the remaining columns are model features.
target_col = 'default'
non_target_cols = ['gender', 'occupation', 'activity']
file_read_params = {'delimiter': ','}

# Read only the columns used in this notebook.
default = pd.read_csv(
    data_dir / 'default.csv',
    usecols=[*non_target_cols, target_col],
    **file_read_params,
)
print(f'Shape of entire dataset: {default.shape}')
default.head()

Shape of entire dataset: (10000, 4)


Unnamed: 0,gender,occupation,default,activity
0,1,0,1,0
1,0,1,1,1
2,0,0,0,0
3,1,1,0,0
4,0,0,1,1


## Split data for model fitting

In [3]:
# Split configuration: `train_ratio` of the rows go to training; the rest is
# split again, and only the validation share of that second split is kept.
seed = 42
train_ratio = 0.6
validate_ratio_after_train_split = 0.5

# First split: training rows versus everything else.
X_train, X_validate_test, y_train, y_validate_test = train_test_split(
    default.loc[:, non_target_cols],
    default.loc[:, target_col],
    train_size=train_ratio,
    random_state=seed,
)
print(f'Shape of training fields: {X_train.shape},\nShape of target: {y_train.shape}')

# Second split: carve the validation set out of the remainder; the other
# half is discarded in this notebook.
X_validate, _, y_validate, _ = train_test_split(
    X_validate_test,
    y_validate_test,
    train_size=validate_ratio_after_train_split,
    random_state=seed,
)
print(f'\nShape of validation fields: {X_validate.shape},\nShape of target: {y_validate.shape}')

Shape of training fields: (6000, 3),
Shape of target: (6000,)

Shape of validation fields: (2000, 3),
Shape of target: (2000,)


### A little Exploratory Data Analysis

Very little.

In [4]:
# Admissible values per column; these could be inferred from the data or
# taken from the source IT system.
default_values = [0, 1]
gender_values = [0, 1]
occupation_values = [0, 1]
activity_values = [0, 1]

# Ordered categorical dtype for each column, keyed by column name.
data_categories = {
    'default': CategoricalDtype(categories=default_values, ordered=True),
    'gender': CategoricalDtype(categories=gender_values, ordered=True),
    'occupation': CategoricalDtype(categories=occupation_values, ordered=True),
    'activity': CategoricalDtype(categories=activity_values, ordered=True),
}

# Re-attach the target to the training features (positionally, via `.values`)
# and cast every column to its categorical dtype in one step.
X_y_train = X_train.assign(default=y_train.values).astype(data_categories)

# Columns are passed in the key order of `data_categories`.
contingency_table = calculate_contingency_table(X_y_train[list(data_categories)])

In [5]:
# Empirical default rate per gender, computed as the ratio of two margins of
# the training contingency table: (default=1 and gender=g) over (gender=g).
# NOTE(review): gender 0 is labelled "female" and 1 "male" by this cell's
# variable names; the encoding itself is not visible here.
female_default_rate = compute_margin(
    contingency_table, {'default': 1, 'gender': 0}
) / compute_margin(contingency_table, {'gender': 0})
print('\nfemale empirical default rate', female_default_rate)

male_default_rate = compute_margin(
    contingency_table, {'default': 1, 'gender': 1}
) / compute_margin(contingency_table, {'gender': 1})
print('male empirical default rate', male_default_rate)


female empirical default rate 0.37757514353259036
male empirical default rate 0.3320171108917407


In [6]:
# Pearson correlation (statistic and p-value) for a few column pairs of the
# recombined training frame.
pairs = [('gender', 'default'), ('activity', 'default'), ('occupation', 'activity')]
for pair in pairs:
    first_col, second_col = pair
    correlation = pearsonr(X_y_train[first_col], X_y_train[second_col])
    print(f'Correlation of {pair}: {correlation}')

Correlation of ('gender', 'default'): (-0.04761480627555703, 0.0002246500110554631)
Correlation of ('activity', 'default'): (0.6457464358214573, 0.0)
Correlation of ('occupation', 'activity'): (-0.36227791894060063, 1.607006088912142e-185)


In [7]:
# Other variables used when evaluating the different models.
# The per-feature value lists (`gender_values`, `occupation_values`,
# `activity_values`) are the ones defined in the exploratory-analysis cell
# earlier in this notebook; they are reused here rather than re-declared so
# there is a single source of truth for the admissible values.
feature_coords = dict(
    gender=gender_values,
    occupation=occupation_values,
    activity=activity_values
)
# Flattened grid of feature-value combinations, fed to each model's
# `predict_proba` in the per-model cells of this notebook.
feature_combinations = get_flat_combinations(feature_coords)

## Fit models to training data and spot-check on validation data

In [8]:
# Small sample of the validation set used to spot-check each model's output.
first_n = 5
idxes_first_n = X_validate.index[:first_n]

# Select features and target the same way, by position. `.iloc` keeps the
# two selections aligned and exactly `first_n` rows long even if index labels
# were to repeat, and avoids the label-vs-position ambiguity of a bare `[:n]`
# slice on an integer-labelled Series.
X_validate_first_n = X_validate.iloc[:first_n]
y_validate_first_n = y_validate.iloc[:first_n]

# Both pieces come from the same split and carry the same index labels, so
# the target aligns row by row with the features.
sample_validate = X_validate_first_n.assign(default=y_validate_first_n)
sample_validate

Unnamed: 0,gender,occupation,activity,default
8113,0,0,1,1
1562,1,1,0,0
5735,0,1,0,0
9834,0,1,1,1
5379,1,0,0,0


## Logistic regression

In [9]:
# Interactive help lookup for LogisticRegression (IPython `?` syntax; this line is not plain Python).
?LogisticRegression

In [10]:
# Logistic regression with regularisation parameter C=0.5.
# `random_state` reuses the notebook-wide `seed` (set in the data-splitting
# cell) instead of repeating the literal, so all randomness is controlled
# from one place.
hyperparams = {"C": 0.5, "random_state": seed}
clf = LogisticRegression(**hyperparams)
clf.fit(X_train, y_train)

# Class probabilities for the spot-check rows; the second column is the one
# used as the default=1 score in the threshold cell that follows.
clf.predict_proba(X_validate_first_n)

array([[0.0124595 , 0.9875405 ],
       [0.72900609, 0.27099391],
       [0.99387285, 0.00612715],
       [0.03406177, 0.96593823],
       [0.49044512, 0.50955488]])

In [11]:
# Decision threshold: the mean predicted default=1 probability over the
# validation set; scores strictly above it are classified as default.
# NOTE: choosing a good decision threshold deserves more attention than this.
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()
default_predict = np.greater(default_scores, threshold)
default_predict

array([ True, False, False, ...,  True,  True, False])

In [12]:
# Precision of the thresholded predictions on the validation set.
precision_score(y_true=y_validate, y_pred=default_predict)

0.7812995245641838

In [13]:
# Default scores by gender: score every feature combination, then take the
# plain (unweighted) mean of the scores within each gender slice.
scores = clf.predict_proba(feature_combinations)[:, 1]
score_array = make_feature_combination_score_array(feature_combinations, scores)
mean_female_score = score_array.sel(gender=0).mean().values
mean_male_score = score_array.sel(gender=1).mean().values
print(type(clf), 'mean scores')
print(f'female: {mean_female_score}, male: {mean_male_score}')

<class 'sklearn.linear_model._logistic.LogisticRegression'> mean scores
female: 0.49413614994455485, male: 0.6949387789994711


## Decision tree

In [14]:
# Decision tree using the Gini impurity criterion.
# `random_state` reuses the notebook-wide `seed` (set in the data-splitting
# cell) instead of repeating the literal, so all randomness is controlled
# from one place.
hyperparams = {"criterion": "gini", "random_state": seed}
clf = DecisionTreeClassifier(**hyperparams)
clf.fit(X_train, y_train)

# Class probabilities for the spot-check rows; the second column is the one
# used as the default=1 score in the threshold cell that follows.
clf.predict_proba(X_validate_first_n)

array([[0.        , 1.        ],
       [0.7304007 , 0.2695993 ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.47510094, 0.52489906]])

In [15]:
# Same heuristic as for the other models: classify as default when the
# predicted default=1 probability exceeds the validation-set mean.
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()
default_predict = np.greater(default_scores, threshold)
default_predict

array([ True, False, False, ...,  True,  True, False])

In [16]:
# Precision of the thresholded decision-tree predictions on the validation set.
precision_score(y_true=y_validate, y_pred=default_predict)

0.7812995245641838

In [17]:
# Default scores by gender: score every feature combination, then take the
# plain (unweighted) mean of the scores within each gender slice.
scores = clf.predict_proba(feature_combinations)[:, 1]
score_array = make_feature_combination_score_array(feature_combinations, scores)
mean_female_score = score_array.sel(gender=0).mean().values
mean_male_score = score_array.sel(gender=1).mean().values
print(type(clf), 'mean scores')
print(f'female: {mean_female_score}, male: {mean_male_score}')

<class 'sklearn.tree._classes.DecisionTreeClassifier'> mean scores
female: 0.5, male: 0.6986245902523436


In [18]:
# Cross-check of the per-gender means directly on the flat score vector.
# NOTE(review): presumably the first four feature combinations are the
# gender=0 rows — confirm against the ordering of `feature_combinations`.
prob_default = clf.predict_proba(feature_combinations)[:, 1]
prob_default[:4].mean()

0.5

In [19]:
# Mean score over the remaining feature combinations (from position 4 on).
prob_default[4:].mean()

0.6986245902523436

## Random forest

In [20]:
# Random forest with five trees.
# `random_state` reuses the notebook-wide `seed` (set in the data-splitting
# cell) instead of repeating the literal, so all randomness is controlled
# from one place.
hyperparams = {"n_estimators": 5, "random_state": seed}
clf = RandomForestClassifier(**hyperparams)
clf.fit(X_train, y_train)

# Class probabilities for the spot-check rows; the second column is the one
# used as the default=1 score in the threshold cell that follows.
clf.predict_proba(X_validate_first_n)

array([[0.        , 1.        ],
       [0.72733895, 0.27266105],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.47440243, 0.52559757]])

In [21]:
# Same heuristic as for the other models: classify as default when the
# predicted default=1 probability exceeds the validation-set mean.
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()
default_predict = np.greater(default_scores, threshold)
default_predict

array([ True, False, False, ...,  True,  True, False])

In [22]:
# Precision of the thresholded random-forest predictions on the validation set.
precision_score(y_true=y_validate, y_pred=default_predict)

0.7812995245641838

In [23]:
# Default scores by gender: score every feature combination, then take the
# plain (unweighted) mean of the scores within each gender slice.
scores = clf.predict_proba(feature_combinations)[:, 1]
score_array = make_feature_combination_score_array(feature_combinations, scores)
mean_female_score = score_array.sel(gender=0).mean().values
mean_male_score = score_array.sel(gender=1).mean().values
print(type(clf), 'mean scores')
print(f'female: {mean_female_score}, male: {mean_male_score}')

<class 'sklearn.ensemble._forest.RandomForestClassifier'> mean scores
female: 0.5, male: 0.6153113438316921
