## Model selection

### Topics

Out of scope: [feature engineering](https://en.wikipedia.org/wiki/Feature_engineering); we assume we already have good enough features.

1. Separate out training and validation sets from available data.

1. For each of the classification model families (logistic regression, decision trees, random forests and causal random forests):

    * fit the model to the training data

In [1]:
import os
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import xarray as xr

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score

from scipy.stats import pearsonr

from fake_data_for_learning.contingency_tables import calculate_contingency_table

from risk_learning.simpson import compute_margin, get_flat_combinations

In [2]:
# Input data lives in a `data` folder below the notebook's working directory
data_dir = Path.cwd() / 'data'

file_read_params = {'delimiter': ','}
target_col = 'default'
non_target_cols = ['gender', 'occupation', 'activity']

# Load only the feature columns plus the target column from the CSV
default = pd.read_csv(
    data_dir / 'default.csv',
    usecols=[*non_target_cols, target_col],
    **file_read_params,
)
print(f'Shape of entire dataset: {default.shape}')
default.head()

Shape of entire dataset: (10000, 4)


Unnamed: 0,gender,occupation,default,activity
0,0,0,0,0
1,0,1,1,1
2,1,1,0,0
3,1,1,0,0
4,0,0,0,0


## Split data for model fitting

In [3]:
# Split configuration: 60% of the rows go to training; the remainder is
# halved, and only the first half (validation) is kept in this notebook.
train_ratio = 0.6
validate_ratio_after_train_split = 0.5
seed = 42

# First split: training rows versus everything else
X_train, X_validate_test, y_train, y_validate_test = train_test_split(
    default.loc[:, non_target_cols],
    default.loc[:, target_col],
    train_size=train_ratio,
    random_state=seed,
)
print(f'Shape of training fields: {X_train.shape},\nShape of target: {y_train.shape}')

# Second split: carve the validation set out of the remainder;
# the other half (the would-be test set) is discarded here.
X_validate, _, y_validate, _ = train_test_split(
    X_validate_test,
    y_validate_test,
    train_size=validate_ratio_after_train_split,
    random_state=seed,
)
print(f'\nShape of validation fields: {X_validate.shape},\nShape of target: {y_validate.shape}')

Shape of training fields: (6000, 3),
Shape of target: (6000,)

Shape of validation fields: (2000, 3),
Shape of target: (2000,)


### A little Exploratory Data Analysis

Very little.

In [4]:
# Value ranges can be guessed from data or extracted from IT system
default_values = [0, 1]
gender_values = [0, 1]
occupation_values = [0, 1]
activity_values = [0, 1]

# One ordered categorical dtype per column, keyed by column name
data_categories = {
    name: CategoricalDtype(categories=values, ordered=True)
    for name, values in [
        ('default', default_values),
        ('gender', gender_values),
        ('occupation', occupation_values),
        ('activity', activity_values),
    ]
}

# Recombine training features and target into one frame and cast every
# column to its categorical dtype. `y_train.values` attaches the target
# by position rather than by index label.
X_y_train = X_train.assign(default=y_train.values).astype(data_categories)

# Columns are passed in the key order of `data_categories`
contingency_table = calculate_contingency_table(X_y_train[list(data_categories)])

In [5]:
def _default_rate_given_gender(gender_value):
    """Ratio of the (default=1, gender) margin to the gender margin of the contingency table."""
    defaulted = compute_margin(contingency_table, {'default': 1, 'gender': gender_value})
    whole_group = compute_margin(contingency_table, {'gender': gender_value})
    return defaulted / whole_group


# Gender code 0 is reported as female, code 1 as male
female_default_rate = _default_rate_given_gender(0)
print('\nfemale default rate', female_default_rate)
male_default_rate = _default_rate_given_gender(1)
print('male default rate', male_default_rate)


female default rate 0.3745136186770428
male default rate 0.3292181069958848


In [6]:
# Column pairs whose Pearson correlation is reported on the training data
pairs = [
    ('gender', 'default'),
    ('activity', 'default'),
    ('occupation', 'activity'),
]
for pair in pairs:
    left, right = pair
    print(f'Correlation of {pair}: {pearsonr(X_y_train[left], X_y_train[right])}')

Correlation of ('gender', 'default'): (-0.04738658560144876, 0.00024080381961064963)
Correlation of ('activity', 'default'): (0.6617354898683132, 0.0)
Correlation of ('occupation', 'activity'): (-0.3494802806361358, 6.405847695546306e-172)


In [7]:
# Other variables used when evaluating different models.
# The value ranges (gender_values, occupation_values, activity_values) are the
# ones declared together with `data_categories`; they are deliberately not
# re-declared here so the two places cannot drift apart.
feature_coords = dict(
    gender=gender_values,
    occupation=occupation_values,
    activity=activity_values
)

# Every combination of feature values, as produced by the project helper
feature_combinations = get_flat_combinations(feature_coords)

# Zero-initialised array with one axis per feature; the shape is derived from
# `feature_coords` so it always matches the coordinates.
score_array = xr.DataArray(
    np.zeros([len(values) for values in feature_coords.values()]),
    coords=feature_coords
)

## Fit models to training data and spot-check on validation data

In [8]:
# Small slice of the validation set used to spot-check each model's output
first_n = 5
idxes_first_n = X_validate.index[:first_n]
X_validate_first_n = X_validate.loc[idxes_first_n]
y_validate_first_n = y_validate.iloc[:first_n]

# Features and target side by side (the target is aligned on the row index)
sample_validate = X_validate_first_n.assign(default=y_validate_first_n)
sample_validate

Unnamed: 0,gender,occupation,activity,default
8113,1,1,0,0
1562,0,0,0,0
5735,1,0,0,0
9834,0,0,1,1
5379,0,0,1,1


## Logistic regression

In [9]:
# Fit a logistic regression on the training split and spot-check the
# predicted class probabilities on the first validation rows.
hyperparams = dict(C=0.5)
clf = LogisticRegression(**hyperparams).fit(X_train, y_train)
clf.predict_proba(X_validate_first_n)

array([[0.73124993, 0.26875007],
       [0.98373354, 0.01626646],
       [0.49745306, 0.50254694],
       [0.01202461, 0.98797539],
       [0.01202461, 0.98797539]])

In [10]:
# Use a custom decision threshold: the mean predicted probability of
# default=1 over the validation set.
# NOTE choosing a good decision threshold deserves more attention than given here
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()

# Flag a row as default when its score is strictly above the threshold
default_predict = np.greater(default_scores, threshold)
default_predict

array([False, False,  True, ..., False, False, False])

In [11]:
# Precision of the thresholded logistic-regression predictions on the validation set
precision_score(y_true=y_validate, y_pred=default_predict)

0.8032258064516129

In [12]:
# Class probabilities, one row per entry of `feature_combinations`
clf.predict_proba(X=feature_combinations)

array([[9.83733542e-01, 1.62664575e-02],
       [1.20246073e-02, 9.87975393e-01],
       [9.94020443e-01, 5.97955747e-03],
       [3.23723927e-02, 9.67627607e-01],
       [4.97453062e-01, 5.02546938e-01],
       [1.99172441e-04, 9.99800828e-01],
       [7.31249931e-01, 2.68750069e-01],
       [5.47292856e-04, 9.99452707e-01]])

In [13]:
# Second column of predict_proba: the score for default=1 per feature combination
prob_default = clf.predict_proba(X=feature_combinations)[:, 1]

# Unweighted mean over the first four combinations.
# NOTE(review): these appear to be the gender=0 rows; this relies on the row
# ordering returned by get_flat_combinations, so confirm.
prob_default[:4].mean()

0.4944622537675699

In [14]:
# Unweighted mean over the remaining combinations (from the fifth row on)
prob_default[4:].mean()

0.692637635270783

## Decision tree

In [15]:
# Fit a decision tree on the training split and spot-check the predicted
# class probabilities on the first validation rows.
hyperparams = dict(criterion="gini")
clf = DecisionTreeClassifier(**hyperparams).fit(X_train, y_train)
clf.predict_proba(X_validate_first_n)

array([[0.73281748, 0.26718252],
       [1.        , 0.        ],
       [0.48122392, 0.51877608],
       [0.        , 1.        ],
       [0.        , 1.        ]])

In [16]:
# Same thresholding rule as for the other models: a row is flagged as default
# when its default=1 score is strictly above the mean score on the validation set.
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()
default_predict = np.greater(default_scores, threshold)
default_predict

array([False, False,  True, ..., False, False, False])

In [17]:
# Precision of the thresholded decision-tree predictions on the validation set
precision_score(y_true=y_validate, y_pred=default_predict)

0.8032258064516129

In [18]:
# Class probabilities (including the probability of default), one row per
# entry of `feature_combinations`
clf.predict_proba(X=feature_combinations)

array([[1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.48122392, 0.51877608],
       [0.        , 1.        ],
       [0.73281748, 0.26718252],
       [0.        , 1.        ]])

In [19]:
# Second column of predict_proba: the score for default=1 per feature combination
prob_default = clf.predict_proba(X=feature_combinations)[:, 1]

# Unweighted mean over the first four combinations.
# NOTE(review): these appear to be the gender=0 rows; this relies on the row
# ordering returned by get_flat_combinations, so confirm.
prob_default[:4].mean()

0.5

In [20]:
# Unweighted mean over the remaining combinations (from the fifth row on)
prob_default[4:].mean()

0.696489649876586

## Random forest classifier

In [21]:
# Fit a small random forest on the training split and spot-check the
# predicted class probabilities on the first validation rows.
# `random_state` is pinned to the notebook-wide `seed` (defined with the
# train/validate split): a random forest draws bootstrap samples and feature
# subsets at random, so without it every re-run produces different numbers.
hyperparams = {'n_estimators': 5, 'random_state': seed}
clf = RandomForestClassifier(**hyperparams)
clf.fit(X_train, y_train)
clf.predict_proba(X_validate_first_n)

array([[0.73454658, 0.26545342],
       [1.        , 0.        ],
       [0.49720428, 0.50279572],
       [0.        , 1.        ],
       [0.        , 1.        ]])

In [22]:
# Same thresholding rule as for the other models: a row is flagged as default
# when its default=1 score is strictly above the mean score on the validation set.
scores = clf.predict_proba(X_validate)
default_scores = scores[:, 1]
threshold = default_scores.mean()
default_predict = np.greater(default_scores, threshold)
default_predict

array([False, False,  True, ..., False, False, False])

In [23]:
# Precision of the thresholded random-forest predictions on the validation set
precision_score(y_true=y_validate, y_pred=default_predict)

0.8032258064516129

In [24]:
# Class probabilities, one row per entry of `feature_combinations`
clf.predict_proba(X=feature_combinations)

array([[1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.49720428, 0.50279572],
       [0.29775738, 0.70224262],
       [0.73454658, 0.26545342],
       [0.44194135, 0.55805865]])

In [25]:
# Second column of predict_proba: the score for default=1 per feature combination
prob_default = clf.predict_proba(X=feature_combinations)[:, 1]

# Unweighted mean over the first four combinations.
# NOTE(review): these appear to be the gender=0 rows; this relies on the row
# ordering returned by get_flat_combinations, so confirm.
prob_default[:4].mean()

0.5

In [26]:
# Unweighted mean over the remaining combinations (from the fifth row on)
prob_default[4:].mean()

0.5071376044415691