In [1]:
import os
from pathlib import Path
import json

import pandas as pd
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import numpy as np

from prr.utils import (
    convert_to_categorical,
    make_feature_combination_array,
    make_feature_combination_score_array
)
from prr.trends import make_trend_reports, make_data_trend_reports

## Read in Simpson dataset used for plot of trend vs regularization parameter

In [2]:
# The CSV lives under the repository root named by the REPO_ROOT environment variable.
data_path = Path(os.environ['REPO_ROOT'], 'data/pathological-default-for-x-validation.csv')

# Allowed values per column: the two feature columns, plus the 'default' target.
label_mapping_values = {'gender': [0, 1], 'occupation': [0, 1]}
data_categories = {**label_mapping_values, 'default': [0, 1]}

# Load the raw table and convert the listed columns to categorical
df = convert_to_categorical(pd.read_csv(data_path), data_categories)

# Show the trend report computed directly from the data (cell output)
print('Data trend report:')
make_data_trend_reports(
    df,
    non_exposure_field_name='occupation',
    target_field_name='default',
    target_field_value=1,
)

Data trend report:


  counts = data.groupby(data.columns.tolist(), as_index=True).size()


{'n_samples': 600,
 'contingency_table': {'dims': ('default', 'gender', 'occupation'),
  'attrs': {},
  'data': [[[93.0, 21.0], [0.0, 123.0]], [[313.0, 1.0], [2.0, 47.0]]],
  'coords': {'default': {'dims': ('default',), 'attrs': {}, 'data': [0, 1]},
   'gender': {'dims': ('gender',), 'attrs': {}, 'data': [0, 1]},
   'occupation': {'dims': ('occupation',), 'attrs': {}, 'data': [0, 1]}},
  'name': None},
 'total_population': {'gender_trend': 0.4487611388828516},
 'sub_populations': [{'population_group': 'occupation',
   'population_value': 0,
   'trend': {'gender_trend': -0.229064039408867}},
  {'population_group': 'occupation',
   'population_value': 1,
   'trend': {'gender_trend': -0.23101604278074866}}]}

## Fit logistic regression with intercept using cross-validation

We take scikit-learn's default settings unless there is a good reason not to; for example, we use `solver='newton-cholesky'` rather than the default `solver='lbfgs'`.

In [9]:
# Feature matrix (the two 0/1 features) and the 0/1 target column.
X = df[['gender', 'occupation']]
y = df['default']

# Let 3-fold cross-validation pick the inverse regularization strength C.
print('Fit model with cross-validation')
clf = LogisticRegressionCV(
    cv=3, random_state=42, fit_intercept=True, penalty='l2', solver='newton-cholesky'
).fit(X, y)
print(f'Cross-validation optimized choice of C: {clf.C_[0]}')

# Grid of feature values to score (presumably one row per combination of the
# listed values -- confirm against prr.utils). A copy of the mapping is passed
# so the helper cannot mutate the notebook-level dict.
feature_combination_array = make_feature_combination_array(
    label_mapping_values=dict(label_mapping_values)
)
scores = clf.predict_proba(feature_combination_array)
prob_default = scores[:, [1]]  # Class 1 is default, class 0 no-default

# Tabulate each scored row next to its predicted default probability.
feature_fields = list(label_mapping_values.keys())
feature_combination_scores = pd.DataFrame(
    np.concatenate([feature_combination_array, prob_default], axis=1),
    columns=feature_fields + ['default_score']
)
# The concatenation yields a single float array; restore integer feature labels.
feature_combination_scores = feature_combination_scores.astype(
    {feature: int for feature in feature_fields}
)

# Convert flat probabilities to xarray for more robust accessing
feature_combination_contingency = make_feature_combination_score_array(
    feature_combinations=feature_combination_scores[feature_fields],
    scores=feature_combination_scores['default_score']
)

# Trend reports from the model scores, one per occupation value (cell output).
sub_population_trend_reports = make_trend_reports(
    feature_combination_contingency, population_subgroup='occupation'
)
sub_population_trend_reports

Fix model with cross-validation
Cross-validation optimized choice of C: 0.3593813663804626




[{'population_group': 'occupation',
  'population_value': 0,
  'trend': {'gender_trend': 0.003078681346700307}},
 {'population_group': 'occupation',
  'population_value': 1,
  'trend': {'gender_trend': 0.0033781725168935406}}]

## Calculate "true" trends by fitting with a large inverse regularization parameter

In [4]:
# A very large C makes the L2 penalty negligible, so this fit serves as the
# "true" (effectively unregularized) reference model.
clf_0 = LogisticRegression(
    C=1e9,
    penalty='l2',
    solver='newton-cholesky',
    fit_intercept=True,
).fit(X, y)

# Score the grid of feature values with the reference model.
feature_combination_array = make_feature_combination_array(
    label_mapping_values=label_mapping_values.copy()
)
scores = clf_0.predict_proba(feature_combination_array)
prob_default = scores[:, 1:2]  # keep column 1 (class 1 = default) as a 2-D column

# One row per scored feature combination, with its default probability appended.
feature_fields = list(label_mapping_values)
feature_combination_scores = pd.DataFrame(
    np.hstack([feature_combination_array, prob_default]),
    columns=[*feature_fields, 'default_score'],
).astype({field: int for field in feature_fields})

# Reshape the flat score table into an xarray structure for labelled access.
feature_combination_contingency = make_feature_combination_score_array(
    scores=feature_combination_scores['default_score'],
    feature_combinations=feature_combination_scores[feature_fields],
)

# Trend reports per occupation value; shown as the cell output.
sub_population_trend_reports = make_trend_reports(
    feature_combination_contingency,
    population_subgroup='occupation',
)
sub_population_trend_reports



[{'population_group': 'occupation',
  'population_value': 0,
  'trend': {'gender_trend': -0.19555714999981944}},
 {'population_group': 'occupation',
  'population_value': 1,
  'trend': {'gender_trend': -0.2342684634471907}}]

### Fit again, but using `class_weight='balanced'`

This adjusts for the unbalanced classes in the dataset.

In [5]:
# Cross-validated fit (5 folds) with class weights balanced, to adjust for the
# unbalanced classes in the dataset.
print('Fit model with cross-validation')
clf_balanced = LogisticRegressionCV(
    cv=5, random_state=42, fit_intercept=True, penalty='l2', solver='newton-cholesky',
    class_weight='balanced'
).fit(X, y)
print(f'Cross-validation optimized choice of C: {clf_balanced.C_[0]}')

# Rebuild the grid of feature values here rather than reusing the variable left
# behind by an earlier cell, so this cell does not depend on which cell ran last.
feature_combination_array = make_feature_combination_array(
    label_mapping_values=dict(label_mapping_values)
)
scores = clf_balanced.predict_proba(feature_combination_array)
prob_default = scores[:, [1]]  # Class 1 is default, class 0 no-default

# Tabulate each scored row next to its predicted default probability.
feature_fields = list(label_mapping_values.keys())
feature_combination_scores = pd.DataFrame(
    np.concatenate([feature_combination_array, prob_default], axis=1),
    columns=feature_fields + ['default_score']
)
# The concatenation yields a single float array; restore integer feature labels.
feature_combination_scores = feature_combination_scores.astype(
    {feature: int for feature in feature_fields}
)

# Convert flat probabilities to xarray for more robust accessing
feature_combination_contingency = make_feature_combination_score_array(
    feature_combinations=feature_combination_scores[feature_fields],
    scores=feature_combination_scores['default_score']
)

# Trend reports from the model scores, one per occupation value (cell output).
sub_population_trend_reports = make_trend_reports(
    feature_combination_contingency, population_subgroup='occupation'
)
sub_population_trend_reports

Fix model with cross-validation
Cross-validation optimized choice of C: 0.046415888336127774




[{'population_group': 'occupation',
  'population_value': 0,
  'trend': {'gender_trend': 0.13173856642063608}},
 {'population_group': 'occupation',
  'population_value': 1,
  'trend': {'gender_trend': 0.11679702737617637}}]