In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%pylab inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import all of our files
import sys
sys.path.append('../')

import fico
import distribution_to_loans_outcomes as dlo

DATA_DIR = '../data/'

Populating the interactive namespace from numpy and matplotlib


In [3]:
# set plotting parameters
sns.set_context("talk")
sns.set_style("white")

# this needs to be here so we can edit figures later
plt.rcParams['pdf.fonttype'] = 42

In [4]:
all_cdfs, performance, totals = fico.get_FICO_data(data_dir=DATA_DIR);

In [5]:
cdfs = all_cdfs[["White","Black"]]

# B is White
# A is Black

cdf_B = cdfs['White'].values
cdf_A = cdfs['Black'].values

repay_B = performance['White']
repay_A = performance['Black']

scores = cdfs.index
scores_list = scores.tolist()
scores_repay = cdfs.index

In [6]:
# to populate group distributions
def get_pmf(cdf):
    pis = np.zeros(cdf.size)
    pis[0] = cdf[0]
    for score in range(cdf.size-1):
        pis[score+1] = cdf[score+1] - cdf[score]
    return pis

# to get loan repay probabilities for a given score
loan_repaid_probs = [lambda i: repay_A[scores[scores.get_loc(i,method='nearest')]], 
                     lambda i: repay_B[scores[scores.get_loc(i,method='nearest')]]]

In [7]:
# basic parameters
N_scores = cdf_B.size
N_groups = 2

# get probability mass functions of each group
pi_A = get_pmf(cdf_A)
pi_B = get_pmf(cdf_B)
pis = np.vstack([pi_A, pi_B])

# demographic statistics 
group_ratio = np.array((totals["Black"], totals["White"]))
group_size_ratio = group_ratio/group_ratio.sum()
print(group_size_ratio)

[0.12066905 0.87933095]


In [8]:
# to get loan repay probabilities for a given score
loan_repaid_probs = [lambda i: repay_A[scores[scores.get_loc(i,method='nearest')]], 
                     lambda i: repay_B[scores[scores.get_loc(i,method='nearest')]]]

# unpacking repay probability as a function of score
loan_repay_fns = [lambda x: loan_repaid_prob(x) for
                      loan_repaid_prob in loan_repaid_probs]

[0.12066905 0.87933095]


In [9]:
# all of the above is from Lydia's code in delayed-impact repos
# all of the below is my code transforming the data and running linear regression on it!

In [10]:
import pandas as pd

In [11]:
def get_repay_probabilities(samples, repay_probs):
    sample_probs = []
    for index, score in enumerate(samples):
        prob_index = np.where(scores_arr == score)
        repay_prob = repay_probs[prob_index[0][0]]
        sample_probs.insert(index, repay_prob)
    return sample_probs

In [12]:
# NOTE: A is Black, B is White

# Convert data in format needed
scores_arr = np.asarray(scores_list)
repay_A_arr = pd.Series.to_numpy(repay_A)
repay_B_arr = pd.Series.to_numpy(repay_B)

In [13]:
# For reference: https://www.w3schools.com/python/ref_random_choices.asp
from random import choices

num_A_samples = 120
num_B_samples = 880

samples_A = sorted(choices(scores_arr, pi_A, k=num_A_samples))
samples_B = sorted(choices(scores_arr, pi_B, k=num_B_samples))

In [14]:
print(samples_A)
print(samples_B)

[311.9047619047619, 323.8095238095238, 323.8095238095238, 323.8095238095238, 323.8095238095238, 323.8095238095238, 323.8095238095238, 323.8095238095238, 323.8095238095238, 323.8095238095238, 347.6190476190476, 347.6190476190476, 347.6190476190476, 354.76190476190476, 354.76190476190476, 360.7142857142857, 360.7142857142857, 366.6666666666667, 366.6666666666667, 372.6190476190476, 390.4761904761905, 390.4761904761905, 390.4761904761905, 396.42857142857144, 406.48148148148147, 406.48148148148147, 406.48148148148147, 406.48148148148147, 411.1111111111111, 411.1111111111111, 425.0, 434.25925925925924, 443.51851851851853, 443.51851851851853, 443.51851851851853, 443.51851851851853, 443.51851851851853, 452.3076923076923, 456.15384615384613, 463.8461538461538, 467.6923076923077, 467.6923076923077, 467.6923076923077, 467.6923076923077, 467.6923076923077, 475.38461538461536, 479.2307692307692, 479.2307692307692, 479.2307692307692, 494.6153846153846, 498.46153846153845, 505.0632911392405, 508.227

In [15]:
# Once I have samples for white and black groups, calculate their probabilities and make lists for race
# A == Black == 0 (later defined as 0.0 when converting to pandas df)
# B == White == 1 (later defined as 1.0 when converting to pandas df)

samples_A_probs = np.asarray(get_repay_probabilities(samples=samples_A, repay_probs=repay_A_arr))
samples_A_race = np.asarray([0] * num_A_samples)

samples_B_probs = np.asarray(get_repay_probabilities(samples=samples_B, repay_probs=repay_B_arr))
samples_B_race = np.asarray([1] * num_B_samples)

In [16]:
data_A_dict = {'score': samples_A, 'repay_probability': samples_A_probs, 'race': samples_A_race}
data_B_dict = {'score': samples_B, 'repay_probability': samples_B_probs, 'race': samples_B_race}

data_A_df = pd.DataFrame(data=data_A_dict, dtype=np.float64)
data_B_df = pd.DataFrame(data=data_B_dict, dtype=np.float64)

data_all_df = pd.concat([data_A_df, data_B_df], ignore_index=True)
print(data_all_df)
data_all_df_shuffled = data_all_df.sample(frac=1).reset_index(drop=True)
print(data_all_df_shuffled)

          score  repay_probability  race
0    311.904762             0.0077   0.0
1    323.809524             0.0120   0.0
2    323.809524             0.0120   0.0
3    323.809524             0.0120   0.0
4    323.809524             0.0120   0.0
..          ...                ...   ...
995  836.842105             0.9902   1.0
996  836.842105             0.9902   1.0
997  841.228070             0.9905   1.0
998  841.228070             0.9905   1.0
999  845.614035             0.9907   1.0

[1000 rows x 3 columns]
          score  repay_probability  race
0    795.886076             0.9870   1.0
1    601.250000             0.7521   1.0
2    511.392405             0.2686   1.0
3    781.645570             0.9850   1.0
4    685.507246             0.9442   1.0
..          ...                ...   ...
995  730.882353             0.9740   1.0
996  694.565217             0.9533   1.0
997  384.523810             0.0737   1.0
998  738.235294             0.9768   1.0
999  757.911392             0.98

In [17]:
# TODO 5: THEN do linear regression on the data
#https://realpython.com/linear-regression-in-python/