# 0. Import packages and set Parameters

In [None]:
# Load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the project scripts are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# imports for my own code
import pandas as pd
import random
from random import choices
from scipy import stats

# import all of our files
import sys
sys.path.append('../')
import Liu_paper_code.fico as fico
import Liu_paper_code.distribution_to_loans_outcomes as dlo

from scripts.data_creation_utils import get_pmf,get_repay_probabilities,get_scores, adjust_set_ratios
from scripts.evaluation_utils import inspect_MinMax
from scripts.visualization_utils import visualize_data_distribution, visual_scores_by_race

In [None]:
# Parameters

# Where the raw FICO data is read from and where the generated set is written.
data_dir = "../data/raw/"
results_dir = "../data/testing/"
file_name = "test.csv"

# Size of the final data set and number of samples generated per batch.
# NOTE(review): the batch size was described as "larger than set_size", but
# both values are currently equal -- confirm which is intended.
set_size = 100_000
order_of_magnitude = 100_000

# Share of each group in the set, ordered [A (Black), B (White)].
group_size_ratio = [0.12, 0.88]
# Label ratio inside the Black group, forwarded to adjust_set_ratios.
# Presumably ordered [negative, positive] -- TODO confirm against that helper.
black_label_ratio = [0.66, 0.34]

# Seed handed to sample() for the first batch.
shuffle_seed = 42
# Rounding precision forwarded to get_scores (round_num).
round_num_scores = 2

# 1. Load and parse the data
#### Code is primarily from Lydia's FICO-figures.ipynb 

In [None]:
# Load the three FICO tables (CDFs, repayment performance, totals) from data_dir.
(all_cdfs, performance, totals) = fico.get_FICO_data(data_dir)

# 2. Convert the data into the format needed

In [None]:
# Naming convention used from here on:
#   A is Black
#   B is White
cdfs = all_cdfs[["White", "Black"]]

# Score grid shared by both groups (the index of the CDF table).
scores = cdfs.index
scores_repay = cdfs.index
scores_list = scores.tolist()

# Per-group CDF values as plain arrays.
cdf_A, cdf_B = (cdfs[group].values for group in ("Black", "White"))

# Per-group repay performance, still indexed as in `performance`.
repay_A, repay_B = performance["Black"], performance["White"]



In [None]:
# basic dimensions of the problem
N_groups = 2
N_scores = cdf_B.size

# probability mass function of each group, derived from its CDF;
# stacked row-wise with group A first and group B second
pi_A, pi_B = get_pmf(cdf_A), get_pmf(cdf_B)
pis = np.vstack((pi_A, pi_B))

# demographic statistics (disabled): would replace the configured
# group_size_ratio with the true FICO data group size ratio
#group_ratio = np.array((totals["Black"], totals["White"]))
#group_size_ratio = group_ratio/group_ratio.sum() - true fico data goup size ratio
#print(group_size_ratio)

In [None]:
# to get loan repay probabilities for a given score:
# each function looks up the repay probability at the score nearest to `i`.
# Index.get_indexer(..., method='nearest') is used because the `method`
# argument of Index.get_loc was removed in pandas 2.0.
loan_repaid_probs = [
    lambda i: repay_A[scores[scores.get_indexer([i], method='nearest')[0]]],
    lambda i: repay_B[scores[scores.get_indexer([i], method='nearest')[0]]],
]


def _bind_repay_fn(loan_repaid_prob):
    """Return a one-argument wrapper bound to this specific lookup function."""
    return lambda x: loan_repaid_prob(x)


# unpacking repay probability as a function of score.
# Each wrapper is bound through _bind_repay_fn: a bare lambda inside the
# comprehension would capture the loop variable by reference, so every
# entry would end up calling the last (group B) lookup.
loan_repay_fns = [_bind_repay_fn(loan_repaid_prob)
                  for loan_repaid_prob in loan_repaid_probs]

In [None]:
# Rounded score grid used for sampling (we recommend 1 or 2 for round_num)
scores_arr = np.asarray(
    get_scores(scores=scores_list, round_num=round_num_scores)
)
print(scores_arr)

# Make repay probabilities into percentages from decimals
repay_A_arr = repay_A.to_numpy() * 100
repay_B_arr = repay_B.to_numpy() * 100

# 3. Sample from the data according to the ratios, combine the scores and probabilities, and convert data types

In [None]:
# Sample data according to the pmf
def sample(group_size_ratio, order_of_magnitude, shuffle_seed, scores_arr, pi_A, pi_B, repay_A_arr, repay_B_arr):
    """Draw one shuffled batch of samples for both groups.

    Scores are drawn with replacement from ``scores_arr`` using each group's
    pmf as weights, a repay probability is attached to every score via
    ``get_repay_probabilities``, the two groups are combined and shuffled,
    and a binary repay/default outcome is drawn for every row.

    Args:
        group_size_ratio: Sequence whose first entry is the share of group A
            (Black) and whose second entry is the share of group B (White).
        order_of_magnitude: Batch size; each group receives
            ``int(ratio * order_of_magnitude)`` samples.
        shuffle_seed: Seed for NumPy's global RNG, set right before the rows
            are shuffled. The score draws and the repay/default draws use
            the standard ``random`` module and are NOT seeded here.
        scores_arr: Scores to draw from.
        pi_A: Sampling weights over ``scores_arr`` for group A.
        pi_B: Sampling weights over ``scores_arr`` for group B.
        repay_A_arr: Repay probabilities for group A, passed to
            ``get_repay_probabilities`` (compared below on a 0-100 scale).
        repay_B_arr: Same as ``repay_A_arr`` for group B.

    Returns:
        Tuple ``(data, samples_A, samples_B, samples_A_probs, samples_B_probs)``
        where ``data`` is the shuffled DataFrame with columns ``score``,
        ``repay_probability``, ``race`` (0.0 = A/Black, 1.0 = B/White) and
        ``repay_indices`` (1 = repay, 0 = default), and the remaining items
        are the sorted per-group scores and their repay probabilities.
    """
    # Sample data according to the pmf
    # Reference: https://www.w3schools.com/python/ref_random_choices.asp
    num_A_samples = int(group_size_ratio[0] * order_of_magnitude)
    num_B_samples = int(group_size_ratio[1] * order_of_magnitude)

    samples_A = np.asarray(sorted(choices(scores_arr, pi_A, k=num_A_samples)))
    samples_B = np.asarray(sorted(choices(scores_arr, pi_B, k=num_B_samples)))

    # Calculate samples groups' probabilities and make arrays for race

    # A == Black == 0 (becomes 0.0 when converted to a float64 DataFrame)
    samples_A_probs = get_repay_probabilities(samples=samples_A, scores_arr=scores_arr, repay_probs=repay_A_arr, round_num=1)
    samples_A_race = np.zeros(num_A_samples, dtype=int)
    # B == White == 1 (becomes 1.0 when converted to a float64 DataFrame)
    samples_B_probs = get_repay_probabilities(samples=samples_B, scores_arr=scores_arr, repay_probs=repay_B_arr, round_num=1)
    samples_B_race = np.ones(num_B_samples, dtype=int)

    # Get data in dict form with score, repay prob, and race
    data_A_dict = {'score': samples_A, 'repay_probability': samples_A_probs, 'race': samples_A_race}
    data_B_dict = {'score': samples_B, 'repay_probability': samples_B_probs, 'race': samples_B_race}

    # Convert from dict to df
    data_A_df = pd.DataFrame(data=data_A_dict, dtype=np.float64)
    data_B_df = pd.DataFrame(data=data_B_dict, dtype=np.float64)

    # Combine all of the data together and shuffle
    data_all_df = pd.concat([data_A_df, data_B_df], ignore_index=True)
    np.random.seed(shuffle_seed)
    data_all_df_shuffled = data_all_df.sample(frac=1).reset_index(drop=True)

    # Add final column to dataframe, repay indices (repay: 1, default: 0).
    # Each row repays with probability repay_probability / 100: draw a
    # uniform number in [0, 100) and mark a default when it is not below the
    # probability. A continuous draw is used instead of randint(0, 1000) / 10
    # because the latter has 1001 equally likely outcomes, which slightly
    # inflates the repay rate and lets a 0% probability still repay.
    probabilities = data_all_df_shuffled['repay_probability']
    repay_indices = [
        0 if random.random() * 100 >= prob else 1
        for prob in probabilities
    ]

    data_all_df_shuffled['repay_indices'] = np.array(repay_indices)

    return data_all_df_shuffled, samples_A, samples_B, samples_A_probs, samples_B_probs

## 3.1 Generate set, based on set-ratios specified in Section 0

In [None]:
# Reference: https://www.w3schools.com/python/ref_random_choices.asp
# Race encoding used in the 'race' column:
#   A == Black == 0 (stored as 0.0 in the pandas df)
#   B == White == 1 (stored as 1.0 in the pandas df)


def _split_features_labels(frame):
    """Split a sampled frame into (x, y): feature columns and repay labels as arrays."""
    features = frame[['score', 'repay_probability', 'race']].values
    labels = frame['repay_indices'].values
    return features, labels


# generate the first batch of samples, seeded with shuffle_seed
data, samples_A, samples_B, samples_A_probs, samples_B_probs = sample(
    group_size_ratio, order_of_magnitude, shuffle_seed,
    scores_arr, pi_A, pi_B, repay_A_arr, repay_B_arr,
)

# split into features/labels and adjust the set to the specified ratios
x, y = _split_features_labels(data)
x, y = adjust_set_ratios(x, y, black_label_ratio, group_size_ratio, set_size)

# if the adjusted dataset is too small, keep sampling additional batches
# (seeded with 1, 2, ...) and re-adjusting the accumulated data
i = 0
while len(y) < set_size:
    i += 1
    # Generate new samples
    data_add, samples_A_add, samples_B_add, samples_A_probs_add, samples_B_probs_add = sample(
        group_size_ratio, order_of_magnitude, i,
        scores_arr, pi_A, pi_B, repay_A_arr, repay_B_arr,
    )

    # accumulate the new batch onto everything sampled so far
    data = pd.concat([data, data_add])
    samples_A = np.concatenate((samples_A, samples_A_add))
    samples_A_probs = np.concatenate((samples_A_probs, samples_A_probs_add))
    samples_B = np.concatenate((samples_B, samples_B_add))
    samples_B_probs = np.concatenate((samples_B_probs, samples_B_probs_add))

    # re-split and re-adjust the accumulated data
    x, y = _split_features_labels(data)
    x, y = adjust_set_ratios(x, y, black_label_ratio, group_size_ratio, set_size)

# merge x,y back into a DataFrame
df = {
    'score': x[:, 0],
    'repay_probability': x[:, 1],
    'race': x[:, 2],
    'repay_indices': y,
}
data = pd.DataFrame(df)

# print proportions of dataset: number of extra batches, Black rows with
# label 0 / label 1, and White rows
idx_An = np.where((x[:, 2] == 0) & (y == 0))[0]
idx_Ap = np.where((x[:, 2] == 0) & (y == 1))[0]
idx_B = np.where((x[:, 2] == 1))[0]
print(i,'Black N/P:',len(idx_An),'/',len(idx_Ap),'White:',len(idx_B))

## 3.2 Save the pandas dataframes to CSVs

In [None]:
# Write the generated set to results_dir + file_name, without the index column
data.to_csv(results_dir + file_name, index=False)

# To save the data separately by race
#data_A_df.to_csv(index=False, path_or_buf='simData_2decProbs_0decScores_groupA_black.csv')
#data_B_df.to_csv(index=False, path_or_buf='simData_2decProbs_0decScores_groupB_white.csv')

# 4. Evaluation

## 4.1 Inspect the min/max values of the data

In [None]:
# Min/max check on the repay probabilities of group A (Black) and group B (White)
inspect_MinMax(samples_A_probs, samples_B_probs)

## 4.2 Visualize Distributions

In [None]:
# Visualize the sampled scores and repay probabilities of both groups;
# results_dir is passed along as the first argument
visualize_data_distribution(
    results_dir,
    samples_A,
    samples_A_probs,
    samples_B,
    samples_B_probs,
)

In [None]:
#visual_scores_by_race(data)