In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from IPython.core.display import HTML
from scipy.stats import norm, poisson,expon,lognorm,skewnorm,exponnorm,skew,kstest
from math import pi
%matplotlib inline

In [None]:
# Import the cleaned loan data and the accompanying categorical data from CSV
# files located in the current working directory.
# index_col=False tells pandas not to use the first column as the row index.
loan_data = pd.read_csv('loan_data.csv', index_col=False)
categorical_data = pd.read_csv('categorical_data.csv', index_col=False)

In [None]:
# Function that takes all of the sample values of a feature X (DataFrame.column or NumPy array)
# and returns the equivalent number of samples(variates) from certain SciPy distributions
# specified by keyword argument, "distribution"
def fit_to_data(X, distribution='norm', **kwargs):
    """Fit a SciPy distribution to X and draw len(X) random variates from it.

    Parameters
    ----------
    X : array-like
        Observed sample values of a single feature.
    distribution : str
        One of 'skewnorm', 'norm', 'expon', 'lognorm' or 'exponnorm'.
    **kwargs
        Accepted for call compatibility; currently unused.

    Returns
    -------
    numpy.ndarray or None
        len(X) variates drawn from the fitted distribution, or None (after
        printing the list of supported names) when `distribution` is not
        recognised.
    """
    supported = {
        'skewnorm': skewnorm,
        'norm': norm,
        'expon': expon,
        'lognorm': lognorm,
        'exponnorm': exponnorm,
    }

    dist = supported.get(distribution)
    if dist is None:
        # Unpack the names so that each of the five placeholders is filled.
        print('Please specify one of the following distributions {},{},{},{},{}'.format(*supported))
        return None

    # SciPy's fit() returns (shape parameters..., loc, scale). Splitting the
    # tuple from the right guarantees loc and scale are never mixed up,
    # regardless of how many shape parameters the distribution has.
    *shape_params, loc, scale = dist.fit(X)
    return dist.rvs(*shape_params, loc=loc, scale=scale, size=len(X))


# Function that is used to contrast kernel density estimate distribution to histogram of actual
# values. 
def hist_density_plot(x, variates, xlabel=None, title=None):
    """Overlay a kernel density estimate of `variates` on a histogram of `x`.

    The histogram (50 bins, left axis labelled 'Frequency') and the KDE curve
    (red, right axis labelled 'Density') share the same x-axis. The figure is
    shown immediately and nothing is returned.
    """
    _, hist_axis = plt.subplots(nrows=1, ncols=1)

    # Left-hand axis: raw counts of the observed values.
    hist_axis.hist(x, bins=50, alpha=0.5)
    hist_axis.set_ylabel('Frequency')
    hist_axis.set_xlabel(xlabel)

    # Right-hand axis: density of the simulated variates on its own scale.
    density_axis = hist_axis.twinx()
    sns.kdeplot(variates, ax=density_axis, color='r')
    density_axis.set_title(title)
    density_axis.set_ylabel('Density')

    plt.show()

# Some of these were previously used but are now unused; they're hanging around in case I decide to use them again.
def imbalance_thresholding(df, threshold):
    """Keep only the columns whose mode accounts for less than `threshold` of rows.

    For each column the share of rows equal to that column's (first) mode is
    computed; columns whose share is at or above `threshold` are removed.
    """
    n_rows = len(df)
    mode_share = pd.Series(
        [(df[col] == df[col].mode().values[0]).sum() / n_rows for col in df.columns],
        index=df.columns,
    )
    kept_names = mode_share.index[mode_share < threshold].tolist()
    return df.loc[:, df.columns.isin(kept_names)]

def max_pairwise_correlations(df):
    """Return, for every feature, its strongest correlation with another feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to correlate (passed to ``DataFrame.corr``).

    Returns
    -------
    pandas.DataFrame
        A frame with a single 'data' column, indexed by (feature, partner)
        pairs. For each feature it holds the row(s) whose correlation with a
        *different* feature is the largest.
    """
    # Correlation matrix flattened into one row per (feature, partner) pair.
    correlations_ = df.corr()
    correlations_df = correlations_.unstack().to_frame(name='data')

    # Remove the trivial self-correlations by comparing the pair's labels.
    # Filtering on the value (data != 1) would also throw away distinct
    # features that are perfectly correlated, and could miss a diagonal entry
    # that is not exactly 1.0 because of floating-point rounding.
    feature = correlations_df.index.get_level_values(0)
    partner = correlations_df.index.get_level_values(1)
    correlations_no_auto = correlations_df[feature != partner]

    # Keep only the rows that attain the per-feature maximum.
    group_max = correlations_no_auto.groupby(level=0)['data'].transform('max')
    maxcvalues = correlations_no_auto[correlations_no_auto['data'] == group_max]
    return maxcvalues

def correlation_thresholding(df, threshold=0.999):
    """Drop one feature from each pair whose maximum correlation exceeds `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to prune.
    threshold : float
        Pairs whose maximum pairwise Pearson correlation is strictly greater
        than this value are considered redundant (default 0.999).

    Returns
    -------
    pandas.DataFrame
        `df` without the redundant columns, or `df` itself when no pair
        exceeds the threshold.
    """
    maxcorr = max_pairwise_correlations(df)
    maxcorr = maxcorr[maxcorr.data > threshold]
    ranked = maxcorr.sort_values(by='data', ascending=False)

    # Walk the pairs from most to least correlated and drop the second member
    # of each pair, skipping pairs that already lost a member. This does not
    # rely on the (a, b) and (b, a) rows both being present and adjacent after
    # sorting, which taking every other row would require.
    columns_to_drop = []
    already_dropped = set()
    for feature, partner in ranked.index:
        if feature in already_dropped or partner in already_dropped:
            continue
        already_dropped.add(partner)
        columns_to_drop.append(partner)

    if columns_to_drop:
        return df.drop(columns=columns_to_drop)
    return df

def clean(df, threshcorr=0.99, threshcount=0.99):
    """Drop rows with missing values, then prune imbalanced and correlated features.

    The shape of the data is printed before pruning and after each of the two
    pruning steps. `threshcount` is forwarded to imbalance_thresholding and
    `threshcorr` to correlation_thresholding. The input frame is not modified.
    """
    working = df.copy()
    print('Original shape: ', df.shape)

    complete_rows = working.dropna()

    balanced = imbalance_thresholding(complete_rows, threshcount)
    print('pruned shape (filtering out imbalanced features):', balanced.shape)

    decorrelated = correlation_thresholding(balanced, threshcorr)
    print('pruned shape (filtering out correlated features):', decorrelated.shape)

    return decorrelated

As a reminder, there are continuous and discrete numerical variables as well as categorical variables (converted to
discrete numerical variables by mapping them to integer codes). With over two million samples, my first thought was
that most of these continuous variables would be normally distributed. This assumption was a result of being new to financial data
and, I suppose, naivety. As I'll show, the continuous variables follow a variety of distributions, some of which have no recognizable or nice form.

The goal of this particular portion of the statistical analysis is to get a feel for how the variables are distributed because 
there are hyperparameters such as class weighting that can be very beneficial depending on the results of this investigation.

First, to demonstrate how imbalanced some of the data features are, let's look at the percentage of data values that the mode of each feature represents.

# Fraction of rows in each feature that are equal to that feature's (first) mode.
# Dividing loan_data.mode() by the row count would scale the mode *values*
# themselves instead of measuring how often they occur.
loan_data.apply(lambda column: (column == column.mode().iloc[0]).mean())

Highly imbalanced data and highly correlated data require special care and consideration, as they both seem to invite improper manipulation. An example would be to modify the data by pruning features with Pearson correlation above a certain threshold. For example, there are a number of pairs of features with Pearson correlation scores greater than 0.999 for a specific and relatively obvious reason. Specifically, some features are essentially identical; an example being the funded amount of a loan and the funded amount of a loan from investors. If investors represent the overwhelming majority of loan funding then these features are nearly identical, which seems to be the case upon inspection. The best course of action seems to be to leave the features untouched and to simply account for these properties with techniques such as regularization and balanced class weighting.

In the same vein, let's look at the maximal pairwise correlations between features using the function written above.

# Rank each feature's maximum pairwise correlation from highest to lowest.
# The [::2] slice keeps every other row of the sorted result.
# NOTE(review): presumably this is meant to show each symmetric pair
# (a, b) / (b, a) only once; that holds only if both rows of a pair are present
# and end up adjacent after sorting -- confirm.
max_pairwise_correlations(loan_data).sort_values(by='data',ascending=False)[::2]

# Sampling distribution of summary statistic (func) replicates produced by permutation replicates
# (sampling without replacement)
def permutation_replicates(data1, data2, func, size=10000):
    """Build the permutation sampling distribution of a two-sample statistic.

    `size` times, the pooled observations are reshuffled and split back into
    two samples by permutation_sample, and `func(sample_1, sample_2)` is
    evaluated on the result. Returns a float array holding the `size`
    replicate values.
    """
    statistic_values = (
        func(*permutation_sample(data1, data2)) for _ in range(size)
    )
    return np.fromiter(statistic_values, dtype=float, count=size)

def permutation_sample(data1, data2):
    """Shuffle the pooled data and split it back into two samples.

    The first returned array has len(data1) elements and the second holds the
    remaining len(data2) elements of the randomly permuted pool.
    """
    n_first = len(data1)
    shuffled_pool = np.random.permutation(np.concatenate((data1, data2)))
    return shuffled_pool[:n_first], shuffled_pool[n_first:]

def diff_of_mean(data1, data2):
    """Return mean(data1) minus mean(data2)."""
    mean_first = np.mean(data1)
    mean_second = np.mean(data2)
    return mean_first - mean_second

# Sampling distribution of summary statistic (func) replicates produced by bootstrap sampling
# (sampling with replacement)
def bootstrap_replicates(data, func, size=10000):
    """Build the bootstrap sampling distribution of a one-sample statistic.

    `size` times, len(data) observations are drawn from `data` with
    replacement and `func` is evaluated on the resample. Returns a float
    array holding the `size` replicate values.
    """
    n_obs = len(data)
    statistic_values = (
        func(np.random.choice(data, size=n_obs)) for _ in range(size)
    )
    return np.fromiter(statistic_values, dtype=float, count=size)

def difference_of_mean_p_value(data1, data2, size=1000):
    """Permutation test for the difference of means of two samples.

    Computes the observed mean(data1) - mean(data2), generates `size`
    permutation replicates of that statistic, and reports the fraction of
    replicates that are at least as large as the observed value (an
    upper-tail p-value). The observed difference and the p-value are printed.

    Returns
    -------
    tuple
        (p-value, array of permutation replicates).
    """
    observed = diff_of_mean(data1, data2)
    print('The empirical difference of means is {:0.2f}'.format(observed))

    # Replicates of the difference of means under random relabelling.
    replicates = permutation_replicates(data1, data2, diff_of_mean, size=size)

    at_least_observed = replicates >= observed
    p = at_least_observed.sum() / len(replicates)

    print('difference of means = ${:0.2f}'.format(observed))
    print('p-value =', p)
    return p, replicates

# Import the cleaned loan data and the accompanying categorical data from CSV
# files located in the current working directory (the same files are also
# read near the top of this notebook).
# index_col=False tells pandas not to use the first column as the row index.
loan_data = pd.read_csv('loan_data.csv', index_col=False)
categorical_data = pd.read_csv('categorical_data.csv', index_col=False)

Need to account for time series data, either by eliminating it from the dataset to avoid snooping or by modelling it explicitly. 
There aren't variables which have day-to-day interactions, so perhaps there is enough time between observations for the data to be uncorrelated. 

Splitting into numerical data for linear regression and object data for classification. The reason for this is to 1. reduce the number of time dependent variables in either set (is this sample bias?) 
What about the variables that aren't explicitly time variables but are implicitly dependent on time? For instance, in order to have a recovery amount, the loan must have been charged off, which means it is likely past the maturity date of the loan, which means it contains time dependent data. 



# For every column, the fraction of rows whose value equals that column's
# (first) mode -- the same expression used inside imbalance_thresholding.
# NOTE(review): `df` is not defined at module level in the visible code;
# confirm which DataFrame this scratch cell is meant to inspect.
[len(df[df[x] == df[x].mode().values[0]])/len(df) for x in df.columns]

