In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support, plot_precision_recall_curve
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, Binarizer, StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import scale, StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import average_precision_score, cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import itertools
%matplotlib inline

pca
dump correlations
feature selection
onehot encoding

In [201]:
# Function that combines different classification metrics to avoid repeated large blocks of code.
# Produces the confusion matrix, classification report (precision, recall, f1-score,..)
# ROC-AUC and ROC curve
# Precision-recall curve

def my_score(clf, X_test, y_test):
    '''Score a fitted binary classifier on a test split with ROC AUC.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``
          Must have been fit on data with the same columns as X_test.

    X_test : ndarray or DataFrame (n_samples, n_features)
             Feature data to score.

    y_test : ndarray (n_samples, )
             True labels for X_test.

    Returns
    -------
    The value of ``roc_auc_score`` computed from the second column of
    ``clf.predict_proba(X_test)``.
    '''
    probabilities = clf.predict_proba(X_test)
    return roc_auc_score(y_test, probabilities[:, 1])


def classifier_analysis(clf, X_test, y_test):
    ''' Predict and measure quality of a fitted binary classifier with a variety of metrics.

    Prints the confusion-matrix counts and the classification report, and draws
    the confusion matrix, the ROC curve and the precision-recall curve.

    Parameters
    ----------

    clf : scikit-learn estimator which has been fit to data with same number of columns as X_test.
          Must expose ``predict`` and ``predict_proba``.

    X_test : ndarray or DataFrame (n_samples, n_features)
             Feature data to test. n_features represents the number of features
             present in the data used to train the estimator clf

    y_test : ndarray (n_samples, )
             Target data to test. Labels are expected to be 0 / 1.

    Returns
    -------
    roc_auc : ROC AUC computed from the second column of ``predict_proba``.
    '''
    y_predict = clf.predict(X_test)
    y_predict_proba = clf.predict_proba(X_test)[:, 1]
    estimator_name = clf.__class__.__name__

    cm = confusion_matrix(y_test, y_predict, labels=[0, 1])
    print('tn, fp, fn, tp', cm.ravel())
    # Keyword arguments work on both old and new scikit-learn releases
    # (display_labels is keyword-only in recent versions).
    _ = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1]).plot()

    print(classification_report(y_test, y_predict))

    roc_auc = roc_auc_score(y_test, y_predict_proba)
    fpr, tpr, _roc_thresholds = roc_curve(y_test, y_predict_proba)

    plt.figure()
    plt.plot(fpr, tpr, label=(estimator_name + '(area = %0.2f)' % roc_auc))
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

    average_precision = average_precision_score(y_test, y_predict_proba)
    precision, recall, _pr_thresholds = precision_recall_curve(y_test, y_predict_proba)

    # Plot the curve from the values computed above instead of calling the
    # deprecated plot_precision_recall_curve helper (which also recomputed them).
    plt.figure()
    plt.step(recall, precision, where='post',
             label='%s (AP = %0.2f)' % (estimator_name, average_precision))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve: '
              'AP={0:0.2f}'.format(average_precision))
    plt.legend(loc="lower left")
    plt.show()

    return roc_auc


def encode_dates_and_missing(df, n_bins=3, strategy='uniform'):
    '''One-hot encode date-like columns that may also contain the string 'Missing'.

    For every column of ``df`` the non-'Missing' values are parsed as dates,
    their years are binned with KBinsDiscretizer (one-hot output) and the
    'Missing' rows get their own indicator column.

    Parameters
    ----------
    df : DataFrame whose columns hold date-like values and/or the string 'Missing'.
    n_bins : number of year bins per column (default 3, the previous hard-coded value).
    strategy : KBinsDiscretizer binning strategy (default 'uniform', as before).

    Returns
    -------
    DataFrame of 0/1 integers indexed like the rows of ``df``, with columns named
    ``<column>_<bin number>`` and ``<column>_Missing``. A frame without columns
    yields an empty frame carrying ``df``'s index.
    '''
    if df.shape[1] == 0:
        # Nothing to encode; previously this path failed on an unbound variable.
        return pd.DataFrame(index=df.index)

    encoded_frames = []
    for seriesname in df.columns:
        series = df[seriesname]
        series_missing = pd.get_dummies(series[series == 'Missing'])
        series_notmissing = series[series != 'Missing']

        # Bin on the year only: the discretizer needs a numeric input.
        kbd = KBinsDiscretizer(n_bins=n_bins, strategy=strategy)
        years = pd.to_datetime(series_notmissing).dt.year
        binned = kbd.fit_transform(years.values.reshape(-1, 1)).toarray()

        encoded_series = pd.DataFrame(binned, index=series_notmissing.index).astype(int)
        # The outer join introduces NaN where a row is absent from one side;
        # fill with 0 and cast so every indicator column is an integer 0/1.
        encoded_series = (encoded_series
                          .join(series_missing, how='outer')
                          .fillna(value=0)
                          .astype(int))
        encoded_series.columns = [seriesname+'_'+str(col) for col in encoded_series.columns]
        encoded_frames.append(encoded_series)

    # Single concat instead of growing the frame inside the loop.
    return pd.concat(encoded_frames, axis=1)



In [205]:
def my_feature_transformer(X, num_features, cat_features, num_transformer=None):
    '''Build an unfitted ColumnTransformer for numerical and categorical features.

    Parameters
    ----------
    X : DataFrame used only to enumerate the known values of each categorical column.
    num_features : columns handed to ``num_transformer``.
    cat_features : columns that are one-hot encoded (first category dropped).
    num_transformer : transformer for the numerical columns. When omitted, a new
                      QuantileTransformer is created on every call, so separate
                      calls never share one fitted instance.

    Returns
    -------
    ColumnTransformer with a 'num' step and a 'cat' step.
    '''
    if num_transformer is None:
        num_transformer = QuantileTransformer()

    uniq_categories = []
    for col in cat_features:
        categories = list(np.sort(X[col].unique()))
        # Reserve the two placeholder categories, but do not list one twice when the
        # data already contains it (duplicate categories yield duplicate columns
        # and are rejected by recent scikit-learn releases).
        categories += [label for label in ('Missing', 'Unknown') if label not in categories]
        uniq_categories.append(categories)

    cat_transformer = OneHotEncoder(categories=uniq_categories, drop='first')
    col_transformer = ColumnTransformer(transformers=[('num', num_transformer, num_features),
                                                      ('cat', cat_transformer, cat_features)])
    return col_transformer

def scale_features(col_transformer_, X_train, X_test):
    '''Fit the transformer on the training split only, then transform both splits.

    The transformer is fitted in place. Returns ``(X_train, X_test)`` after
    transformation.
    '''
    col_transformer_.fit(X_train)
    transformed_train = col_transformer_.transform(X_train)
    transformed_test = col_transformer_.transform(X_test)
    return transformed_train, transformed_test


def _fit_and_score_(estimator, col_transformer_, X_traintest, y_traintest, train, test):
    '''Fit one fold and score it in the same worker, returning only the score.'''
    model_, X_test, y_test = fit_model_(estimator, col_transformer_, X_traintest, y_traintest, train, test)
    return my_score(model_, X_test, y_test)


def my_cross_validate(estimator, col_transformer_, X_traintest, y_traintest, train_test_iterable, param_grid, n_jobs=-2):
    '''Grid search over ``param_grid`` using user-supplied (train, test) splits.

    Parameters
    ----------
    estimator : estimator class (not an instance); called as ``estimator(**params)``.
    col_transformer_ : transformer passed to ``fit_model_`` for every fold.
    X_traintest, y_traintest : label-indexed features and target covering all folds.
    train_test_iterable : iterable of ``(train_labels, test_labels)`` pairs.
    param_grid : dict understood by ``param_grid_iterable``; its keys must be
                 valid constructor arguments of ``estimator``.
    n_jobs : number of joblib workers (now honoured; it was previously ignored).

    Returns
    -------
    A new, UNFITTED ``estimator`` instance built with the parameter set that has
    the highest mean ``my_score`` across the folds. Callers must fit it before use.

    Raises
    ------
    ValueError if ``train_test_iterable`` yields no splits.
    '''
    # Materialise the splits: a generator would be exhausted after the first parameter set.
    splits = list(train_test_iterable)
    if not splits:
        raise ValueError('train_test_iterable must contain at least one (train, test) split')

    param_grid_list = param_grid_iterable(param_grid)
    mean_scores = []
    with Parallel(n_jobs=n_jobs) as parallel:
        for params_ in param_grid_list:
            # Fit and score inside one task so fitted models and transformed test
            # matrices are not sent back to the parent process.
            scores = parallel(delayed(_fit_and_score_)(estimator(**params_), col_transformer_,
                                                       X_traintest, y_traintest, train, test)
                              for (train, test) in splits)
            mean_scores.append(np.mean(list(scores)))

    best_params_ = param_grid_list[int(np.argmax(np.array(mean_scores)))]
    return estimator(**best_params_)


def fit_model_(estimator, col_transformer_, X_traintest, y_traintest, train, test):
    '''Fit ``estimator`` on one fold.

    Rows are selected by label with ``.loc``, the features are transformed via
    ``scale_features`` (fitted on the training rows only) and the estimator is
    fitted in place. Returns ``(estimator, transformed X_test, y_test)``.
    '''
    train_features = X_traintest.loc[train, :]
    test_features = X_traintest.loc[test, :]
    train_target = np.ravel(y_traintest.loc[train].values)
    test_target = np.ravel(y_traintest.loc[test].values)

    train_features, test_features = scale_features(col_transformer_, train_features, test_features)

    estimator.fit(train_features, train_target)
    return (estimator, test_features, test_target)

def param_grid_iterable(params):
    '''Expand a parameter grid into a list of keyword dictionaries.

    Values whose exact type is list, dict or ndarray are iterated over; any
    other value is treated as a single fixed setting. Keys are processed in
    sorted order, so the last sorted key varies fastest.
    '''
    expandable_types = (list, dict, np.ndarray)
    keys = sorted(params)

    value_axes = []
    for key in keys:
        value = params[key]
        value_axes.append(value if type(value) in expandable_types else [value])

    return [dict(zip(keys, combo)) for combo in itertools.product(*value_axes)]

### Random Forest and Logistic Regression modeling of loan status.

In [143]:
# Load the loan records; index_col=False tells pandas not to use the first column as the index.
loan_data0 = pd.read_csv('classification_loan_data.csv', index_col=False)

Need to take into consideration the time dependence element of this problem. Because we want to predict whether or not
to issue a loan, need to take into consideration the time series nature of the issuance date. The main consideration is during the cross-validation process later but reorder the data now as it will later be transformed by one-hot encoding. Specifically, it will be stored in a sparse matrix which is harder to manipulate.

In [144]:
# This step will be deprecated after newest data cleaning run.
# Stable sort by issue date, and keep issued_datetime in the same (sorted) row order as
# loan_data0 so later positional masks built from the dates line up with the frame's index.
issued_datetime = pd.to_datetime(loan_data0.issue_d).sort_values(kind='mergesort')

loan_data0 = loan_data0.loc[issued_datetime.index, :]

Have the cross validation folds, now need to create procedure which correctly preprocesses them before testing. Training component of folds are cumulative over time; always want to use as much information as possible.

There are too many distinct values for the dates of the earliest known credit lines and for the zip codes; too many, at least, for one-hot encoding to be practical. Therefore, group zip_code by its first two digits, as this retains the geographical information, and group the earliest credit line by applying KBinsDiscretizer to the year; it needs a numerical value, so the entire date cannot be used without modification. The reason for using KBinsDiscretizer is that the distribution is not uniform over time, and it provides an unbiased selection/grouping method.

In [145]:
# zip_code is dropped here rather than grouped; errors='ignore' keeps the cell safe to re-run
# after the column has already been removed.
loan_data0 = loan_data0.drop(columns='zip_code', errors='ignore')

In [146]:
# Sanity check: (rows, columns) of the frame after dropping zip_code.
loan_data0.shape

(599584, 39)

In [147]:
# loan_data is another name for the same frame (no copy is made); the trailing column slice is disabled.
loan_data = loan_data0#.iloc[:, 12:]

In [140]:
# One-hot encode the binned year of the earliest credit line (plus a 'Missing' indicator).
encoded_dates = encode_dates_and_missing(loan_data[['earliest_cr_line']])

In [38]:
# Swap the raw earliest_cr_line column for its encoded indicator columns.
loan_data_without_dates = loan_data.drop(columns='earliest_cr_line')
loan_data = pd.concat([loan_data_without_dates, encoded_dates], axis=1)

In [189]:
# Separate the integer target from the feature columns.
y = loan_data['loan_status'].astype(int)
X = loan_data.drop(columns='loan_status')

In [9]:
# The former first statement called loan_data.drop(...) without keeping the result, so it had no
# effect, and it raises KeyError once earliest_cr_line has been replaced by its encoded columns.
y = loan_data.loan_status.astype(int)
X = loan_data.drop(columns=['loan_status'])

In [82]:
# Keep the target out of the feature matrix: using loan_data unchanged as X leaks loan_status
# into the features.
y = loan_data0.loan_status.astype(int)
X = loan_data.drop(columns=['loan_status'])

In [135]:
# Keep only numeric columns.
# NOTE(review): this discards every object/category column, so cat_features derived from X later
# would be empty if this cell runs — confirm it is still intended.
X = X.select_dtypes(include='number')

In [None]:
# Sanity check: (rows, columns) of the feature matrix.
X.shape

In [190]:
# Keep the columns from position 27 onward.
# NOTE(review): hard-coded positional offset, and re-running this cell slices X again —
# confirm which columns are meant to be kept.
X = X.iloc[:, 27:]

Create a "hold-out" set of data that will be used for final predictions and analysis after all cross-validation and
model learning has been accomplished. The loan issuance dates are aggregated by month, but from the metadata we know that the data is reported *quarterly*. Using this as motivation, the hold-out data will be the most recent quarter. Because the number of loans has grown over time, this one quarter represents nearly $1 / 7$ of all loan data for loans that have either been fully paid or charged off.

In [191]:
# Data goes from second quarter of 2007 to fourth quarter of 2015; the number of samples is skewed towards later dates.
# The data is reported quarterly; this should be represented in the cross validation/model selection process.
# Look the dates up by loan_data's row labels so position i of pind describes row i of loan_data;
# the fold masks derived from pind are applied positionally to loan_data.index.
pind = pd.PeriodIndex(issued_datetime.loc[loan_data.index], freq='Q-DEC')

KBinsDiscretizer needs numerical input, so represent each quarter as year.quarter (number.decimal). This transformation is not applied to the training data; it is only used to define the time-ordered cross-validation folds.

In [192]:
# Year plus a quarter offset: Q1 -> +0.00, Q2 -> +0.25, Q3 -> +0.50, Q4 -> +0.75
numerical_quarters = pind.year + 0.25 * (pind.quarter - 1)

In [193]:
# Split the timeline into nb bins; each column of bin_masks is the one-hot indicator of one bin.
nb = 6
kbd = KBinsDiscretizer(n_bins=nb)
quarter_column = numerical_quarters.values.reshape(-1, 1)
bin_masks = kbd.fit_transform(quarter_column).toarray()

Size of the cross-validation folds

In [194]:
# Number of rows that fall in each bin.
bin_masks.sum(axis=0)

array([ 78398., 114494.,  77607., 105283.,  63538., 160264.])

Scikit-learn wants iterable containing (train_indices, test_indices)

In [195]:
# Expanding-window folds: fold k trains on bins 0..k and tests on bin k+1. The last bin is never
# used as a test fold here; it is reserved as the hold-out set.
fold_masks = bin_masks.astype(bool)
n_folds = nb - 2
train_indices = [loan_data.index[fold_masks[:, :k + 1].any(axis=1)] for k in range(n_folds)]
test_indices = [loan_data.index[fold_masks[:, k + 1]] for k in range(n_folds)]
train_test_iterable = list(zip(train_indices, test_indices))
traintest_indices = loan_data.index[fold_masks[:, :-1].any(axis=1)]
holdout_indices = loan_data.index[fold_masks[:, -1]]

In [196]:
# Select the cross-validation rows and the hold-out rows by label.
X_traintest, y_traintest = X.loc[traintest_indices, :], y.loc[traintest_indices]
X_holdout, y_holdout = X.loc[holdout_indices, :], y.loc[holdout_indices]

Special considerations need to be made because the problem is time dependent. The cross-validation folds could normally be produced with TimeSeriesSplit(), but the distribution of the continuous numerical variables begs for rescaling. Therein lies the issue, however: the renormalization must be fitted using only the training portion of each fold, so the preprocessing has to be repeated for every fold rather than applied once to all of the data.

1. Bin categorical data that has too many unique values.
2. Convert categorical data to discrete numerical data by means of OneHotEncoder.

The main goal is to reduce capital loss not maximize profits. Therefore, we value prediction of when a loan will be charged off more than fully paid. We can account for this by changing the class weights in the classification process. This model will reject loans that would have been fully paid in order to avoid loans that will become charged off. In other words, the goal is to maximize the number of true positives, where "positive" in this case is equivalent to a loan being charged off.

In [197]:
# Columns with object/category dtype are treated as categorical; everything else as numerical.
categorical_dtypes = ['object', 'category']
num_features = X.select_dtypes(exclude=categorical_dtypes).columns
cat_features = X.select_dtypes(include=categorical_dtypes).columns

In [203]:
# Use the second fold for a quick single-split experiment.
train0, test0 = train_test_iterable[1]
X_train, y_train = X.loc[train0, :], y.loc[train0]
X_test, y_test = X.loc[test0, :], y.loc[test0]

In [None]:
# Baseline random forest on a single fold. A fixed random_state makes the reported metrics
# reproducible across runs.
rfc0 = RandomForestClassifier(n_estimators=100, random_state=0)
col_transformer_rfc = my_feature_transformer(X, num_features, cat_features, num_transformer=StandardScaler())

# Fit the preprocessing on the training rows only, then apply it to both splits.
X_train_transform = col_transformer_rfc.fit_transform(X_train)
X_test_transform = col_transformer_rfc.transform(X_test)

_ = rfc0.fit(X_train_transform, y_train)
roc0 = classifier_analysis(rfc0, X_test_transform, y_test)

In [None]:
# Baseline logistic regression on the same single fold.
logreg = LogisticRegression(max_iter=1000, tol=1e-10)
col_transformer_logreg = my_feature_transformer(X, num_features, cat_features, num_transformer=StandardScaler())

# scale_features fits the transformer on the training rows and transforms both splits.
X_train_transform, X_test_transform = scale_features(col_transformer_logreg, X_train, X_test)

_ = logreg.fit(X_train_transform, y_train)

logreg_roc_auc = classifier_analysis(logreg, X_test_transform, y_test)

In [None]:
# Cross-validate a single random-forest configuration over the time-ordered folds.
rfc_param_grid = dict(n_estimators=100, class_weight=None)
col_transformer_rfc = my_feature_transformer(X, num_features, cat_features,
                                             num_transformer=QuantileTransformer())

rfc_model = my_cross_validate(
    RandomForestClassifier,
    col_transformer_rfc,
    X_traintest,
    y_traintest,
    train_test_iterable,
    rfc_param_grid,
)

In [None]:
# my_cross_validate returns an unfitted estimator carrying the winning parameters, so fit it on all
# cross-validation rows (preprocessing fitted on those rows only) before scoring the hold-out set.
X_traintest_rfc, X_holdout_rfc = scale_features(col_transformer_rfc, X_traintest, X_holdout)
_ = rfc_model.fit(X_traintest_rfc, y_traintest)
rfc_roc_auc = classifier_analysis(rfc_model, X_holdout_rfc, y_holdout)

In [41]:
# Grid search over forest size and class weighting on the time-ordered folds.
rfc_param_grid = dict(n_estimators=[10, 25, 100], class_weight=['balanced', None])
col_transformer_rfc = my_feature_transformer(X, num_features, cat_features,
                                             num_transformer=QuantileTransformer())

rfc_model = my_cross_validate(
    RandomForestClassifier,
    col_transformer_rfc,
    X_traintest,
    y_traintest,
    train_test_iterable,
    rfc_param_grid,
)

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

In [None]:
# Grid search over regularisation strength and class weighting for logistic regression.
logreg_param_grid = dict(max_iter=300, C=[0.1, 1, 2], class_weight=['balanced', None])
col_transformer_logreg = my_feature_transformer(X, num_features, cat_features,
                                                num_transformer=QuantileTransformer())

logreg_model = my_cross_validate(
    LogisticRegression,
    col_transformer_logreg,
    X_traintest,
    y_traintest,
    train_test_iterable,
    logreg_param_grid,
)

In [None]:
# my_cross_validate returns an unfitted estimator carrying the winning parameters, so fit it on all
# cross-validation rows (preprocessing fitted on those rows only) before scoring the hold-out set.
X_traintest_rfc, X_holdout_rfc = scale_features(col_transformer_rfc, X_traintest, X_holdout)
_ = rfc_model.fit(X_traintest_rfc, y_traintest)
rfc_roc_auc = classifier_analysis(rfc_model, X_holdout_rfc, y_holdout)

In [None]:
# my_cross_validate returns an unfitted estimator carrying the winning parameters, so fit it on all
# cross-validation rows (preprocessing fitted on those rows only) before scoring the hold-out set.
X_traintest_logreg, X_holdout_logreg = scale_features(col_transformer_logreg, X_traintest, X_holdout)
_ = logreg_model.fit(X_traintest_logreg, y_traintest)
logreg_roc_auc = classifier_analysis(logreg_model, X_holdout_logreg, y_holdout)

Need to figure out why it's always predicting to accept loans. First check correlations. 

In [None]:
def max_pairwise_correlations(df):
    '''Return, for every numeric feature, its largest correlation with another feature.

    Parameters
    ----------
    df : DataFrame; only numeric and boolean columns are correlated.

    Returns
    -------
    DataFrame with a single column 'data', indexed by (feature, partner) pairs.
    For each feature the row(s) holding its maximum (signed) correlation with a
    different feature are kept; ties produce several rows.
    '''
    # Correlate numeric/boolean columns only, so string columns cannot break corr().
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    correlations_df = numeric_df.corr().unstack().to_frame(name='data')

    # Drop the auto-correlations by label rather than by testing for the value 1:
    # a value test also discards perfectly correlated distinct pairs and can miss
    # a diagonal entry that is off by floating-point round-off.
    feature = correlations_df.index.get_level_values(0)
    partner = correlations_df.index.get_level_values(1)
    correlations_no_auto = correlations_df[feature != partner]

    # Keep the rows that equal the per-feature maximum.
    per_feature_max = correlations_no_auto.groupby(level=[0])['data'].transform('max')
    maxcvalues = correlations_no_auto[correlations_no_auto['data'] == per_feature_max]
    return maxcvalues

In [None]:
# No variable named col_transformer exists at notebook scope (it is only a local inside
# my_feature_transformer), so build a dedicated transformer for the correlation check.
col_transformer_corr = my_feature_transformer(X, num_features, cat_features, num_transformer=StandardScaler())
Xcorr = col_transformer_corr.fit_transform(X)

In [None]:
# Correlation matrix of the transformed features, with rows/columns containing NaN removed.
# NOTE(review): from_spmatrix assumes Xcorr is a scipy sparse matrix — confirm the transformer
# returns sparse output for this data. Despite the name, no maximum is taken here.
maxcorrelations = pd.DataFrame.sparse.from_spmatrix(Xcorr).corr().dropna(axis=1).dropna(axis=0)

In [None]:
# Strongest pairwise correlation for each column of the untransformed feature matrix X.
maxcorr = max_pairwise_correlations(X)

In [None]:
# Show the feature pairs ordered from highest to lowest correlation.
maxcorr.sort_values(by='data', ascending=False)

Use all training/testing data to help scale the holdout data.

Can't actually use the entire training and testing data to scale/encode because it results in more features than the model has.

The one hot encoding can throw an error if there are categories in the test set not in the train set; as the categories are
quantities known before hand it should be ok to pass to the encoder; or does this contaminate the test data? The issue is that the lack of a category in the training data prevents accurate prediction as by definition there is no training done on those values.

Is it ok to "look at" the test data? The issue arises when there are categories that are in the testing set that are not
in the training set. If the time-series cross validation folds are not cumulative then this becomes even more of a problem.
If kept as a general procedure, that is, relabel any categories unique to the testing set as "Unknown", then perhaps it can work. I.e. relabel values in the testing set that are "unknown" to the training set; this dummy variable is a flag to the algorithm that these values are "special". From Mike; if we know the categories before hand, use that set of unique values and then add two dummy columns: "Missing" and "Unknown/New"

In [None]:
# ranfor_param_grid={'n_estimators':[5,10,15]}
# logreg_param_grid = {'tol':[1e-1, 1e-2, 1e-4]}

# ranfor_cv = RandomForestClassifier(class_weight='balanced')
# ranfor_model_ = GridSearchCV(ranfor_cv, ranfor_param_grid, cv=TimeSeriesSplit(n_splits=3))
# ranfor_model_.fit(X_train, y_train)