In [2]:
import sys, os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet, SGDClassifier
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import scale, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

In [3]:
def classifier_analysis(clf, X_test, y_test):
    """Print hold-out regression metrics and cross-validation diagnostics.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict``. If it also exposes cross-validation scores
        (``cv_results_['mean_test_score']`` or the legacy ``grid_scores_``)
        and ``n_features_``, those are printed as well.
    X_test : array-like
        Held-out feature matrix passed to ``clf.predict``.
    y_test : array-like
        True target values for ``X_test``.

    Returns
    -------
    None
        Results are only printed.
    """
    y_predict = clf.predict(X_test)
    mse_ = mean_squared_error(y_test, y_predict)
    evs_ = explained_variance_score(y_test, y_predict)

    # `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2 in
    # favour of `cv_results_`. Look at `cv_results_` first so that releases
    # carrying both attributes do not emit a deprecation warning.
    cv_results = getattr(clf, 'cv_results_', None)
    if cv_results is not None and 'mean_test_score' in cv_results:
        cv_scores = cv_results['mean_test_score']
    else:
        cv_scores = getattr(clf, 'grid_scores_', None)
    if cv_scores is not None:
        print('grid scores', cv_scores)

    # Only feature-selection wrappers such as RFECV carry `n_features_`.
    n_selected = getattr(clf, 'n_features_', None)
    if n_selected is not None:
        print('Number of features selected by RFECV', n_selected)

    print('Mean squared error {:0.2f}'.format(mse_))
    print('Explained variance score {:0.2f}'.format(evs_))
    return None

def model_and_evaluate(X, y, clf=LinearRegression(), CVmethod=RFECV, **kwargs):
    """Split, scale, fit a cross-validated wrapper around ``clf`` and report.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values. A pandas Series, a single-column DataFrame, a NumPy
        array or a plain sequence are all accepted; the values are flattened
        to one dimension before splitting.
    clf : estimator, default ``LinearRegression()``
        Base estimator handed to ``CVmethod`` as its first argument.
    CVmethod : callable, default ``RFECV``
        Called as ``CVmethod(clf, **kwargs)`` to build the object that is fitted.
    **kwargs
        Forwarded unchanged to ``CVmethod``.

    Returns
    -------
    model_ : fitted object returned by ``CVmethod``
    (X_train, X_test, y_train, y_test) : tuple
        The scaled train/test features and the matching targets.

    Notes
    -----
    No ``random_state`` is passed to ``train_test_split``, so the 80/20 split
    (and therefore the printed metrics) differ from run to run.
    """
    # np.asarray lets callers pass non-pandas targets; for pandas input this
    # yields the same flat array as `y.values.ravel()`.
    y_flat = np.asarray(y).ravel()

    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y_flat, test_size=0.20)

    # Scale all of the data, fitting the scaler on the training set only so
    # that no test-set statistics leak into the model.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Wrap the estimator in the cross-validation / feature-selection protocol
    model_ = CVmethod(clf, **kwargs)

    # Select and fit the model (cross-validated recursive feature elimination
    # when CVmethod is RFECV).
    model_.fit(X_train, y_train)

    # Report mean-squared error, explained variance and CV attributes
    classifier_analysis(model_, X_test, y_test)
    return model_, (X_train, X_test, y_train, y_test)

To begin the regression of the amount of money recovered from charged-off loans, only the data for charged-off loans
is imported.

In [4]:
# The raw file still contains missing values. Drop the affected features (columns)
# rather than the samples, because dropping samples leaves us with no data.
loan_data = pd.read_csv('regression_loan_data.csv', index_col=False).dropna(axis='columns')

Our prediction is occurring in the present moment. All of the data corresponding to charged-off loans can be used, and it does not have to be treated as a time series, as we are not trying to predict the future.

In [11]:
# Frequency of each value in the secondary applicant's earliest-credit-line column
loan_data['sec_app_earliest_cr_line'].value_counts()

Missing     257603
Aug-2006        42
Aug-2007        39
Mar-2006        38
May-2006        38
             ...  
Sep-1984         1
Dec-1985         1
May-1976         1
Jun-1968         1
Mar-1984         1
Name: sec_app_earliest_cr_line, Length: 451, dtype: int64

In [16]:
# Number of distinct values in every object/category column, largest first
(
    loan_data
    .select_dtypes(include=['object', 'category'])
    .nunique()
    .sort_values(ascending=False)
)

sec_app_earliest_cr_line     451
issue_d                      137
last_pymnt_d                 132
last_credit_pull_d           114
zip_code                     100
settlement_date               89
debt_settlement_flag_date     83
earliest_cr_line              65
addr_state                    51
sub_grade                     35
hardship_end_date             25
hardship_start_date           25
payment_plan_start_date       24
purpose                       14
emp_length                    12
hardship_reason               10
grade                          7
home_ownership                 6
hardship_loan_status           6
verification_status_joint      4
settlement_status              4
hardship_status                4
verification_status            3
application_type               2
initial_list_status            2
hardship_flag                  2
hardship_type                  2
disbursement_method            2
debt_settlement_flag           2
term                           2
dtype: int

In [13]:
# (rows, columns) of loan_data
loan_data.shape

(261655, 52)

In [14]:
# Shape after dropping every column that contains a missing value,
# for comparison with loan_data.shape
loan_data.dropna(axis=1).shape

(261655, 52)

We need to reduce the number of categories so that we can employ one-hot encoding and still have a manageable amount of data.
From the classification problem we know how to deal with dates, at least when there are no missing values (issue_d, earliest_cr_line).
When there are missing values, convert the non-missing values to numerical, bin them with KBinsDiscretizer, and then include the Missing
values as a separate one-hot encoded category.

In [5]:
# Target is the `recoveries` column; every other column is a predictor
y = loan_data['recoveries']
X = loan_data.drop('recoveries', axis=1)

In [6]:
# Column labels of X partitioned by dtype: object/category columns are treated
# as categorical, everything else as numerical.
cat_features = X.select_dtypes(include=['object', 'category']).columns
num_features = X.select_dtypes(exclude=['object', 'category']).columns

In [7]:
# Coarsen zip codes: remove the 'xx' mask, then drop the last remaining character.
# NOTE: not idempotent -- re-running this cell strips one more character each time.
zip_prefix = (
    loan_data['zip_code']
    .str.replace('xx', '', regex=False)
    .str[:-1]
    .astype('category')
)

# Keep only the year of the earliest credit line, as a categorical feature.
credit_line_year = pd.to_datetime(loan_data['earliest_cr_line']).dt.year.astype('category')

# Assign with `frame[col] = ...` rather than `frame.loc[:, col] = ...`: since
# pandas 2.0 the `.loc[:, col]` form writes into the existing object column in
# place, which silently discards the category dtype.
loan_data['zip_code'] = zip_prefix
loan_data['earliest_cr_line'] = credit_line_year

# X was created from loan_data before this cell, so it is a separate frame;
# mirror the engineered columns there, otherwise they never reach the model.
X['zip_code'] = zip_prefix
X['earliest_cr_line'] = credit_line_year

In [None]:
# Numerical columns are standardised. (Alternative considered: a Pipeline that
# scales and then discretises with KBinsDiscretizer.)
num_transformer = StandardScaler()

# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

col_transformer = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [None]:
# For every object-dtype column of X, collect its sorted unique values so the
# encoder is given an explicit, fixed category list per column.
unique_categories = [np.array(X[col].sort_values().unique().tolist()) for col in X.select_dtypes(include='object').columns]

# NOTE(review): cat_X_train / cat_X_test are not defined in any visible cell --
# presumably the categorical columns of a train/test split; confirm before running.
ohenc = OneHotEncoder(categories=unique_categories)
cat_X_train_sparse = ohenc.fit_transform(cat_X_train)
cat_X_test_sparse = ohenc.transform(cat_X_test)

In [None]:
# Standardise the numerical features: fit on the training portion only, then
# apply the same fitted scaler to the test portion.
# NOTE(review): numerical_X_train / numerical_X_test are not defined in any
# visible cell -- confirm where they come from before running.
sclr = StandardScaler()
numerical_X_train_scaled = sclr.fit_transform(numerical_X_train)
numerical_X_test_scaled = sclr.transform(numerical_X_test)

In [None]:
# `recoveries` is a continuous target, so both candidate models must be
# regressors; SGDClassifier rejects continuous labels, hence SGDRegressor.
# The final step keeps the name 'classifier' so that any parameter grid keyed
# on 'classifier__<param>' continues to work.
# Note: both pipelines reference the same col_transformer object.
ridge_pipeline = Pipeline(steps=[('preprocess', col_transformer), ('classifier', Ridge())])
sgd_pipeline = Pipeline(steps=[('preprocess', col_transformer), ('classifier', SGDRegressor())])

In [None]:
# Scratch example of grid-searching a pipeline step: parameters are addressed
# as '<step name>__<parameter>'. The object is only constructed and displayed;
# it is neither fitted nor stored.
GridSearchCV(
    make_pipeline(StandardScaler(), LogisticRegression()),
    param_grid={'logisticregression__C': [0.1, 10.]},
    cv=2,
    refit=False,
)

In [4]:
# Ordinary least squares as the base estimator for the default RFECV protocol.
# NOTE(review): X_timeless is not defined in any visible cell -- confirm its origin.
clf1 = LinearRegression()
model1, split_data1 = model_and_evaluate(X_timeless, y, clf=clf1)

grid scores [0.01440052 0.02398066 0.22794159 0.22802398 0.25660907 0.29377674
 0.29437993 0.30180923 0.32287637 0.32720689 0.34509231 0.34509229
 0.34508347 0.34519487 0.34532442 0.34540255 0.34545715 0.34549758
 0.34552857 0.34555112 0.3456515  0.34565983 0.34575382 0.34578151
 0.34581119 0.34582871 0.34585572 0.34586926 0.34588983 0.34589936
 0.34591497 0.34592242 0.34593034 0.34594197 0.34594949 0.34596088
 0.34597073 0.34597957 0.34598294 0.34598338 0.34598488 0.34598611
 0.34599391 0.34599786 0.34600069 0.34600931 0.34601406 0.34601631
 0.34602045 0.34601544 0.34601893 0.34602173 0.34602508 0.34603021
 0.34603226 0.34603324 0.34603298 0.34603758 0.34603951 0.34604212
 0.34604295 0.34604204 0.34604337 0.34604333 0.34604314 0.34604262
 0.34604282 0.34604299 0.34604299 0.34604251 0.34604265 0.34604269
 0.34604269 0.34604269]
Number of features selected by RFECV 63
Mean squared error 341250.27
Explained variance score 0.35


In [5]:
# Ridge regression as the base estimator for the default RFECV protocol.
# NOTE(review): X_timeless is not defined in any visible cell -- confirm its origin.
clf2 = Ridge()
model2, split_data2 = model_and_evaluate(X_timeless, y, clf=clf2)

grid scores [0.0146229  0.02429777 0.22938217 0.23508293 0.26522    0.29644831
 0.29942201 0.32595813 0.32595805 0.3481662  0.34816557 0.34821974
 0.34830391 0.3484388  0.34851153 0.34856195 0.34863916 0.3486853
 0.34872824 0.34876667 0.34878449 0.34888069 0.34890857 0.34893048
 0.3489524  0.34896748 0.34897992 0.34899571 0.34901677 0.34902638
 0.3490469  0.34905682 0.34906698 0.34906836 0.34905595 0.34906613
 0.34906439 0.34907778 0.34909091 0.34909601 0.34910428 0.34910763
 0.34911481 0.34911958 0.34912317 0.34912468 0.34913002 0.34913738
 0.3491367  0.34913125 0.34913488 0.34913976 0.34914159 0.34914875
 0.34915397 0.34915567 0.34915734 0.34915834 0.34915943 0.34915843
 0.3491597  0.34915892 0.34915944 0.34915833 0.34915896 0.34915977
 0.3491599  0.34916031 0.34916064 0.34916112 0.34916107 0.34916117
 0.34916117 0.34916117]
Number of features selected by RFECV 74
Mean squared error 343235.27
Explained variance score 0.34


RFECV recursively eliminates features, using cross-validation to determine how many to keep, and then fits the estimator on the selected features.
Essentially, the function model_and_evaluate is equivalent to a pipeline in the following order:
    
    1. train_test_split 
    2. StandardScaler using only training data to fit 
    3. Scale test data 
    4. model = RFECV(classifier) fitting with training data
    5. Predict with model and test data
