In [1]:
import pandas as pd
import os
from multiprocess import Pool
import numpy as np
import math
from sklearn.base import BaseEstimator, TransformerMixin
#!pip install multiprocess

In [2]:
def create_combined_dataset(path_to_raw_data_folder):
    
    data = [pd.read_csv(f'{path_to_raw_data_folder}/{i}', low_memory = False, skiprows=1) for i in os.listdir(path_to_raw_data_folder) if i.endswith('.zip')]
    return pd.concat(data)
    
    
def read_one(path):
    return pd.read_csv(path, low_memory = False, skiprows=1)

def create_combined_dataset_parallel(path_to_raw_data_folder):
    
    files = [f'{path_to_raw_data_folder}/{i}'for i in os.listdir(path_to_raw_data_folder)]
    
    p = Pool(25)
    data = p.map(read_one, files)
    p.close()

    return data

def build_dataset():

    features = ['loan_amnt'
                , 'term'
                , 'int_rate'
                , 'installment'
                , 'grade'
                , 'sub_grade'
                , 'emp_title'
                , 'emp_length'
                , 'home_ownership'
                , 'annual_inc'
                , 'verification_status'
                , 'purpose'
                , 'title'
                , 'addr_state'
                , 'dti'
                , 'delinq_2yrs'
                , 'earliest_cr_line'
                , 'fico_range_low'
                , 'fico_range_high'
                , 'inq_last_6mths'
                , 'mths_since_last_delinq'
                , 'mths_since_last_record'
                , 'open_acc'
                , 'pub_rec'
                , 'revol_bal'
                , 'revol_util'
                , 'total_acc'
                , 'initial_list_status'
                , 'loan_status'
               ]
    data = create_combined_dataset ('C:/Users/mamat/Documents/lending_club_data10_08_2020')
   # df = pd.concat(data)
    status_list = ['Fully Paid', 'Charged Off', 'Late (31-120 days)', 'Default']
    
    data = data[data['loan_status'].isin(status_list)]
    
#    df['loan_status'] = df['loan_status'].apply(make_target)
    
    return data[features]

def make_target(val):
    if val in ['Charged Off', 'Late (31-120 days)', 'Default']:
        return 1
    else:
        return 0


In [3]:
df = build_dataset()

In [4]:
df

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,loan_status
0,5000.0,36 months,10.65%,162.87,B,B2,,10+ years,RENT,24000.0,...,1.0,,,3.0,0.0,13648.0,83.7%,9.0,f,Fully Paid
1,2500.0,60 months,15.27%,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,...,5.0,,,3.0,0.0,1687.0,9.4%,4.0,f,Charged Off
2,2400.0,36 months,15.96%,84.33,C,C5,,10+ years,RENT,12252.0,...,2.0,,,2.0,0.0,2956.0,98.5%,10.0,f,Fully Paid
3,10000.0,36 months,13.49%,339.31,C,C1,AIR RESOURCES BOARD,10+ years,RENT,49200.0,...,1.0,35.0,,10.0,0.0,5598.0,21%,37.0,f,Fully Paid
4,3000.0,60 months,12.69%,67.79,B,B5,University Medical Group,1 year,RENT,80000.0,...,0.0,38.0,,15.0,0.0,27783.0,53.9%,38.0,f,Fully Paid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13170,22000.0,60 months,11.02%,478.56,B,B2,Laboratory Tech,8 years,MORTGAGE,73000.0,...,0.0,,,14.0,0.0,623.0,1.6%,34.0,w,Fully Paid
13240,10000.0,36 months,16.95%,356.28,C,C4,Firm Administrator,10+ years,RENT,73500.0,...,1.0,33.0,,9.0,0.0,11862.0,37.9%,36.0,w,Fully Paid
13244,17000.0,60 months,16.12%,414.50,C,C3,Sales,2 years,MORTGAGE,155000.0,...,1.0,,,23.0,0.0,55432.0,83%,51.0,w,Charged Off
13255,3000.0,36 months,15.24%,104.35,C,C2,IT manager,< 1 year,MORTGAGE,75000.0,...,3.0,56.0,,9.0,0.0,2676.0,26.2%,16.0,w,Fully Paid


In [5]:
# Custom Transforms

class fix_percent(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass 
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y=None):
        _X = X.copy()
        
        for i in X.columns:
            _X[i] = _X[i].str.replace('%', '').astype(float)/100
        return _X
    

In [6]:
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Features that we want to leave untransformed:
# Only numeric features need apply:
standardize_and_impute_only = [
'loan_amnt'
, 'annual_inc'
, 'delinq_2yrs'
, 'inq_last_6mths'
, 'open_acc'
]


# maybe we will trying some polynomial transofmrations
polynomial_features = [
'installment'
, 'dti'
]

# how about some discretizations?
discretization = [
'fico_range_low'
, 'fico_range_high'
, 'mths_since_last_delinq'
, 'mths_since_last_record'
, 'pub_rec'
, 'revol_bal'
, 'total_acc'
]

# let's construct all of our transformers
standard_scaler = StandardScaler()
missing_indicator = MissingIndicator(features="all")
simple_imputer = SimpleImputer(strategy='median')
polynomial_featurizer = PolynomialFeatures(2)
discretizer = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='uniform')
fix_percent_transformer = fix_percent()

# and make the pipelines
standardize_and_impute_pipeline_steps = [('standardization', standard_scaler), ('imputer', simple_imputer)]
standardize_and_impute_pipeline = Pipeline(standardize_and_impute_pipeline_steps)

polynomial_pipeline_steps = standardize_and_impute_pipeline_steps + [('polynomial', polynomial_featurizer)]
polynomial_pipeline = Pipeline(polynomial_pipeline_steps)

discretize_steps = [('imputer', simple_imputer), ('discretize', discretizer)]
discretize_pipeline = Pipeline(discretize_steps)

interest_rate_steps = [('fix_int_rate', fix_percent_transformer)] + standardize_and_impute_pipeline_steps
interest_rate_pipeline = Pipeline(interest_rate_steps)

missing_flag_steps = [('missing_flag', missing_indicator)]
missing_flag_pipeline = Pipeline(missing_flag_steps)

#now we can use a ColumnTransformer to do everything
transform_pipeline = ColumnTransformer([
    ('standardize_and_impute_pipeline', standardize_and_impute_pipeline, standardize_and_impute_only)
     , ('polynomial_pipeline', polynomial_pipeline, polynomial_features)
     , ('discretize_pipeline', discretize_pipeline, discretization)
     , ('interest_rate_pipeline', interest_rate_pipeline, ['int_rate'])
     , ('missing_flag_pipeline', missing_flag_pipeline, standardize_and_impute_only + polynomial_features + discretization)
])


In [7]:
from sklearn.ensemble import RandomForestClassifier

def make_target(val):
    if val in ['Charged Off', 'Late (31-120 days)', 'Default']:
        return 1
    else:
        return 0
    
df['loan_status'] = df['loan_status'].apply(make_target)

small_df = df.sample(n=11000)

X = small_df.drop(['loan_status'], axis = 1)
y = small_df[['loan_status']]

from sklearn import ensemble

rf = ensemble.RandomForestClassifier(n_estimators = 700
                                     , max_features = 20
                                     , random_state = 12345
                                     , max_depth = 4
                                     , min_samples_split = 200
                                     , min_samples_leaf = 1000
                                     , max_leaf_nodes = 100
                                    )

pipeline = Pipeline([('transform', transform_pipeline)
                     , ('rf', rf)
                    ]
                   )

In [8]:
pipeline.fit(X, y)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('standardize_and_impute_pipeline',
                                                  Pipeline(steps=[('standardization',
                                                                   StandardScaler()),
                                                                  ('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['loan_amnt', 'annual_inc',
                                                   'delinq_2yrs',
                                                   'inq_last_6mths',
                                                   'open_acc']),
                                                 ('polynomial_pipeline',
                                                  Pipeline(steps=[('standardization',
                                                                   StandardScaler()),
        

In [9]:
pd.DataFrame(pipeline.predict_proba(X)).sort_values([0])

Unnamed: 0,0,1
1939,0.649815,0.350185
1555,0.649944,0.350056
1389,0.649948,0.350052
6384,0.649973,0.350027
7200,0.650067,0.349933
...,...,...
8780,0.919198,0.080802
1239,0.919211,0.080789
2552,0.919211,0.080789
219,0.919211,0.080789


In [10]:
pd.DataFrame(pipeline.predict(X))

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
10995,0
10996,0
10997,0
10998,0


In [None]:
#Attempt to add accuracy scores

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

forest = RandomForestClassifier()
pipeline.fit(X_train, y_train)
y_predict = pipeline.predict(X_test)
accuracy_score(y_test, y_predict)

#model = RandomForestClassifier()
pipeline.fit(X, y)

y_predict = model.predict(X)

accuracy_score(y_test.values, y_predict)