In [368]:
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline

import category_encoders as ce

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

import multiprocessing

%matplotlib inline

In [300]:
# Load the loan data
# NOTE(review): the active read below uses an absolute, machine-specific Windows path,
# so this cell only runs where that exact G: drive layout exists. The commented-out
# URL points at a different CSV (file name: lending_club_clean_and_processed.csv).

#df = pd.read_csv("https://raw.githubusercontent.com/kiwidamien/StackedTurtles/master/content/platt_scaling/lending_club_clean_and_processed.csv")

df= pd.read_csv('G:\\Min enhet\\BackUp\\Downloads\\Datasets\\loan-prediction\\train.csv')

# Count the missing values in every column (displayed as the cell output).
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [301]:
# Separate the target from the predictors, then hold out 30% of the rows for testing.
# Loan_ID is dropped together with the target, so it is never used as a predictor.
y = df['Loan_Status']
X = df.drop(columns=['Loan_ID', 'Loan_Status'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Numerical features

In [302]:
# Callable selector that returns the names of all numeric-dtype columns of a frame.
select_numeric_features = make_column_selector(dtype_include=np.number)

# Apply it to the training predictors to see which columns it picks up.
numeric_features = select_numeric_features(X_train)

print(f'N numeric_features: {len(numeric_features)} \n')
print(*numeric_features, sep=', ')

N numeric_features: 5 

ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History


In [332]:
# Represent every missing entry with NumPy's NaN in both splits.
X_train = X_train.fillna(np.nan)
X_test = X_test.fillna(np.nan)


# Numeric branch: median imputation, with indicator columns marking imputed entries.
numeric_pipeline = make_pipeline(
    SimpleImputer(add_indicator=True, strategy='median')
)

In [None]:
# Column labels by kind: non-numeric vs. int64/float64.
categorical_features = X.select_dtypes(exclude=[np.number]).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# Skewness of every numeric column, measured on the training split only.
feature_skew = X_train.select_dtypes(include=[np.number]).skew()
print(feature_skew)

# Columns with |skew| > 0.9 go to the log transform; all other numeric columns are scaled.
is_highly_skewed = (feature_skew.abs() > 0.9).values
log_features = feature_skew.index[is_highly_skewed]
scale_features = feature_skew.index[~is_highly_skewed].tolist()


In [388]:
# Numeric with high skew

def select_skew_features(df, threshold=0.9):
    """Return the numeric columns of ``df`` whose skewness is large in magnitude.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; only columns with a numeric dtype are considered.
    threshold : float, default 0.9
        A column is selected when the absolute value of its skewness is
        strictly greater than this value.

    Returns
    -------
    list
        Labels of the selected columns, in the order they appear in ``df``.
    """
    skewness = df.select_dtypes(include=[np.number]).skew()

    # Use the absolute value so heavy left tails are caught as well as right tails.
    return skewness[skewness.abs() > threshold].index.tolist()

# Highly skewed numeric columns, determined from the training split.
skew_features = select_skew_features(X_train)

print(f'N skew_features: {len(skew_features)} \n')
print(*skew_features, sep=', ')

N skew_features: 5 

ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History


In [396]:
# Put it in a pipeline

log_pipeline = Pipeline(steps=[
    # Fill the missing values first ...
    ('simpleimputer', SimpleImputer(strategy='median')),
    # ... then apply log(1 + x) to the skewed columns.
    ('functiontransformer', FunctionTransformer(func=np.log1p, validate=False)),
])

In [397]:
# Standardize the rest of the numeric columns. StandardScaler (default settings) centres
# each column to zero mean and unit variance; it does not map values into the 0-1 range.
scale_pipeline = make_pipeline( StandardScaler()) # Do scaling on non-skew columns

# Categorical with moderate-to-low cardinality

In [305]:
# OH transformation may not be suitable for features with high cardinality. 
# For the sake of illustration, the limit is set to 10 distinct values per feature.

MAX_OH_CARDINALITY = 10

def select_oh_features(df, max_cardinality=None):
    """Return the categorical columns of ``df`` suitable for one-hot encoding.

    A column qualifies when its dtype is ``object`` or ``category`` and its
    number of distinct non-null values is at most ``max_cardinality``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    max_cardinality : int, optional
        Largest number of distinct values a selected column may have. When
        omitted, the module-level ``MAX_OH_CARDINALITY`` is read at call time,
        so calling the function with a single argument behaves as before.

    Returns
    -------
    list
        Labels of the qualifying columns, in the order they appear in ``df``.
    """
    if max_cardinality is None:
        max_cardinality = MAX_OH_CARDINALITY

    n_unique = df.select_dtypes(['object', 'category']).nunique()
    return n_unique[n_unique <= max_cardinality].index.tolist()

# Categorical columns of the training split that will be one-hot encoded.
oh_features = select_oh_features(X_train)

print(f'N oh_features: {len(oh_features)} \n')
print(*oh_features, sep=', ')

N oh_features: 6 

Gender, Married, Dependents, Education, Self_Employed, Property_Area


In [306]:
oh_pipeline = Pipeline(steps=[
    # Replace missing categories with a constant placeholder before encoding.
    ('simpleimputer', SimpleImputer(strategy='constant')),
    # Categories unseen during fit are ignored at transform time instead of raising.
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
])

In [307]:
# Categorical with high cardinality

def select_hc_features(df, max_cardinality=None):
    """Return the categorical columns of ``df`` that have high cardinality.

    A column qualifies when its dtype is ``object`` or ``category`` and its
    number of distinct non-null values is strictly greater than
    ``max_cardinality``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    max_cardinality : int, optional
        Cardinality above which a column counts as high-cardinality. When
        omitted, the module-level ``MAX_OH_CARDINALITY`` is read at call time,
        so calling the function with a single argument behaves as before.

    Returns
    -------
    list
        Labels of the qualifying columns, in the order they appear in ``df``.
    """
    if max_cardinality is None:
        max_cardinality = MAX_OH_CARDINALITY

    n_unique = df.select_dtypes(['object', 'category']).nunique()
    return n_unique[n_unique > max_cardinality].index.tolist()


# Categorical columns of the training split with too many levels for one-hot encoding.
hc_features = select_hc_features(X_train)

print(f'N hc_features: {len(hc_features)} \n')
print(*hc_features, sep=', ')

N hc_features: 0 




In [308]:
# Branch for high-cardinality categoricals: a single GLMM encoder step.
hc_pipeline = Pipeline(steps=[('glmmencoder', ce.GLMMEncoder())])


# Putting It All Together

In [345]:
# Route each feature group through its own preprocessing branch. The selectors are
# callables, so the columns are resolved when the transformer is fitted. Columns
# not claimed by any branch are dropped.
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric_pipeline', numeric_pipeline, select_numeric_features),
        ('oh_pipeline', oh_pipeline, select_oh_features),
        ('hc_pipeline', hc_pipeline, select_hc_features),
    ],
    remainder='drop',
    n_jobs=multiprocessing.cpu_count(),
)

# Classifier appended after preprocessing to form the full prediction pipeline.
# Alternative that was tried:
#model = GradientBoostingClassifier(learning_rate=0.025, n_estimators=1000, subsample=0.25, max_depth=5,\
#                                 min_samples_split=50, max_features='sqrt')
model = RandomForestClassifier()

clf = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', model),
])

clf.fit(X_train, y_train)
#print("model score: %.3f" % clf.score(X_test, y_test))

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(n_jobs=8,
                                   transformers=[('numeric_pipeline',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000247838CFD88>),
                                                 ('oh_pipeline',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handl

In [346]:
# Accuracy and recall on both splits; pos_label tells recall_score that 'Y' is the
# positive class.
evaluation_splits = [('Train', X_train, y_train), ('Test', X_test, y_test)]

out_df = pd.DataFrame(
    [
        [clf.score(features, target),
         recall_score(target, clf.predict(features), pos_label='Y')]
        for _, features, target in evaluation_splits
    ],
    columns=['Accuracy', 'Recall'],
    index=[split_name for split_name, _, _ in evaluation_splits],
)

print(out_df)

       Accuracy    Recall
Train  1.000000  1.000000
Test   0.767568  0.966942


# Using the prediction pipeline in a grid search

In [362]:
# Build a fresh preprocessing + classifier pipeline for the grid search. The final
# step is named 'clf' so that grid keys use the 'clf__<param>' prefix.
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric_pipeline', numeric_pipeline, select_numeric_features),
        ('oh_pipeline', oh_pipeline, select_oh_features),
        ('hc_pipeline', hc_pipeline, select_hc_features),
    ],
    n_jobs=multiprocessing.cpu_count(),
    remainder='drop',
)

# Alternative that was tried:
#model = GradientBoostingClassifier(learning_rate=0.025, n_estimators=1000, subsample=0.25, max_depth=5,\
#                                 min_samples_split=50, max_features='sqrt')
model = RandomForestClassifier()

clf = Pipeline(steps=[('preprocessor', column_transformer),
                      ('clf', model)])

# random_state only has an effect when the folds are shuffled, and current
# scikit-learn rejects random_state combined with shuffle=False, so shuffling
# is enabled explicitly.
kfold = KFold(n_splits=5, shuffle=True, random_state=22)

# Parameter grid for the RandomForest step of the pipeline.
parameters = {
    'clf__n_estimators': [50, 100, 200, 500, 1000],
    # 'auto' was an alias of 'sqrt' for forest classifiers and is no longer accepted
    # by recent scikit-learn; dropping it keeps the searched space the same.
    'clf__max_features': ['log2', 'sqrt'],
    'clf__criterion': ['entropy', 'gini'],
    'clf__max_depth': [2, 3, 5, 9],
    'clf__min_samples_split': [2, 3, 5],
    'clf__min_samples_leaf': [1, 5, 8],
}

grid_RF = GridSearchCV(clf, param_grid=parameters, cv=kfold)

# Fit the grid search on the training data to find the best parameter combination.
grid_RF = grid_RF.fit(X_train, y_train)

# Best pipeline found by the search (refit on the full training split by GridSearchCV).
best_clf = grid_RF.best_estimator_
print (best_clf)

# Use the best estimator to make predictions on the test set.
best_predictions = best_clf.predict(X_test)

# Baseline for comparison: cross-validated accuracy of the pipeline with the
# default RandomForest settings (this scores `clf`, not the tuned `best_clf`).
cv_result = cross_val_score(clf, X_train, y_train, cv=kfold, scoring="accuracy")

print(cv_result.mean())

print(("best random forest from grid search: %.3f"
       % best_clf.score(X_test, y_test)))

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [369]:
# Cross-validated accuracy of the pipeline on the training split, averaged over folds.
cv_result = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=kfold,
)

print(cv_result.mean())

0.8043228454172366


# Extra note

In [395]:
# define the data preparation for the columns
#
# NOTE(review): with the selections printed earlier in this notebook, all five numeric
# columns are claimed by both 'numeric_pipeline' and 'log_pipeline', and scale_features
# is empty, so each numeric column reaches the model twice — confirm this is intended.
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric_pipeline', numeric_pipeline, select_numeric_features),
        ('scale_pipeline', scale_pipeline, scale_features),
        ('log_pipeline', log_pipeline, skew_features),
        ('oh_pipeline', oh_pipeline, select_oh_features),
        ('hc_pipeline', hc_pipeline, select_hc_features),
    ],
    # 'drop' discards every column that no branch claims; remainder='passthrough'
    # would keep those columns untransformed instead.
    remainder='drop',
    n_jobs=multiprocessing.cpu_count(),
)

# Classifier appended after preprocessing to form the full prediction pipeline.
# Alternative that was tried:
#model = GradientBoostingClassifier(learning_rate=0.025, n_estimators=1000, subsample=0.25, max_depth=5,\
#                                 min_samples_split=50, max_features='sqrt')
model = RandomForestClassifier(n_estimators=100)

clf = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('classifier', model),
])

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
   

model score: 0.773


In [394]:
# Display the ColumnTransformer's repr to inspect how each branch is configured.
column_transformer

ColumnTransformer(n_jobs=8,
                  transformers=[('numeric_pipeline',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median'))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x00000247838CFD88>),
                                ('scale_pipeline',
                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler())]),
                                 []),
                                ('log_pipeline',
                                 Pipeline(steps=[('funct...
                                  'LoanAmount', 'Loan_Amount_Term',
                                  'Credit_History']),
                                ('oh_pipeline',
                                 Pipeline(steps=[('