In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/data.csv") 
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
df = df.drop('Loan_ID', axis=1)

In [5]:
#include pre-processing steps in pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier

In [6]:
#full pipeline to consist of: 

    ## dropper
    # preprocessing
    # model

In [7]:
# #create class to drop columns
# class columnDropperTransformer():
#     def __init__(self,columns):
#         self.columns=columns

#     def transform(self,X,y=None):
#         return X.drop(self.columns,axis=1)

#     def fit(self, X, y=None):
#         return self 

In [8]:
#define dropper step for pipeline
# dropper = Pipeline([("dropper", columnDropperTransformer(['Loan_ID']))])

In [9]:
#define preprocessing step for pipeline

cat_transform = Pipeline([("impute_mode", SimpleImputer(strategy='most_frequent')), ("one-hot-encode", OneHotEncoder(sparse=False))])
term_transform = Pipeline([("impute_mean", SimpleImputer(strategy='mean')),("scaling", StandardScaler())])
dollar_transform = Pipeline([("impute_mean", SimpleImputer(strategy='mean')),("log_transform", PowerTransformer())])

preprocessing = ColumnTransformer([("cat_transform", cat_transform, ['Gender','Married','Dependents','Education','Self_Employed','Credit_History','Property_Area']),
                                  ("term_transform", term_transform, ['Loan_Amount_Term']),
                                   ("dollar_transform", dollar_transform, ['ApplicantIncome','CoapplicantIncome','LoanAmount']),
                                  ])

In [10]:
#define model step for pipeline
model = RandomForestClassifier(criterion='gini', max_depth=4, max_features='sqrt', n_estimators=500)

In [11]:
pipeline = Pipeline([("preprocessing", preprocessing),
                     ("model", model)])

In [12]:
#Replicate steps that will be done in app.py file: 
#create X, y

In [13]:
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

In [14]:
X2 = preprocessing.fit_transform(X,y)
X2

array([[ 0.        ,  1.        ,  1.        , ...,  0.5443313 ,
        -1.10283684,  0.23538679],
       [ 0.        ,  1.        ,  0.        , ...,  0.17097382,
         0.75057774, -0.0376586 ],
       [ 0.        ,  1.        ,  0.        , ..., -0.49995512,
        -1.10283684, -1.34630531],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  1.02348683,
         0.20860307,  1.37240931],
       [ 0.        ,  1.        ,  0.        , ...,  0.93175029,
        -1.10283684,  0.73892362],
       [ 1.        ,  0.        ,  1.        , ...,  0.17097382,
        -1.10283684,  0.03993645]])

In [15]:
X3 = model.fit(X2,y)
X3

RandomForestClassifier(max_depth=4, max_features='sqrt', n_estimators=500)

In [16]:
r2 = model.score(X2,y)

In [17]:
print(r2)

0.8127035830618893


In [18]:
pipeline.fit(X,y)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cat_transform',
                                                  Pipeline(steps=[('impute_mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one-hot-encode',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['Gender', 'Married',
                                                   'Dependents', 'Education',
                                                   'Self_Employed',
                                                   'Credit_History',
                                                   'Property_Area']),
                                                 ('term_transform',
                                                  Pipeline(steps=[('impute_mean',
                 

In [19]:
pipeline.score(X,y)

0.8127035830618893

In [20]:
#import train-test-split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 33, stratify = y)

In [21]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cat_transform',
                                                  Pipeline(steps=[('impute_mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one-hot-encode',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['Gender', 'Married',
                                                   'Dependents', 'Education',
                                                   'Self_Employed',
                                                   'Credit_History',
                                                   'Property_Area']),
                                                 ('term_transform',
                                                  Pipeline(steps=[('impute_mean',
                 

In [22]:
r2 = pipeline.score(X_test, y_test)
print(f'Test set r^2: {r2}')

Test set r^2: 0.8162162162162162


In [23]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban


In [24]:
pipeline.fit(X, y)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cat_transform',
                                                  Pipeline(steps=[('impute_mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one-hot-encode',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['Gender', 'Married',
                                                   'Dependents', 'Education',
                                                   'Self_Employed',
                                                   'Credit_History',
                                                   'Property_Area']),
                                                 ('term_transform',
                                                  Pipeline(steps=[('impute_mean',
                 

In [25]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [26]:
#save file

In [27]:
import pickle
model_columns = list(X.columns)
with open('../data/model_columns.pkl', 'wb') as file:
    pickle.dump(model_columns, file)
    
pickle.dump(pipeline, open('../data/pipeline.pkl', 'wb'))