In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/data.csv") 
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [20]:
df = df.drop('Loan_ID', axis=1)

In [21]:
#include pre-processing steps in pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier

In [22]:
#full pipeline to consist of: 

    ## dropper
    # preprocessing
    # model

In [5]:
# #create class to drop columns
# class columnDropperTransformer():
#     def __init__(self,columns):
#         self.columns=columns

#     def transform(self,X,y=None):
#         return X.drop(self.columns,axis=1)

#     def fit(self, X, y=None):
#         return self 

In [6]:
#define dropper step for pipeline
# dropper = Pipeline([("dropper", columnDropperTransformer(['Loan_ID']))])

In [7]:
#define preprocessing step for pipeline

cat_transform = Pipeline([("impute_mode", SimpleImputer(strategy='most_frequent')), ("one-hot-encode", OneHotEncoder(sparse=False))])
term_transform = Pipeline([("impute_mean", SimpleImputer(strategy='mean')),("scaling", StandardScaler())])
dollar_transform = Pipeline([("impute_mean", SimpleImputer(strategy='mean')),("log_transform", PowerTransformer())])

preprocessing = ColumnTransformer([("cat_transform", cat_transform, ['Gender','Married','Dependents','Education','Self_Employed','Credit_History','Property_Area']),
                                  ("term_transform", term_transform, ['Loan_Amount_Term']),
                                   ("dollar_transform", dollar_transform, ['ApplicantIncome','CoapplicantIncome','LoanAmount']),
                                  ])

In [8]:
#define model step for pipeline
model = RandomForestClassifier(criterion='gini', max_depth=4, max_features='sqrt', n_estimators=500)

In [23]:
pipeline = Pipeline([("preprocessing", preprocessing),
                     ("model", model)])

In [24]:
#Replicate steps that will be done in app.py file: 
#create X, y

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [26]:
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

In [27]:
#import train-test-split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 33, stratify = y)

In [28]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cat_transform',
                                                  Pipeline(steps=[('impute_mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one-hot-encode',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['Gender', 'Married',
                                                   'Dependents', 'Education',
                                                   'Self_Employed',
                                                   'Credit_History',
                                                   'Property_Area']),
                                                 ('term_transform',
                                                  Pipeline(steps=[('impute_mean',
                 

In [29]:
r2 = pipeline.score(X_test, y_test)
print(f'Test set r^2: {r2}')

Test set r^2: 0.8162162162162162


In [30]:
pipeline.fit(X, y)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cat_transform',
                                                  Pipeline(steps=[('impute_mode',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one-hot-encode',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['Gender', 'Married',
                                                   'Dependents', 'Education',
                                                   'Self_Employed',
                                                   'Credit_History',
                                                   'Property_Area']),
                                                 ('term_transform',
                                                  Pipeline(steps=[('impute_mean',
                 

In [31]:
#save file

In [32]:
import pickle
model_columns = list(X.columns)
with open('../data/model_columns.pkl', 'wb') as file:
    pickle.dump(model_columns, file)
    
pickle.dump(pipeline, open('../data/pipeline.pkl', 'wb'))