In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)
  df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)


In [6]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [7]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status            int64
dtype: object

In [8]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
df.dropna(axis=0, inplace = True)

In [10]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [11]:
X = df.drop('Loan_Status', axis=1)

In [12]:
X.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban


In [13]:
X.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object

In [14]:
X['Dependents'] = X['Dependents'].astype(str)

In [15]:
X.drop('Loan_ID', axis=1, inplace = True)

In [16]:
y = df['Loan_Status']

In [17]:
y.head()

1    0
2    1
3    1
4    1
5    1
Name: Loan_Status, dtype: int64

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
encoder = LabelEncoder()

In [20]:
labels  = encoder.fit_transform(X.Education)
X['Education'] = labels

In [21]:
labels  = encoder.fit_transform(X.Dependents)
X['Dependents'] = labels

In [22]:
labels  = encoder.fit_transform(X.Married)
X['Married'] = labels

In [23]:
labels  = encoder.fit_transform(X.Gender)
X['Gender'] = labels

In [24]:
labels  = encoder.fit_transform(X.Self_Employed)
X['Self_Employed'] = labels

In [25]:
labels  = encoder.fit_transform(X.Property_Area)
X['Property_Area'] = labels

In [26]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2, random_state=42)

In [29]:
clf = RandomForestClassifier()

In [30]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [31]:
clf.fit(X_train,y_train)

In [32]:
#use the model to make predicition on the test data set
y_preds = clf.predict(X_test)

In [33]:
#evaluate the model on the train dataset
train_acc = clf.score (X_train, y_train)
print(f"The model's accuracy on the train dataset is : {train_acc*100}%")

The model's accuracy on the train dataset is : 100.0%


In [34]:
#evaluate the model on the test dataset
test_acc = clf.score (X_test, y_test)
print(f"The model's accuracy on the train dataset is : {test_acc*100:.2f}%")

The model's accuracy on the train dataset is : 81.25%


In [35]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [36]:
#create a classifictaion report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.81      0.46      0.59        28
           1       0.81      0.96      0.88        68

    accuracy                           0.81        96
   macro avg       0.81      0.71      0.73        96
weighted avg       0.81      0.81      0.79        96



In [37]:
#create confusin matrix
conf_mat = confusion_matrix(y_preds, y_test )
conf_mat

array([[13,  3],
       [15, 65]], dtype=int64)

In [38]:
# create accuracy score
accuracy_score (y_preds, y_test)

0.8125

In [39]:
#note that RandomForestClassifier has 100 estimators by default, try another number of estimators
#try different numbers of estimators (no cross validation)
np.random.seed(42)

for i in range(100,200,10):
  print(f"trying model with {i} estimators...")
  model = RandomForestClassifier (n_estimators = i). fit(X_train, y_train)
  print (f"model accuracy on test set: {model.score(X_test, y_test)*100:.2f}%")
  print ("")

trying model with 100 estimators...
model accuracy on test set: 82.29%

trying model with 110 estimators...
model accuracy on test set: 81.25%

trying model with 120 estimators...
model accuracy on test set: 79.17%

trying model with 130 estimators...
model accuracy on test set: 81.25%

trying model with 140 estimators...
model accuracy on test set: 82.29%

trying model with 150 estimators...
model accuracy on test set: 82.29%

trying model with 160 estimators...
model accuracy on test set: 82.29%

trying model with 170 estimators...
model accuracy on test set: 81.25%

trying model with 180 estimators...
model accuracy on test set: 81.25%

trying model with 190 estimators...
model accuracy on test set: 82.29%



In [40]:
from sklearn.model_selection import cross_val_score

In [41]:
# with cross validation
np.random.seed(42)

for i in range(100,200,10):
  print(f"trying model with {i} estimators...")
  model = RandomForestClassifier (n_estimators = i). fit(X_train, y_train)
  print (f"model accuracy on test set: {model.score(X_test, y_test)*100:.2f}%")

  #measure the model score on a single train split
modal_score = model.score(X_test, y_test)
print (f"model accuracy on single test set split: {model.score(X_test, y_test)*100:.2f}%")


trying model with 100 estimators...
model accuracy on test set: 82.29%
trying model with 110 estimators...
model accuracy on test set: 81.25%
trying model with 120 estimators...
model accuracy on test set: 79.17%
trying model with 130 estimators...
model accuracy on test set: 81.25%
trying model with 140 estimators...
model accuracy on test set: 82.29%
trying model with 150 estimators...
model accuracy on test set: 82.29%
trying model with 160 estimators...
model accuracy on test set: 82.29%
trying model with 170 estimators...
model accuracy on test set: 81.25%
trying model with 180 estimators...
model accuracy on test set: 81.25%
trying model with 190 estimators...
model accuracy on test set: 82.29%
model accuracy on single test set split: 82.29%


In [42]:
#measure the mean cross validation score across 5 different train test and split
cross_val_mean = np.mean(cross_val_score(model, X,y, cv=5))
print(f"5-fold cross-valaidation score:{cross_val_mean*100:.2f}%")
print("")


5-fold cross-valaidation score:80.42%



In [43]:
#another way to do it with GridsearchCv
np.random.seed(42)
from sklearn.model_selection import GridSearchCV

In [44]:
#define the parameters to search over a dicitionary
# (these can any of your traget model's hyperparameter)
param_grid = {'n_estimators': [ i for i in range (100,200,10)]}

In [45]:
# set up the grid search
grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, verbose=1)

In [46]:
#fit the grid search to the data
grid.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [47]:
#find the best parameter
print (f"The best Parameter values are: {grid.best_params_}")
print (f"with a score of : {grid.best_score_*100:.2f}%")

The best Parameter values are: {'n_estimators': 180}
with a score of : 80.62%


In [48]:
# we can extract  the best model with the "best_estimator_" attriubte
#set the model to be the best estimator
clf = grid.best_estimator_
clf

In [49]:
# And now we have got the best cross-validation model,
# we can fitr and score it on our original single train/test spilt of the data
# fit the best model
clf = clf.fit(X_train, y_train)

In [50]:
# find the best model score on our single test spilt
#(note: this may be lower than the cross-validation score, it's only on one splilt on the data)

print(f"Best model score on single split of the data: {clf.score(X_test, y_test)*100:.2f}%")

Best model score on single split of the data: 82.29%


In [51]:
# joblib is used for heavy dataset, while pickle is used for light data
import pickle

In [54]:
# save an existing model to file
pickle.dump(model, open("random_forest_model.pkl", "wb"))

In [55]:
#load a save pickle model and evaluate
loaded_pickle_model = pickle.load(open("random_forest_model.pkl", "rb"))
print(f"loaded pickle model prediction score: {loaded_pickle_model.score(X_test, y_test)*100:.2f}%")

loaded pickle model prediction score: 82.29%


In [58]:
X.Property_Area.value_counts()

Property_Area
1    191
2    150
0    139
Name: count, dtype: int64

In [56]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2


In [60]:
# load a new dataset
new_data = pd.DataFrame({
    'Gender':[1,0,1],
    'Married':[1,1,0],
    'Dependents':[3,2,1],
    'Education':[0,0,1],
    'Self_Employed':[0,1,0],
    'ApplicantIncome':[3000,4300,2000],
    'CoapplicantIncome':[1508.0,0.0,3000.0],
    'LoanAmount':[150,187,172],
    'Loan_Amount_Term':[360.0,360.0,360.0],
    'Credit_History':[1.0,1.0,0.0],
    'Property_Area':[0,1,2]
})

In [61]:
new_data.shape

(3, 11)

In [62]:
# make prediction on the new dataset
predictions = loaded_pickle_model.predict(new_data)
predictions

array([1, 1, 0], dtype=int64)