In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

In [3]:
# Load the cleaned training data; the first CSV column is used as the index.
df = pd.read_csv(
    "./dataset/train_clean_data.csv",
    index_col=0,
)
# Preview the first five rows.
df.head(5)

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,gender_Male,married_Yes,education_Not Graduate,property_area_Semiurban,property_area_Urban,self_employed_Yes,Loan_status_Y
1,LP001003,1,4583,1508.0,128.0,360.0,1.0,1,1,0,0,0,0,0
2,LP001005,0,3000,0.0,66.0,360.0,1.0,1,1,0,0,1,1,1
3,LP001006,0,2583,2358.0,120.0,360.0,1.0,1,1,1,0,1,0,1
4,LP001008,0,6000,0.0,141.0,360.0,1.0,1,0,0,0,1,0,1
5,LP001011,2,5417,4196.0,267.0,360.0,1.0,1,1,0,0,1,1,1


In [5]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 517 entries, 1 to 613
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  517 non-null    object 
 1   Dependents               517 non-null    object 
 2   ApplicantIncome          517 non-null    int64  
 3   CoapplicantIncome        517 non-null    float64
 4   LoanAmount               517 non-null    float64
 5   Loan_Amount_Term         517 non-null    float64
 6   Credit_History           517 non-null    float64
 7   gender_Male              517 non-null    int64  
 8   married_Yes              517 non-null    int64  
 9   education_Not Graduate   517 non-null    int64  
 10  property_area_Semiurban  517 non-null    int64  
 11  property_area_Urban      517 non-null    int64  
 12  self_employed_Yes        517 non-null    int64  
 13  Loan_status_Y            517 non-null    int64  
dtypes: float64(4), int64(8), object

In [7]:
# (rows, columns) of the loaded frame.
df.shape

(517, 14)

In [9]:
# Class balance of the target column.
df['Loan_status_Y'].value_counts()

Loan_status_Y
1    360
0    157
Name: count, dtype: int64

In [11]:
''' Train Test Split'''
# Features: everything except the identifier, the object-typed Dependents
# column and the label itself.
X = df.drop(columns=['Loan_ID', 'Dependents', 'Loan_status_Y'])
# Target: the loan-status indicator column.
y = df['Loan_status_Y']

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. Stratifying on y and fixing the seed
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [15]:
!pip install --upgrade scikit-learn
!pip install --upgrade imbalanced-learn



**Synthetic Minority Over-sampling Technique**
- SMOTE (Synthetic Minority Over-sampling Technique) from the `imblearn` library is used here to balance the imbalanced training set.
- SMOTE only has an effect when the target **y_train** contains **at least one minority class with fewer samples** than the majority class — otherwise there is nothing to balance.
- If you have **categorical features in X_train,** consider using **SMOTENC** instead of SMOTE, as regular SMOTE is designed for numerical data only.
- **SMOTENC** stands for "SMOTE for Nominal and Continuous" features. It allows you to handle mixed-type data (numerical + categorical) during oversampling.
- SMOTE creates new, synthetic samples by **interpolating** (finding a point between two known points) between existing minority-class samples.

In [15]:
from imblearn.over_sampling import SMOTE
'''set the random state, it ensures that the synthetic data generated during the oversampling process is the same every time you run the code'''
# Only the training split is oversampled; X_test / y_test are not passed to SMOTE.
# NOTE(review): most columns of X_train are 0/1 indicator columns (gender_Male,
# married_Yes, property_area_*, ...). Plain SMOTE interpolates them like
# continuous values, so synthetic rows can carry fractional indicators —
# consider SMOTENC and confirm the effect on the metrics.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [90]:
# Shape of the feature matrix after oversampling.
X_train_res.shape

(482, 11)

In [92]:
# Shape of the target after oversampling; the row count should match X_train_res.
y_train_res.shape

(482,)

In [16]:
# Base learners for the manual (hand-averaged) soft-voting ensemble.
# random_state is fixed on the decision tree (tie-breaking between equally good
# splits is random) and on the liblinear logistic regression (it shuffles the
# data), so a Restart & Run All reproduces the same models and metrics.
# The seed matches the one used for the train/test split and SMOTE.
model1 = DecisionTreeClassifier(class_weight='balanced', random_state=42)
model2 = GaussianNB()
model3 = LogisticRegression(
    solver='liblinear',
    max_iter=200,
    class_weight='balanced',
    random_state=42,
)

In [19]:
# Train all three base learners on the SMOTE-resampled training data.
model1.fit(X_train_res,y_train_res)
model2.fit(X_train_res,y_train_res)
model3.fit(X_train_res,y_train_res)

In [21]:
# predict_proba() returns, for every test row, one probability per class.
'''averaging probabilities is called as soft voting or probabilistic ensembling'''
pred1, pred2, pred3 = (
    estimator.predict_proba(X_test)
    for estimator in (model1, model2, model3)
)

In [25]:
# Soft vote: unweighted mean of the three per-class probability arrays.
final_pred = sum((pred1, pred2, pred3)) / 3

In [27]:
# NOTE(review): this displays every row of the averaged probabilities;
# a slice such as final_pred[:5] would keep the output short.
final_pred

array([[0.39223731, 0.60776269],
       [0.40659715, 0.59340285],
       [0.02848186, 0.97151814],
       [0.04511311, 0.95488689],
       [0.35097035, 0.64902965],
       [0.23817243, 0.76182757],
       [0.29479918, 0.70520082],
       [0.10438856, 0.89561144],
       [0.47694483, 0.52305517],
       [0.09476933, 0.90523067],
       [0.13907493, 0.86092507],
       [0.14761552, 0.85238448],
       [0.06334219, 0.93665781],
       [0.11690038, 0.88309962],
       [0.10704854, 0.89295146],
       [0.52204028, 0.47795972],
       [0.05678688, 0.94321312],
       [0.04764203, 0.95235797],
       [0.97920344, 0.02079656],
       [0.07782672, 0.92217328],
       [0.77405891, 0.22594109],
       [0.59550243, 0.40449757],
       [0.03753762, 0.96246238],
       [0.0702189 , 0.9297811 ],
       [0.62788621, 0.37211379],
       [0.40903588, 0.59096412],
       [0.02358319, 0.97641681],
       [0.16701355, 0.83298645],
       [0.09439583, 0.90560417],
       [0.61470569, 0.38529431],
       [0.

In [29]:
# Class labels known to the fitted tree; predict_proba columns follow this order.
model1.classes_

array([0, 1], dtype=int64)

**Voting Classifier**

In [32]:
# Unfitted base learners handed to the VotingClassifier ensembles below.
# random_state is fixed on the decision tree (random tie-breaking between
# equally good splits) and on the liblinear logistic regression (data
# shuffling), so the voting results are reproducible across runs.
clf1 = DecisionTreeClassifier(class_weight='balanced', random_state=42)
clf2 = GaussianNB()
clf3 = LogisticRegression(
    solver='liblinear',
    max_iter=200,
    class_weight='balanced',
    random_state=42,
)

In [34]:
# Hard voting: the ensemble predicts the label chosen by most base learners.
eclf1 = VotingClassifier(
    estimators=[('DT', clf1), ('GNB', clf2), ('LR', clf3)],
    voting='hard',
).fit(X_train_res, y_train_res)

In [36]:
# Hard-voting labels for the held-out test split.
predictions=eclf1.predict(X_test)

In [38]:
# NOTE(review): displays the full label array; a slice would keep the output short.
predictions

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0], dtype=int64)

In [40]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision / recall / F1, followed by the raw confusion matrix,
# for the hard-voting predictions.
print(
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.70      0.62      0.65        52
           1       0.84      0.88      0.86       119

    accuracy                           0.80       171
   macro avg       0.77      0.75      0.76       171
weighted avg       0.80      0.80      0.80       171

[[ 32  20]
 [ 14 105]]


In [42]:
# Soft voting: the ensemble averages the base learners' class probabilities.
eclf2 = VotingClassifier(
    estimators=[('DT', clf1), ('GNB', clf2), ('LR', clf3)],
    voting='soft',
).fit(X_train_res, y_train_res)

In [44]:
# Soft-voting labels for the held-out test split.
prediction=eclf2.predict(X_test)

In [46]:
from sklearn.metrics import confusion_matrix, classification_report

# Same evaluation as for hard voting, now for the soft-voting predictions.
print(
    classification_report(y_test, prediction),
    confusion_matrix(y_test, prediction),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.67      0.60      0.63        52
           1       0.83      0.87      0.85       119

    accuracy                           0.79       171
   macro avg       0.75      0.74      0.74       171
weighted avg       0.78      0.79      0.79       171

[[ 31  21]
 [ 15 104]]


In [88]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 200 decision trees, trained on the resampled training data.
base_model1 = DecisionTreeClassifier()
bagging = BaggingClassifier(
    estimator=base_model1,
    n_estimators=200,
    random_state=0,
).fit(X_train_res, y_train_res)

# Score on the untouched test split.
y_pred_bagging = bagging.predict(X_test)
print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))

Bagging Accuracy: 0.7660818713450293


In [78]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, trained on the resampled data.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
).fit(X_train_res, y_train_res)

# Score on the untouched test split.
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 0.7953216374269005


In [94]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with up to 1200 boosting rounds and a fixed seed.
boosting = AdaBoostClassifier(
    n_estimators=1200,
    random_state=0,
).fit(X_train_res, y_train_res)

# Score on the untouched test split.
y_pred_boost = boosting.predict(X_test)
print("Boosting (AdaBoost) Accuracy:", accuracy_score(y_test, y_pred_boost))

Boosting (AdaBoost) Accuracy: 0.7953216374269005


In [96]:
# Custom ensemble: each base learner is trained on its own bootstrap sample.
from sklearn.utils import resample
'''resample function: This is used to create a random sample from the dataset. 
The function can sample with replacement . 
 setting replace=True, means the sampling will be with replacement.'''
'''If you're using multiple resamples to create different bootstrap samples 
 different values of random_state ensure that each resample is independent'''
# NOTE(review): model1..model3 are refit in place here, so re-running the
# earlier predict_proba cell after this one would use these bootstrap-trained
# models instead of the ones fitted on the full resampled training set.

# Bootstrap sample 1 -> first base learner.
X1, y1 = resample(X_train_res, y_train_res, replace=True, random_state=1)
model1.fit(X1, y1)

# Bootstrap sample 2 -> second base learner.
X2, y2 = resample(X_train_res, y_train_res, replace=True, random_state=2)
model2.fit(X2, y2)

# Bootstrap sample 3 -> third base learner.
X3, y3 = resample(X_train_res, y_train_res, replace=True, random_state=3)
model3.fit(X3, y3)

In [98]:
# Hard labels from each bootstrap-trained learner on the test split.
pred1, pred2, pred3 = (
    estimator.predict(X_test)
    for estimator in (model1, model2, model3)
)

In [106]:
# Labels predicted by the first bootstrap-trained learner (full array is printed).
print(pred1)

[1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 0 1 0 0 1 1 1 0 0 0 1 1 1
 1 1 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 0 0 1 1 0 0 0 0 0 1
 1 1 0 1 1 1 1 0 1 1 0 0 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 0 0 1 1 1
 0 1 1 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0]


In [100]:
import numpy as np
'''The np.stack() method  is used to join a sequence of arrays along a new axis'''
# Place the three label vectors side by side: one row per test sample,
# one column per model.
all_preds = np.stack((pred1, pred2, pred3), axis=1)

In [110]:
# (number of test samples, number of models).
all_preds.shape

(171, 3)

In [102]:
from scipy.stats import mode

# Hard (majority) vote: for every test row take the most frequent label across
# the three model columns of all_preds.
# mode() returns a result object holding the modal values (.mode) and how often
# they occur (.count); only the values are needed here.
# Reading .mode instead of unpacking into `_` avoids rebinding `_`, which
# IPython uses for the previous cell's output.
final_pred = mode(all_preds, axis=1, keepdims=False).mode

In [104]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the majority-vote labels against the held-out test targets.
print(
    "Accuracy:",
    accuracy_score(y_test, final_pred),
)
print(
    "Report:\n",
    classification_report(y_test, final_pred),
)

Accuracy: 0.783625730994152
Report:
               precision    recall  f1-score   support

           0       0.68      0.54      0.60        52
           1       0.82      0.89      0.85       119

    accuracy                           0.78       171
   macro avg       0.75      0.71      0.73       171
weighted avg       0.78      0.78      0.78       171

