In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [3]:
df=pd.read_csv('https://raw.githubusercontent.com/moezali1/modelbit/main/loans.csv')

In [4]:
df.head()

Unnamed: 0,Applicant_ID,Age,Annual_Income,Credit_Score,Employment_Years,Loan_Amount_Requested,Default
0,10,32,82133,689,1,10789,No
1,38,30,53172,588,3,5442,Yes
2,6,31,90000,573,4,5000,Yes
3,15,29,74634,621,7,16074,Yes
4,35,36,78232,701,5,17742,No


In [5]:
df.describe

<bound method NDFrame.describe of       Applicant_ID  Age  Annual_Income  Credit_Score  Employment_Years  \
0               10   32          82133           689                 1   
1               38   30          53172           588                 3   
2                6   31          90000           573                 4   
3               15   29          74634           621                 7   
4               35   36          78232           701                 5   
...            ...  ...            ...           ...               ...   
4995            28   27          75641           634                 4   
4996            36   27          63483           571                 1   
4997            34   48          88626           722                10   
4998            28   36          90000           644                13   
4999            35   24          43888           608                 3   

      Loan_Amount_Requested Default  
0                     10789      No  
1

In [6]:
df.drop(labels=['Applicant_ID'],axis=1,inplace=True)


In [7]:
X=df[['Annual_Income', 'Credit_Score', 'Employment_Years', 'Loan_Amount_Requested','Age']]

In [8]:
y = df['Default']

In [9]:
# Columns to be scaled
numeric_features = ['Annual_Income', 'Credit_Score', 'Employment_Years', 'Loan_Amount_Requested']

# Column to be binned and one-hot encoded
categorical_features = ['Age']

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split

In [11]:
# Create transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])
categorical_transformer = Pipeline(steps=[
    ('bin', KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine all transformers into a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [12]:
# Create and evaluate the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression())])
pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['Annual_Income',
                                                   'Credit_Score',
                                                   'Employment_Years',
                                                   'Loan_Amount_Requested']),
                                                 ('cat',
                                                  Pipeline(steps=[('bin',
                                                                   KBinsDiscretizer(encode='ordinal',
                                                  

In [13]:
# Check the shape of X and y
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Ensure X and y have the same number of samples
if X.shape[0] != y.shape[0]:
    # If they don't match, adjust the data accordingly
    # For example, subset X and y to have the same number of samples
    X = X[:y.shape[0], :]
    print("Adjusted shape of X:", X.shape)

Shape of X: (5000, 5)
Shape of y: (5000,)


In [14]:
from sklearn.model_selection import train_test_split

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
# print train and test set shape
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)


X_train: (3500, 5)
X_test: (1500, 5)
y_train: (3500,)
y_test: (1500,)


In [16]:
# generate predictions
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

In [17]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.9606666666666667


In [74]:
# install modelbit
!pip install modelbit

# run on top of your notebook
import modelbit
mb = modelbit.login()

In [18]:
import pandas as pd
import numpy as np

# first define function
def predict_loan_default(Age: int, Annual_Income: float, Credit_Score: int, Employment_Years: float, Loan_Amount_Requested: float)-> float:
   
  """
  Predict the probability of loan default using a pre-trained machine learning pipeline.

  Args:
      Age (int): Applicant's age.
      Annual_Income (float): Applicant's annual income.
      Credit_Score (int): Applicant's credit score.
      Employment_Years (float): Number of years employed.
      Loan_Amount_Requested (float): Requested loan amount.

  Returns:
      float: Probability of loan default.
  """
  
  return pipeline.predict_proba(pd.DataFrame([[Age, Annual_Income, Credit_Score, Employment_Years, Loan_Amount_Requested]],
                                             columns = ['Age', 'Annual_Income', 'Credit_Score', 'Employment_Years', 'Loan_Amount_Requested']))[:,1]     

In [19]:
predict_loan_default(32, 821233, 689, 1, 10789)

array([4.28870351e-27])

In [20]:
predict_loan_default(40, 1021233, 700, 10, 120789)

array([2.12347598e-44])

In [None]:
from sklearn.metrics import confusion_matrix