# 🏦 Loan Analysis Training Pipeline

Reference: [Lending Club Loan Defaulters Prediction (Kaggle notebook)](https://www.kaggle.com/code/faressayah/lending-club-loan-defaulters-prediction)


This notebook:

 * selects features from feature groups to create a feature view
 * creates train/test data with the feature view
 * trains a model to predict loan status (fully paid vs. charged off) with the training set
 * evaluates the model on the test set
 * uploads the model along with evaluation data to Hopsworks

In [None]:
!pip install hopsworks

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats 
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, 
    roc_auc_score, roc_curve, auc
)
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

from sklearn.ensemble import RandomForestClassifier

# Notebook-wide pandas display settings: two-decimal floats and at most 50 rows/columns per rendered frame.
# The float option is spelled out in full ('display.float_format') instead of relying on pandas'
# pattern matching of abbreviated option names.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

## 🔖 Hopsworks Feature Store

In [None]:
import hopsworks

# Log in to Hopsworks. `proj` is reused later to obtain the model registry,
# and `fs` is the feature store handle used to look up feature groups and feature views.
proj = hopsworks.login()
fs = proj.get_feature_store()

## Create the Feature View

In [None]:
# Look up version 1 of the two feature groups that feed the feature view.
fg_loans = fs.get_feature_group(name="loans", version=1)
fg_applicants = fs.get_feature_group(name="applicants", version=1)

# Select every feature except the listed columns on each side, then join the two selections.
loan_selection = fg_loans.select_except(["id", "issue_d"])
applicant_selection = fg_applicants.select_except(
    ["earliest_cr_line", "earliest_cr_line_year", "id"]
)
query = loan_selection.join(applicant_selection)

In [None]:
fv_version = 1

# Reuse the feature view if this name/version already exists; otherwise create it from `query`.
try:
    fv = fs.get_feature_view(name="loans_approvals", version=fv_version)
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit are no longer swallowed.
    fv = None

# NOTE(review): some client versions may return None instead of raising when the
# feature view does not exist - confirm against the installed hopsworks version.
if fv is None:
    fv = fs.create_feature_view(
        name="loans_approvals",
        version=fv_version,
        description="Loan applicant data",
        labels=["loan_status"],
        query=query,
    )

In [None]:
# Materialize the train/test split (test_size=0.2) from the feature view and time the call.
start_time = time.time()
X_train, X_test, y_train, y_test = fv.train_test_split(test_size=0.2)
elapsed_seconds = time.time() - start_time

print("Get Random Split Training Data in %s seconds ---" % elapsed_seconds)

In [None]:
# Preview the test-split features returned by the feature view.
X_test

In [None]:
# Preview the training labels as returned by the feature view (still the raw loan_status values).
y_train

In [None]:
# Map features to transformation functions.
categorical_features = []
numeric_features = []
for col in X_train.columns:
    if X_train[col].dtype == object:
        categorical_features.append(col)
    else:
        numeric_features.append(col)

print(numeric_features)
print(categorical_features)

In [None]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2

# Numeric columns: fill missing values with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode (ignoring categories not seen during fit),
# then keep the top 50 percent of the encoded columns ranked by the chi2 score.
categorical_steps = [
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ("selector", SelectPercentile(chi2, percentile=50)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# The ColumnTransformer below is given column *names* ('numeric_features' and
# 'categorical_features'). Selecting columns by name only works on a Pandas DataFrame,
# so at inference time (including online inference) the feature vector has to be
# wrapped in a DataFrame that carries those names. If building a DataFrame per request
# is too slow, pass positional column ids here instead of names; a plain numpy array
# can then be used for online inference.
# https://stackoverflow.com/questions/71715754/valueerror-specifying-the-columns-using-strings-is-only-supported-for-pandas-da
column_transformers = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Integer encoding of the label column: 1 = loan fully paid, 0 = loan charged off.
LOAN_STATUS_CODES = {'Fully Paid': 1, 'Charged Off': 0}


def _encode_loan_status(labels):
    """Replace the string `loan_status` column of `labels` with its 0/1 code, in place.

    The operation is idempotent: if the column already holds only the encoded
    values, the frame is left untouched, so re-running this cell no longer turns
    every label into NaN (which is what a second `.map` call would do).

    Raises:
        ValueError: if the column contains a value that is neither a known
            status string nor an already-encoded value.
    """
    status = labels['loan_status']
    if status.isin(list(LOAN_STATUS_CODES.values())).all():
        return  # already encoded by a previous run of this cell

    encoded = status.map(LOAN_STATUS_CODES)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail here with a clear message instead of passing NaN labels on to the model.
        unexpected = sorted(status[unmapped].astype(str).unique())
        raise ValueError(f"Unexpected loan_status values: {unexpected}")
    labels['loan_status'] = encoded


_encode_loan_status(y_train)
_encode_loan_status(y_test)

In [None]:
# Re-display the training labels after the loan_status mapping in the previous cell.
y_train

# 🤖 Model Building

In [None]:
def print_score(true, pred, train=True):
    """Print accuracy, the classification report and the confusion matrix for one split.

    Args:
        true: ground-truth labels of the split.
        pred: labels predicted by the model for the same rows.
        train: truthy to title the output "Train Result", falsy for "Test Result".
            Any falsy value (including None) now selects the test title; previously
            only values equal to False did, and anything else printed nothing.

    Returns:
        None. The report is written to stdout.
    """
    split_name = "Train" if train else "Test"
    divider = "_______________________________________________"

    clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))
    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
    print(divider)
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print(divider)
    print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

In [None]:
from sklearn.linear_model import LogisticRegression

# Chain the preprocessing ColumnTransformer and the classifier into one estimator.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

# Fit on the encoded labels as a 1-D array. `Series.to_numpy()` replaces
# `Series.ravel()`, which is deprecated in recent pandas releases.
clf.fit(X_train, y_train['loan_status'].to_numpy())


In [None]:
# Predict on both splits with the fitted pipeline.
y_train_pred, y_test_pred = clf.predict(X_train), clf.predict(X_test)

# Print the same report for the training split first, then the test split.
for split_labels, split_preds, is_train in (
    (y_train, y_train_pred, True),
    (y_test, y_test_pred, False),
):
    print_score(split_labels, split_preds, train=is_train)

In [None]:
disp = ConfusionMatrixDisplay.from_estimator(
    clf, X_test, y_test, 
    cmap='Blues', values_format='d', 
    display_labels=['Default', 'Fully-Paid']
)

display = RocCurveDisplay.from_estimator(clf, X_test, y_test)
display.plot()

# ✔️ Register the Model with Model Registry

In [None]:
# Model registry handle of the logged-in project; used below to create the model entry.
mr = proj.get_model_registry()

In [None]:
import joblib
import os
# Local directory that collects everything saved alongside the model.
os.makedirs("lending_model/features", exist_ok=True)

# Draw the ROC curve on an explicit figure and save it through that handle.
# A bare `plt.savefig` here wrote whatever the "current" figure was; under Jupyter's
# inline backend the figures from earlier cells are already closed, so the file
# came out blank.
roc_fig, roc_ax = plt.subplots()
RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=roc_ax)
roc_fig.savefig('lending_model/roc_curve.png')
plt.close(roc_fig)  # already shown in the evaluation cell; do not render it again

# Persist the whole fitted pipeline (preprocessing + classifier).
joblib.dump(clf, 'lending_model/lending_model.pkl')

# This value is registered under the 'accuracy' metric key, so compute real accuracy.
# It was previously `roc_auc_score` applied to hard class predictions, which is
# neither accuracy nor a proper ROC AUC (that needs scores or probabilities).
accuracy = accuracy_score(y_test, clf.predict(X_test))

In [None]:
# Show the metric value computed in the previous cell.
accuracy

In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Describe the model's inputs and outputs from the test features and test labels.
input_schema = Schema(X_test)
output_schema = Schema(y_test)

# Create the registry entry. The input example is a single test row; a fixed
# `random_state` makes re-runs register the same example instead of a random one.
# The handle is named `lending_model` (was `fraud_model`, a leftover name).
lending_model = mr.sklearn.create_model(
    "lending_model",
    metrics={'accuracy': accuracy},
    input_example=X_test.sample(random_state=42).to_numpy(),
    model_schema=ModelSchema(input_schema=input_schema, output_schema=output_schema),
)

# 'lending_model' is the local directory populated earlier (pickled pipeline, ROC image).
# NOTE(review): presumably `save` uploads that directory's contents - confirm against the hsml docs.
lending_model.save('lending_model')