# <span style="color:#ff5f27;"> 🏦 Loan Analysis Training Pipeline</span>

Reference: [Lending Club Loan Defaulters Prediction (Kaggle)](https://www.kaggle.com/code/faressayah/lending-club-loan-defaulters-prediction)


This notebook:

 * selects features from feature groups to create a feature view
 * creates train/test data with the feature view
 * trains a model on the training set to predict loan status (fully paid vs. charged off)
 * evaluates the model on the test set
 * uploads the model along with evaluation data to Hopsworks

## <span style="color:#ff5f27;">📝 Imports </span>


In [None]:
!pip install joblib scikit-learn --quiet

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import os
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    classification_report, 
    roc_auc_score,
)
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, chi2

# Display settings: two decimals for floats, at most 50 columns/rows shown.
# The full option key is spelled out; abbreviated keys are matched as patterns
# and can stop resolving if pandas adds a similarly named option.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

## <span style="color:#ff5f27;"> 🔮 Connect to Hopsworks Feature Store</span>

In [None]:
import hopsworks

# Log in to Hopsworks; the returned project handle is used for the feature store
# here and for the model registry later in the notebook
project = hopsworks.login()

# Handle to the project's feature store
fs = project.get_feature_store()

## <span style="color:#ff5f27;">🪄 Create the Feature View</span>


In [None]:
# Look up version 1 of the two feature groups the model is trained on
fg_loans = fs.get_feature_group(name="loans", version=1)
fg_applicants = fs.get_feature_group(name="applicants", version=1)

In [None]:
# Build the training query: loan features joined with applicant features
loan_query = fg_loans.select_features()
selected_features = loan_query.join(fg_applicants.select_features())

# Uncomment this if you would like to view your selected features
# selected_features.show(5)

In [None]:
# Get or create version 1 of the feature view over the joined query;
# `loan_status` is declared as the label column
feature_view = fs.get_or_create_feature_view(
    name="loans_approvals",
    version=1,
    query=selected_features,
    labels=["loan_status"],
    description="Loan applicant data",
)

In [None]:
# Split the feature view into train and test sets, holding out 20% for testing
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)

In [None]:
# Preview the first three rows of the test features
X_test.head(3)

In [None]:
# Preview the first three rows of the training labels
y_train.head(3)

## <span style="color:#ff5f27;">👩🏻‍🔬 Feature Transformation</span>


In [None]:
# Partition the training columns by dtype: object-typed columns are treated as
# categorical, every other column as numeric
categorical_features, numeric_features = [], []
for column in X_train.columns:
    if X_train[column].dtype == object:
        categorical_features.append(column)
    else:
        numeric_features.append(column)

# Report which columns landed in each group
print("⛳️ Numeric Features:", numeric_features)
print("⛳️ Categorical Features:", categorical_features)

In [None]:
# Numeric columns: fill missing values with the column median, then standardise
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: one-hot encode (handle_unknown="ignore" for categories not
# seen during fit), then keep the top 50% of encoded columns by chi-squared score
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ("selector", SelectPercentile(chi2, percentile=50)),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [None]:
# Binary encoding of the target: 1 = 'Fully Paid', 0 = 'Charged Off'
LOAN_STATUS_ENCODING = {'Fully Paid': 1, 'Charged Off': 0}

# Encode the 'loan_status' column of y_train and y_test.
# The numeric-dtype guard keeps the cell safe to re-run: mapping the already
# encoded 0/1 values a second time would turn every label into NaN.
for labels in (y_train, y_test):
    if not pd.api.types.is_numeric_dtype(labels['loan_status']):
        labels['loan_status'] = labels['loan_status'].map(LOAN_STATUS_ENCODING)

In [None]:
# Inspect the first rows of y_train after the label mapping
y_train.head()

## <span style="color:#ff5f27;">🤖 Model Building</span>


In [None]:
def print_score(true, pred, train=True):
    """Print accuracy, the classification report and the confusion matrix.

    Args:
        true: Ground-truth labels.
        pred: Predicted labels, aligned with ``true``.
        train: When truthy the output is headed "Train Result", otherwise
            "Test Result". Only the heading depends on this flag.
    """
    split_name = "Train" if train else "Test"
    # output_dict=True returns the report as nested dicts, shown here as a DataFrame
    clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))

    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

In [None]:
# Create a pipeline with preprocessing and a logistic regression classifier
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),        # ColumnTransformer defined above
        ("classifier", LogisticRegression()),  # Logistic Regression with default settings
    ]
)

# Fit the pipeline on the training data. The target is passed as a 1-D NumPy
# array via to_numpy(); Series.ravel() is deprecated in recent pandas releases.
clf.fit(X_train, y_train['loan_status'].to_numpy())

In [None]:
# Predict labels for the training set and for the test set
y_train_pred, y_test_pred = clf.predict(X_train), clf.predict(X_test)

# Report the scores: training set first, then test set
print_score(y_train, y_train_pred, train=True)
print_score(y_test, y_test_pred, train=False)

In [None]:
# Calculate and print the ROC AUC score on the test set.
# ROC AUC ranks continuous scores, so it is computed from the predicted
# probability of the positive class (label 1, 'Fully Paid') rather than from
# hard 0/1 predictions, which would reduce the curve to a single threshold.
y_test_true = y_test['loan_status']
roc_auc = roc_auc_score(y_test_true, clf.predict_proba(X_test)[:, 1])
print("⛳️ ROC AUC Score on Test Set:", roc_auc)

# `accuracy` is logged under the 'accuracy' key when the model is registered,
# so it holds the actual test accuracy instead of the ROC AUC value.
accuracy = accuracy_score(y_test_true, clf.predict(X_test))
print("⛳️ Accuracy on Test Set:", accuracy)

## <span style="color:#ff5f27;">🗄️ Register the Model with Model Registry</span>


In [None]:
# Get the project's model registry handle; it is used below to create the model entry
mr = project.get_model_registry()

In [None]:
# Local folder that is later saved to the model registry, with an "images"
# subfolder for the evaluation plots
model_dir = "lending_model"
images_dir = os.path.join(model_dir, "images")

# makedirs creates model_dir as well; exist_ok makes re-running the cell harmless
os.makedirs(images_dir, exist_ok=True)

In [None]:
# Serialise the fitted pipeline to lending_model.pkl inside the model directory
model_path = os.path.join(model_dir, 'lending_model.pkl')
joblib.dump(clf, model_path)

In [None]:
# Create and save Confusion Matrix.
# The display is drawn on an explicit Axes: without `ax`, from_estimator opens
# its own figure, and an extra disp.plot() call opens yet another one with the
# default colormap and number format, which is the figure savefig would write.
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay.from_estimator(
    clf,
    X_test,
    y_test,
    cmap='Blues',
    values_format='d',
    display_labels=['Default', 'Fully-Paid'],  # class 0 = 'Charged Off', class 1 = 'Fully Paid'
    ax=ax,
)
fig.savefig(os.path.join(images_dir, 'confusion_matrix.png'))
# Close this figure explicitly so no open figure is left behind
plt.close(fig)

In [None]:
# Create and save ROC Curve.
# Drawn on an explicit Axes so exactly one figure (of the requested size) is
# created and saved. The result is not stored in a variable named `display`,
# which would shadow IPython's built-in display() for the rest of the notebook.
fig, ax = plt.subplots(figsize=(8, 6))
roc_display = RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax)
fig.savefig(os.path.join(images_dir, 'roc_curve.png'))
plt.close(fig)

## <span style="color:#ff5f27;">📝 Register model</span>

One of the features in Hopsworks is the model registry. This is where we can store different versions of models and compare their performance. Models from the registry can then be served as API endpoints.

In [None]:
# Metrics and a one-row input example (a random test row as a NumPy array)
# attached to the registry entry
model_metrics = {'accuracy': accuracy}
input_example = X_test.sample().to_numpy()

# Create the sklearn model entry in the Model Registry, linked to the feature view
fraud_model = mr.sklearn.create_model(
    "lending_model",
    metrics=model_metrics,
    input_example=input_example,
    feature_view=feature_view,
)

# Save the contents of the model directory (pickled pipeline and images) with the model
fraud_model.save(model_dir)

---
## <span style="color:#ff5f27;">⏭️ **Next:** Part 03: Batch Inference</span>

In the following notebook you will use your model for batch inference.
