### Batch Inference Pipeline for Aggregate Loan Default Statistics

This notebook performs the following actions:

* gets a feature view object with its name/version from Hopsworks
* downloads a Pandas DataFrame with new inference data from Hopsworks using the feature view and the call `fv.get_batch_data(start_time="...", end_time="...")`
* downloads the model from Hopsworks using its name/version
* makes predictions for all input rows of features in the DataFrame using the model
* logs all the prediction results and features to a `loan_monitoring` feature group in Hopsworks


The *output of this notebook is a dashboard* - a confusion matrix comparing the predicted loan status with the recorded outcome for the batch inference data. It is uploaded to Hopsworks (under `Resources/images`) and can be viewed from the file system in Hopsworks.

In [None]:
!pip install hopsworks

In [None]:
import hopsworks
import pandas as pd
import joblib
import os
from sklearn.linear_model import LogisticRegression
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, chi2

In [None]:
# Versions of the feature view and of the registered model used by this run.
fv_version = 1
model_version = 1

# Time window passed as start_time / end_time when fetching the batch data.
start_time_data = "2016-11-01"
end_time_data = "2016-12-01"

In [None]:
import os

# Hosted notebook environments may not have the local features package
def need_download_modules():
    """Tell whether the helper feature modules must be downloaded.

    Returns:
        bool: True when the string form of the IPython shell mentions
        'google.colab', or when the HOPSWORKS_PROJECT_ID environment variable
        is set; False otherwise.
    """
    try:
        shell = get_ipython()
    except NameError:
        # get_ipython only exists under IPython; when this code is run as a
        # plain Python script, skip the Colab check and fall through to the
        # environment-variable check instead of crashing.
        shell = None
    if 'google.colab' in str(shell):
        return True
    return 'HOPSWORKS_PROJECT_ID' in os.environ

if need_download_modules():
    print("Downloading modules")
    # Imported here because only the hosted-environment branch needs it.
    from urllib.request import urlopen

    base_url = 'https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/dev/loan_approval/features/'
    os.makedirs('features', exist_ok=True)
    for module_file in ('loans.py', 'applicants.py'):
        # Writing to an explicit path replaces any copy left by an earlier run
        # (a repeated plain `wget` would keep the old file and write
        # `<name>.1`), and urlopen raises on an HTTP/network error instead of
        # silently ignoring a failed download.
        with urlopen(base_url + module_file) as response:
            payload = response.read()
        with open(os.path.join('features', module_file), 'wb') as target:
            target.write(payload)
else:
    print("Local environment")

In [None]:
import hopsworks

# Log in to Hopsworks; `proj` is the project handle used by later cells
# (model registry, dataset API) and `fs` is its feature store handle.
proj = hopsworks.login()
fs = proj.get_feature_store()

In [None]:
# Look up the "loans_approvals" feature view at the version set in `fv_version`.
fv = fs.get_feature_view(name="loans_approvals", version=fv_version)

In [None]:
# Gather the name of every entry in the feature view's schema and print them.
arr = list(map(lambda feature: feature.name, fv.schema))
print(arr)

In [None]:
def retrieve_data(feature_view, start_time=None, end_time=None):
    """Fetch the batch of inference data for a time window.

    Args:
        feature_view: Object exposing ``get_batch_data(start_time=..., end_time=...)``.
        start_time: Lower bound handed to ``get_batch_data``. When omitted,
            the notebook-level ``start_time_data`` is used.
        end_time: Upper bound handed to ``get_batch_data``. When omitted,
            the notebook-level ``end_time_data`` is used.

    Returns:
        Whatever ``feature_view.get_batch_data`` returns for that window.
    """
    # Fall back to the notebook parameters only when no explicit bound is
    # given, so existing single-argument calls behave exactly as before.
    if start_time is None:
        start_time = start_time_data
    if end_time is None:
        end_time = end_time_data
    return feature_view.get_batch_data(start_time=start_time, end_time=end_time)

# Fetch the batch of inference data through the feature view and display it.
df = retrieve_data(fv)
df

In [None]:
def get_model(project):
    """Download the registered lending model and load it into memory.

    Looks up the model named "lending_model" at the notebook-level
    ``model_version`` in the project's model registry, downloads it, and
    loads ``lending_model.pkl`` from the download directory with joblib.

    Args:
        project: Object exposing ``get_model_registry()`` (in this notebook,
            the value returned by ``hopsworks.login()``).

    Returns:
        The object that joblib deserialises from ``lending_model.pkl``.
    """
    mr = project.get_model_registry()
    model = mr.get_model("lending_model", version=model_version)
    model_dir = model.download()
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code - only load models from a registry you trust.
    return joblib.load(os.path.join(model_dir, "lending_model.pkl"))

# Load the model from the project's model registry for the prediction cells below.
model = get_model(proj)

In [None]:
# Run the model on every row of the batch and display the raw predictions.
predictions = model.predict(df)
predictions

In [None]:
# Show each distinct predicted value together with how many rows received it.
np.unique(predictions, return_counts=True)

In [None]:
# Names of all entries in the feature view's schema.
cols = [td.name for td in fv.schema]

# Wrap the model output in a one-column frame named "prediction" and display it.
predictions_df = pd.DataFrame(data=predictions, columns=["prediction"])
predictions_df

In [None]:
import time

# Read version 1 of the "loans" feature group into `labels_df`; its
# `loan_status` column is used by the next cell as the recorded outcome.
fg = fs.get_feature_group(name="loans", version=1)
labels_df = fg.read()

In [None]:
# Build the monitoring frame on a copy so `df` stays untouched on re-runs.
monitor_df = df.copy()

# Translate the numeric predictions into loan-status strings. `.to_numpy()`
# makes the assignment positional, so row i of the batch receives prediction i
# whatever index `df` carries (an index-aligned assignment would leave NaN in
# rows whose index label is missing from predictions_df).
monitor_df['prediction'] = (
    predictions_df['prediction'].map({1: 'Fully Paid', 0: 'Charged Off'}).to_numpy()
)

# NOTE(review): this aligns on the DataFrame index, pairing batch row i with
# row i of the full `loans` feature group read - confirm both frames really
# share row order, or join on the loan key instead.
monitor_df['outcome'] = labels_df['loan_status']

# `pd.datetime` has been removed from pandas; pd.Timestamp.now() yields the
# same current-time value on every pandas version.
monitor_df['prediction_ts'] = pd.Timestamp.now()

# NOTE(review): ids are taken from the row index, so they repeat from one run
# to the next - verify this is intended given the feature group uses
# primary_key=['id'].
monitor_df['id'] = monitor_df.index

monitor_df

In [None]:
# Obtain the "loan_monitoring" feature group (version 1), creating it when it
# is not there yet (as the method name indicates). Rows are keyed by `id` and
# use `prediction_ts` as their event time.
loan_monitoring_fg = fs.get_or_create_feature_group(name="loan_monitoring",
                                    version=1,
                                    description="Lending Club Loan Predictions and Outcomes",
                                    primary_key=['id'],
                                    event_time='prediction_ts'
)

In [None]:
# Write the monitoring rows (features, prediction, outcome, timestamp, id)
# into the feature group.
loan_monitoring_fg.insert(monitor_df)

In [None]:
from sklearn.metrics import confusion_matrix

# Pin the class order so the matrix is always 2x2 with 'Charged Off' first,
# matching the Defaulted / Paid Off labels applied when it is plotted. Without
# this, a batch containing only one status (or an extra status value) changes
# the matrix shape and the labelled DataFrame can no longer be built.
class_order = ['Charged Off', 'Fully Paid']

# Separate names are used so the numeric `predictions` array returned by
# model.predict is not overwritten, keeping earlier cells safe to re-run.
predicted_status = monitor_df['prediction']
labels = monitor_df['outcome']

# Rows are the recorded outcomes, columns the predicted statuses.
results = confusion_matrix(labels, predicted_status, labels=class_order)
print(results)

In [None]:
from matplotlib import pyplot
import seaborn as sns

# Attach readable row (recorded outcome) and column (prediction) names to the
# confusion matrix.
df_cm = pd.DataFrame(
    results,
    index=['True Defaulted', 'True Paid Off'],
    columns=['Predicted Defaulted', 'Predicted Paid Off'],
)

# Render the matrix as an annotated heatmap and save the figure to disk.
cm = sns.heatmap(data=df_cm, annot=True)
fig = cm.get_figure()
fig.savefig("./confusion_matrix.png")
df_cm

In [None]:
# Upload the saved heatmap image to "Resources/images" in the project,
# replacing any file already stored there under the same name.
dataset_api = proj.get_dataset_api()    
dataset_api.upload("./confusion_matrix.png", "Resources/images", overwrite=True)