In [15]:
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import mlflow
import datetime
import pickle
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from arize.pandas.logger import Client, Schema
import datetime as dt
from arize.utils.types import ModelTypes, Environments
warnings.filterwarnings("ignore")

In [16]:
mlflow.set_tracking_uri("file:///C:/Users/msi/Desktop/mlops/mlruns")

mlflow.set_experiment("loan_approval_prediction")

<Experiment: artifact_location='file:///c:/Users/msi/Desktop/mlops/mlruns/195371603636612681', creation_time=1732740008820, experiment_id='195371603636612681', last_update_time=1732740008820, lifecycle_stage='active', name='loan_approval_prediction', tags={}>

In [17]:
#read the data
raw_train = pd.read_csv('../data/loan_data_2.csv')

In [18]:
raw_train=raw_train.drop(columns=['Unnamed: 0'])


In [20]:
#Reading Pandas Dataframe from mlflow
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
df_mlflow = mlflow.search_runs(experiment_ids=all_experiments,filter_string="metrics.F1_score_test <1")
run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax()]['run_id']

#let's call the model from the model registry ( in production stage)
import mlflow.pyfunc

logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)

mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.xgboost
  run_id: 508f98877c244ee58b2ee59373384b32



In [21]:
list(raw_train.columns)

['State',
 'BankState',
 'ApprovalFY',
 'Term',
 'NoEmp',
 'UrbanRural',
 'RevLineCr',
 'LowDoc',
 'DisbursementGross',
 'GrAppv',
 'Industry',
 'IsFranchise',
 'NewBusiness',
 'Default',
 'DisbursementFY',
 'DaysToDisbursement',
 'SBA_AppvPct',
 'AppvDisbursed',
 'RealEstate',
 'GreatRecession']

In [22]:
import json
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Dictionary to store mappings for all categorical columns
mappings = {}

# Loop through object (categorical) columns
for column in raw_train.select_dtypes(include='object').columns:
    # Encode the column
    raw_train[column] = encoder.fit_transform(raw_train[column])
    

In [38]:
#cleaning and preprocessing
X,y = raw_train.drop(columns='Default'),raw_train['Default']

## Transform the training data before sending it to Arize AI :

In [23]:
selected_cols = list(raw_train.columns)

In [24]:
baseline = raw_train[selected_cols]

In [30]:
baseline

Unnamed: 0,State,BankState,ApprovalFY,Term,NoEmp,UrbanRural,RevLineCr,LowDoc,DisbursementGross,GrAppv,Industry,IsFranchise,NewBusiness,Default,DisbursementFY,DaysToDisbursement,SBA_AppvPct,AppvDisbursed,RealEstate,GreatRecession
0,43,19,2010,60,6,1,0,0,25000.0,25000.0,0,0,1,0,2009,0,0.90,1,0,1
1,35,36,2010,78,4,1,0,0,9000.0,9000.0,16,0,0,0,2009,31,1.00,1,0,1
2,15,16,2010,78,14,1,0,0,12000.0,12000.0,0,1,0,0,2009,26,1.00,1,0,1
3,30,31,2010,76,12,2,0,0,35000.0,35000.0,17,0,0,0,2009,0,1.00,1,0,1
4,23,24,2010,78,4,2,0,0,35000.0,35000.0,9,0,0,0,2009,31,1.00,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35411,11,12,2014,60,0,1,0,0,50000.0,50000.0,17,0,1,0,2014,0,0.50,1,0,0
35412,37,36,2014,60,1,1,0,0,165000.0,165000.0,7,0,0,0,2014,0,0.50,1,0,0
35413,4,5,2014,84,15,1,0,0,150000.0,150000.0,9,0,0,0,2014,0,0.50,1,0,0
35414,26,27,2014,6,30,2,1,0,210000.0,210000.0,4,0,0,0,2014,0,0.75,1,0,0


In [31]:
baseline.rename(columns = {'Default':'actual_label'}, inplace = True)

In [32]:
transform_bin_str = { 0 : 'Paid', 1 : 'charged off'}
baseline['actual_label'] = baseline['actual_label'].map(transform_bin_str)

In [40]:
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()


# Function to scale data
def scaling(df):
    # Only scale numerical columns
    num_cols = df.select_dtypes(include=['number']).columns
    df[num_cols] = scale.fit_transform(df[num_cols])
    return df
X=scaling(X)

In [41]:
preds = loaded_model.predict(X)

In [42]:
baseline['prediction_label'] = preds

In [43]:
baseline['prediction_label'] = baseline['prediction_label'].map(transform_bin_str)

In [44]:
import uuid
# Prediction ID is required for all datasets
def generate_prediction_ids(X):
    return pd.Series((str(uuid.uuid4()) for _ in range(len(X))), index=X.index)

In [45]:
baseline["prediction_id"]=generate_prediction_ids(baseline)

In [46]:
baseline.head(3)

Unnamed: 0,State,BankState,ApprovalFY,Term,NoEmp,UrbanRural,RevLineCr,LowDoc,DisbursementGross,GrAppv,...,NewBusiness,actual_label,DisbursementFY,DaysToDisbursement,SBA_AppvPct,AppvDisbursed,RealEstate,GreatRecession,prediction_label,prediction_id
0,43,19,2010,60,6,1,0,0,25000.0,25000.0,...,1,Paid,2009,0,0.9,1,0,1,Paid,38fdd328-8199-46d8-b0d9-b74d4ee783e6
1,35,36,2010,78,4,1,0,0,9000.0,9000.0,...,0,Paid,2009,31,1.0,1,0,1,Paid,b670117b-dc8b-4cbf-82e0-3c8a4811b528
2,15,16,2010,78,14,1,0,0,12000.0,12000.0,...,0,Paid,2009,26,1.0,1,0,1,Paid,ebf79884-5389-4d7a-845a-8668bf2dfa32


## Setup Arize AI :

In [61]:
SPACE_KEY = "put_your_space_key"
API_KEY = "put_your_api_key"

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)

model_id = (
    "loan_prediction"  # This is the model name that will show up in Arize
)
model_version = "v2"  # Version of model - can be any string

if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")
else:
    print("✅ Arize setup complete!")

✅ Arize setup complete!


## Send train data to Arize AI :
the training data will be the reference data later in production

In [59]:
features = feature_column_names=list(baseline.columns.drop(
        ["prediction_id", "prediction_label", "actual_label"]))

In [53]:
features

['State',
 'BankState',
 'ApprovalFY',
 'Term',
 'NoEmp',
 'UrbanRural',
 'RevLineCr',
 'LowDoc',
 'DisbursementGross',
 'GrAppv',
 'Industry',
 'IsFranchise',
 'NewBusiness',
 'DisbursementFY',
 'DaysToDisbursement',
 'SBA_AppvPct',
 'AppvDisbursed',
 'RealEstate',
 'GreatRecession']

In [63]:
# Define a Schema() object for Arize to pick up data from the correct columns for logging
training_schema = Schema(
    prediction_id_column_name="prediction_id",
    prediction_label_column_name="prediction_label",
    actual_label_column_name="actual_label",
    feature_column_names=features)

# Logging Training DataFrame
training_response = arize_client.log(
    dataframe=baseline,
    model_id=model_id,
    model_version=model_version,
    model_type=ModelTypes.SCORE_CATEGORICAL,
    environment=Environments.TRAINING,
    schema=training_schema,
)

# If successful, the server will return a status_code of 200
if training_response.status_code != 200:
    print(
        f"logging failed with response code {training_response.status_code}, {training_response.text}"
    )
else:
    print(f"✅ You have successfully logged training set to Arize")

[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjoxMjIzODoySCt3/spaces/U3BhY2U6MTI4NDg6Y3F4Nw==/models/modelName/loan_prediction?selectedTab=performance[0m
✅ You have successfully logged training set to Arize
