# Scoring Function

In [67]:
import pandas as pd
import numpy as np
import joblib
import category_encoders as ce
import pickle

def _parse_dollar_amount(value) -> float:
    """Convert a currency value such as ``'$1,234.00 '`` to a float.

    Strings have the ``$`` sign and thousands separators removed before
    conversion. Non-string values (for example the ``0`` that replaces a
    missing entry) are passed straight to ``float``.
    """
    if isinstance(value, str):
        return float(value.replace('$', '').replace(',', ''))
    return float(value)


def project_1_scoring(input_data: pd.DataFrame, threshold: float = 0.4,
                      artifacts_dir: str = './artifacts') -> pd.DataFrame:
    """Score loan records with the saved Project 1 pipeline.

    Steps, in order: drop the ``index`` column, replace missing values with
    0, parse the dollar columns, binarise ``MIS_Status``, apply the saved
    one-hot and WOE encoders, add PCA and GLM derived features, and predict
    with the saved logistic regression model.

    Args:
        input_data: Raw loan records. Must contain ``MIS_Status`` and the
            dollar columns ``DisbursementGross``, ``BalanceGross``,
            ``GrAppv`` and ``SBA_Appv``. An ``index`` column is dropped when
            present. Preprocessing works on a new frame, not on the one
            passed in.
        threshold: Probability of class 1 at or above which a record is
            labelled 1. Defaults to 0.4.
        artifacts_dir: Directory that holds the pickled model, PCA and
            encoder files. Defaults to ``./artifacts``.

    Returns:
        A DataFrame with the columns ``record_index``, ``predicted_class``,
        ``probability_0`` and ``probability_1``.

    Raises:
        KeyError: If ``input_data`` lacks one of the required columns.
    """
    dollar_columns = ['DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv']

    # Fail early with one clear message instead of part-way through scoring.
    required_columns = dollar_columns + ['MIS_Status']
    missing = [col for col in required_columns if col not in input_data.columns]
    if missing:
        raise KeyError(f"input_data is missing required columns: {missing}")

    # Load the saved model and encoders.
    # NOTE: joblib.load unpickles arbitrary objects; only point artifacts_dir
    # at files from a trusted source.
    logreg = joblib.load(f'{artifacts_dir}/LogisticRegressionModel.pkl')
    pca_obj = joblib.load(f'{artifacts_dir}/pca_obj.pkl')
    onehot_encoder = joblib.load(f'{artifacts_dir}/one_hot_encoder.pkl')
    woe_encoder = joblib.load(f'{artifacts_dir}/woe_encoder.pkl')
    glm_model = joblib.load(f'{artifacts_dir}/glm.pkl')

    # Follow the same preprocessing steps as in Project 1.

    # Drop the 'index' column; tolerate inputs that do not carry it.
    input_data = input_data.drop(columns=['index'], errors='ignore')

    # Replace null/missing values with 0 (returns a new frame).
    input_data = input_data.fillna(0)

    # Convert the dollar columns from string format to float format. Missing
    # entries are already the number 0 at this point, so the parser has to
    # accept non-string values as well.
    for col in dollar_columns:
        input_data[col] = [_parse_dollar_amount(val) for val in input_data[col].values]

    # Convert MIS_Status to 0/1, with "CHGOFF" mapped to 1.
    input_data['MIS_Status'] = (input_data['MIS_Status'] == 'CHGOFF').astype(int)

    # Apply the saved one-hot encoder, then the saved WOE encoder.
    input_data = onehot_encoder.transform(input_data)
    input_data = woe_encoder.transform(input_data)
    # Every column gets the '_woe' suffix, including MIS_Status.
    input_data = input_data.add_suffix('_woe')

    # PCA transformation on everything except the target column.
    input_data_pca = pca_obj.transform(input_data.drop('MIS_Status_woe', axis=1))

    # GLM prediction from five WOE-encoded columns.
    input_data_glm = glm_model.predict(
        input_data[['Term_woe', 'SBA_Appv_woe', 'CreateJob_woe', 'NoEmp_woe', 'GrAppv_woe']]
    )

    # Add the first five PCA components as pca1..pca5.
    for i in range(5):
        input_data[f"pca{i+1}"] = input_data_pca[:, i]

    # Add the GLM prediction (GLM1) and its products with five WOE columns
    # (GLM2..GLM6). Column insertion order is kept as in the original code.
    # NOTE(review): the GLM input uses 'CreateJob_woe' while the products use
    # 'UrbanRural_woe' - confirm this matches what was done at training time.
    input_data["GLM1"] = input_data_glm
    interaction_features = ['Term_woe', 'SBA_Appv_woe', 'UrbanRural_woe', 'NoEmp_woe', 'GrAppv_woe']
    for position, feature in enumerate(interaction_features, start=2):
        input_data[f"GLM{position}"] = input_data_glm * input_data[feature]

    # Predict class-1 probabilities, then apply the decision threshold.
    Y_pred_prob = logreg.predict_proba(input_data.drop('MIS_Status_woe', axis=1))[:, 1]
    Y_pred_class = (Y_pred_prob >= threshold).astype(int)

    # Create the output DataFrame
    output_data = pd.DataFrame({
        "record_index": input_data.index,
        "predicted_class": Y_pred_class,
        "probability_0": 1 - Y_pred_prob,
        "probability_1": Y_pred_prob
    })

    return output_data

In [68]:
# Load the loan data; read_csv reads the CSV straight from the zip archive.
# The target column MIS_Status is kept because project_1_scoring reads it.
import pandas as pd
import pickle
data1 = pd.read_csv('SBA_loans_project_1.zip')

# Get predictions using the scoring function
results_data = project_1_scoring(data1)
print(results_data.head())  # Show the first five scored records

   record_index  predicted_class  probability_0  probability_1
0             0                0       0.999940       0.000060
1             1                0       0.998103       0.001897
2             2                0       0.989954       0.010046
3             3                0       0.998648       0.001352
4             4                0       0.979053       0.020947
