In [0]:
%run ./config_and_imports

Installing prophet...
Collecting prophet
  Using cached prophet-1.1.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting cmdstanpy>=1.0.4 (from prophet)
  Using cached cmdstanpy-1.2.5-py3-none-any.whl.metadata (4.0 kB)
Collecting holidays<1,>=0.25 (from prophet)
  Using cached holidays-0.70-py3-none-any.whl.metadata (34 kB)
Collecting tqdm>=4.36.1 (from prophet)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting importlib-resources (from prophet)
  Using cached importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting stanio<2.0.0,>=0.4.0 (from cmdstanpy>=1.0.4->prophet)
  Using cached stanio-0.5.1-py3-none-any.whl.metadata (1.6 kB)
Using cached prophet-1.1.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.4 MB)
Using cached cmdstanpy-1.2.5-py3-none-any.whl (94 kB)
Using cached holidays-0.70-py3-none-any.whl (903 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Using cached importlib_resources-6.5.2-p

DataFrame[]

### Load Scenarios Data

In [0]:
# Read the `gold_scenarios` table into a Spark DataFrame.
df = spark.table("gold_scenarios")

In [0]:
import math

# Create text widgets that accept the percentage changes and the scenario label.
dbutils.widgets.text("hpi_change", "0", "Enter % change for HPI (e.g., 10 or -10):")
dbutils.widgets.text("unemp_change", "0", "Enter % change for Unemployment Rate (e.g., 15 or -15):")
dbutils.widgets.text("cpi_change", "0", "Enter % change for CPI Inflation Rate (e.g., 15 or -15):")
dbutils.widgets.text("scenario", "Custom Scenario", "Enter name of the scenario:")


def _read_percentage_widget(widget_name):
    """Return the current text of widget `widget_name` as a finite float.

    Raises:
        ValueError: if the text cannot be parsed by `float()`, or parses to
            NaN / +-infinity. `float()` accepts "nan" and "inf", and such a
            value would flow into every multiplication that uses it.
    """
    raw_value = dbutils.widgets.get(widget_name)
    try:
        value = float(raw_value)
    except ValueError as exc:
        # Keep the original wording, but say which widget was wrong.
        raise ValueError(
            "Please enter valid numeric values for the percentage changes. "
            f"Widget '{widget_name}' received {raw_value!r}."
        ) from exc
    if not math.isfinite(value):
        raise ValueError(
            "Please enter valid numeric values for the percentage changes. "
            f"Widget '{widget_name}' received {raw_value!r}, which is not a finite number."
        )
    return value


# Validate and convert the three percentage inputs to float.
hpi_percentage_change = _read_percentage_widget("hpi_change")
unemp_percentage_change = _read_percentage_widget("unemp_change")
cpi_percentage_change = _read_percentage_widget("cpi_change")
# The scenario name is free text, so it needs no numeric validation.
scenario = dbutils.widgets.get("scenario")

In [0]:
# Echo the parsed widget values so the cell output shows what this run used.
print(
    "HPI percentage change: ", hpi_percentage_change,
    "unemp_percentage_change: ", unemp_percentage_change,
    "cpi_percentage_change: ", cpi_percentage_change,
    "scenario: ", scenario,
)

HPI percentage change:  15.0 unemp_percentage_change:  -10.0 cpi_percentage_change:  0.0 scenario:  Custom Scenario


In [0]:
def update_and_append_scenario_data(scoring_table, hpi_percentage_increase, unemp_percentage_increase, cpi_percentage_change, scenario):
    """
    Derive a shocked scenario from the 'Supervisory Baseline' rows and append it.

    The 'Supervisory Baseline' rows of `scoring_table` are copied, their
    HousePriceIndex_pred, unemploymentrate_pred and CPIInflationRate_pred
    columns are each multiplied by (1 + pct / 100), their `scenario` column is
    overwritten with `scenario`, and the copies are unioned onto the original
    table. The original rows are returned unchanged.

    Args:
        scoring_table: DataFrame containing a `scenario` column and the three
            `*_pred` columns named above.
        hpi_percentage_increase: Percent change applied to HousePriceIndex_pred
            (10 means +10%, -10 means -10%).
        unemp_percentage_increase: Percent change applied to unemploymentrate_pred.
        cpi_percentage_change: Percent change applied to CPIInflationRate_pred.
        scenario: Label written to the `scenario` column of the appended rows.

    Returns:
        `scoring_table` with the relabelled, shocked baseline rows appended.

    Raises:
        ValueError: If `scenario` equals 'Supervisory Baseline'. The appended
            rows would then share a label with the rows they were copied from,
            and any per-scenario aggregation would count every loan twice.
            Labels of other scenarios already in the table are not checked here.
    """
    baseline_label = 'Supervisory Baseline'
    if scenario == baseline_label:
        raise ValueError(
            f"scenario must differ from '{baseline_label}': the shocked rows "
            "would be merged with the baseline rows they are derived from."
        )

    # Multiplicative factors derived from the percentage inputs.
    hpi_factor = 1 + hpi_percentage_increase / 100
    unemp_factor = 1 + unemp_percentage_increase / 100
    cpi_factor = 1 + cpi_percentage_change / 100

    # Start from the baseline rows only.
    shocked_rows = scoring_table.filter(col('scenario') == baseline_label)

    # Scale the three macro predictions. withColumn on an existing column name
    # replaces it in place, so the column order still matches scoring_table,
    # which the positional union below relies on.
    shocked_rows = (
        shocked_rows
        .withColumn('HousePriceIndex_pred', col('HousePriceIndex_pred') * hpi_factor)
        .withColumn('unemploymentrate_pred', col('unemploymentrate_pred') * unemp_factor)
        .withColumn('CPIInflationRate_pred', col('CPIInflationRate_pred') * cpi_factor)
    )

    # Relabel the shocked rows with the caller-supplied scenario name.
    shocked_rows = shocked_rows.withColumn('scenario', lit(scenario))

    # Append the shocked rows to the original table.
    return scoring_table.union(shocked_rows)

In [0]:
# Append the shocked baseline rows, labelled with the widget-supplied scenario name.
updated_scenario_data = update_and_append_scenario_data(
    scoring_table=df,
    hpi_percentage_increase=hpi_percentage_change,
    unemp_percentage_increase=unemp_percentage_change,
    cpi_percentage_change=cpi_percentage_change,
    scenario=scenario,
)

### Load the previously saved PD model

In [0]:
# %pip install mlflow[databricks]
import mlflow
# Select the "databricks-uc" model registry for subsequent MLflow registry calls
# (presumably Unity Catalog, given the URI name — confirm against the workspace setup).
mlflow.set_registry_uri("databricks-uc")


In [0]:
# Registry URI of the form models:/<catalog>.<schema>.<model>@<model_tag>;
# all four parts come from variables defined outside this cell.
model_path = f"models:/{catalog}.{output_schema}.{model_name}@{model_tag}"
# Load the registered model through MLflow's scikit-learn flavor.
loaded_model = mlflow.sklearn.load_model(model_uri=model_path)

In [0]:
def score_in_parallel(pdf):
    """Score one pandas DataFrame with `loaded_model` and attach the class probabilities.

    The six feature columns listed below are selected from `pdf` (in that
    order) and passed to `loaded_model.predict_proba`. Column 0 of the result
    is stored as `prob_class_0` and column 1 as `prob_class_1`. Both columns
    are added to `pdf` itself, and that same object is returned.
    """
    feature_names = [
        'unemploymentrate_pred',
        'HousePriceIndex_pred',
        'CPIInflationRate_pred',
        'credit_score',
        'remaining_balance_den',
        'curr_LTV',
    ]
    class_probabilities = loaded_model.predict_proba(pdf[feature_names])
    pdf['prob_class_0'], pdf['prob_class_1'] = (
        class_probabilities[:, 0],
        class_probabilities[:, 1],
    )
    return pdf

### Run the loaded model on the updated scenarios table

In [0]:
# Score each scenario's rows as one pandas batch. The schema string declares
# the columns of the frames returned by score_in_parallel, including the two
# probability columns it adds.
result = updated_scenario_data.groupBy("scenario").applyInPandas(
    score_in_parallel,
    schema=(
        "date date, "
        "unemploymentrate_pred double, "
        "CPIInflationRate_pred double, "
        "HousePriceIndex_pred double, "
        "loan_id string, "
        "credit_score int, "
        "remaining_balance_den double, "
        "scenario string, "
        "curr_LTV double, "
        "prob_class_0 double, "
        "prob_class_1 double"
    ),
)

### Calculate ECL by Scenarios

In [0]:
def calculate_ECL(data, LGD):
    """Return `data` with an added `ECL` column.

    ECL is computed per row as remaining_balance_den * prob_class_1 * LGD.
    """
    balance_times_probability = col('remaining_balance_den') * col('prob_class_1')
    ecl_expression = balance_times_probability * LGD
    return data.withColumn('ECL', ecl_expression)

In [0]:
# Flat LGD factor (presumably loss given default) applied to every row's ECL.
LGD = 0.20
# Add the ECL column to the scored scenarios.
predicted = calculate_ECL(data=result, LGD=LGD)


In [0]:
from pyspark.sql.functions import col, sum
from pyspark.sql.window import Window

# Frame for the running total: within each scenario, in date order, from the
# first row up to and including the current row.
running_total_window = (
    Window.partitionBy("scenario")
    .orderBy("date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Total ECL per (date, scenario), plus the cumulative ECL per scenario.
# `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
aggregated_df = (
    predicted
    .withColumn("date", col("date").cast("date"))
    .groupBy("date", "scenario")
    .agg(sum("ECL").alias("ECL"))
    .withColumn("cumulative_ecl", sum("ECL").over(running_total_window))
    .orderBy("date")
)

# Render the aggregated table in the notebook.
display(aggregated_df)

date,scenario,ECL,cumulative_ecl
2021-09-01,Custom Scenario,407604756.1647724,407604756.1647724
2021-09-01,Predicted by Databricks,431961210.01693547,431961210.01693547
2021-09-01,Supervisory Baseline,445362776.4754792,445362776.4754792
2021-09-01,Supervisory Severely Adverse,436164312.41357046,436164312.41357046
2021-10-01,Custom Scenario,418193848.7480434,825798604.9128158
2021-10-01,Predicted by Databricks,441816064.1674456,873777274.184381
2021-10-01,Supervisory Baseline,457589902.0429542,902952678.5184332
2021-10-01,Supervisory Severely Adverse,454701495.4674267,890865807.8809972
2021-11-01,Custom Scenario,429335302.4623767,1255133907.3751926
2021-11-01,Predicted by Databricks,452403470.6563406,1326180744.8407216


Databricks visualization. Run in Databricks to view.

In [0]:
# Total ECL per (date, scenario), ordered by date.
grouped = (
    predicted
    .groupBy('date', 'scenario')
    .agg(sum('ECL').alias('ECL'))
    .orderBy('date')
)
# grouped.display()

date,scenario,ECL
2021-09-01,Supervisory Severely Adverse,486249041.0924988
2021-09-01,Predicted by Databricks,432811229.11745673
2021-09-01,Supervisory Baseline,457766995.3227846
2021-09-01,Custom Scenario,417761417.6490449
2021-10-01,Supervisory Severely Adverse,509374635.5113331
2021-10-01,Predicted by Databricks,442174646.0614249
2021-10-01,Supervisory Baseline,467033854.7540709
2021-10-01,Custom Scenario,425710721.2886301
2021-11-01,Supervisory Severely Adverse,522697649.6543227
2021-11-01,Predicted by Databricks,452911605.69799167


Databricks visualization. Run in Databricks to view.

In [0]:
# from pyspark.sql.functions import countDistinct

# unique_loans_df = predicted.groupBy("date").agg(countDistinct("loan_id").alias("unique_loans")).orderBy("date")
# unique_loans_df.display()

date,unique_loans
2021-09-01,89977
2021-10-01,89977
2021-11-01,89977
2021-12-01,89973
2022-01-01,89967
2022-02-01,89961
2022-03-01,89952
2022-04-01,89946
2022-05-01,89938
2022-06-01,89928


In [0]:
# Persist the aggregated ECL table as a Delta table, replacing any previous contents.
(
    aggregated_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{output_schema}.ECL_sensitivity_results")
)