In [0]:
%run ./config_and_imports

Installing prophet...
Collecting prophet
  Using cached prophet-1.1.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting cmdstanpy>=1.0.4 (from prophet)
  Using cached cmdstanpy-1.2.5-py3-none-any.whl.metadata (4.0 kB)
Collecting holidays<1,>=0.25 (from prophet)
  Using cached holidays-0.70-py3-none-any.whl.metadata (34 kB)
Collecting tqdm>=4.36.1 (from prophet)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting importlib-resources (from prophet)
  Using cached importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Collecting stanio<2.0.0,>=0.4.0 (from cmdstanpy>=1.0.4->prophet)
  Using cached stanio-0.5.1-py3-none-any.whl.metadata (1.6 kB)
Using cached prophet-1.1.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.4 MB)
Using cached cmdstanpy-1.2.5-py3-none-any.whl (94 kB)
Using cached holidays-0.70-py3-none-any.whl (903 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Using cached importlib_resources-6.5.2-p

DataFrame[]

### 1. Load Scenarios Data

In [0]:
# Load the scenarios table as a Spark DataFrame and preview it.
# The table name is unqualified, so it resolves against the session's
# current catalog/schema (presumably set by ./config_and_imports — confirm).
df = spark.read.table("gold_scenarios")
display(df)

date,unemploymentrate_pred,CPIInflationRate_pred,HousePriceIndex_pred,loan_id,credit_score,remaining_balance_den,curr_LTV,scenario
2022-07-01,3.7,2.5,273.8,F20Q40245783,807,42684.73277645811,0.054,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40246136,757,31258.41967248011,0.134,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40246415,764,22278.5559965704,0.053,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40246467,859,18850.459628548266,0.062,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40246475,825,52177.51175403023,0.049,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40246658,759,23737.54219516837,0.051,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40246835,787,17817.44779438286,0.128,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40246891,659,19122.310640252123,0.136,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40248195,788,21361.28648890929,0.129,Supervisory Baseline
2022-07-01,3.7,2.5,273.8,F20Q40248218,764,32045.68917293792,0.051,Supervisory Baseline


In [0]:
# Row count of the scenarios table. `count()` is a Spark action, so this
# triggers a full scan of `df`; the recorded run returned ~16.6M rows.
df.count()

16615416

In [0]:
# Earliest and latest scenario dates, computed in a single aggregation pass.
# `min` / `max` are expected to be the Spark column functions (not the Python
# builtins) — presumably imported by ./config_and_imports.
date_range = df.agg(
    min("date").alias("start_date"),
    max("date").alias("end_date"),
).first()

# Report the covered period
print(f"Date range: {date_range['start_date']} to {date_range['end_date']}")

Date range: 2021-09-01 to 2026-10-01


### 2. Load the previously saved PD model

In [0]:
import mlflow
# Point the MLflow registry client at "databricks-uc" (the Unity Catalog model
# registry) so the `models:/...` URI built in the next cell is resolved there.
mlflow.set_registry_uri("databricks-uc")

In [0]:
# Registry URI of the form models:/<catalog>.<schema>.<model>@<alias>.
# `catalog`, `output_schema`, `model_name` and `model_tag` are not defined in
# this notebook's cells — presumably they come from ./config_and_imports.
# NOTE(review): the `@` suffix is the alias form of a model URI, so `model_tag`
# is expected to hold a registered-model alias — confirm.
model_path = f"models:/{catalog}.{output_schema}.{model_name}@{model_tag}"
# Load the model through the scikit-learn flavor; `score_in_parallel` reads this
# module-level `loaded_model` when it is shipped to the Spark workers.
loaded_model = mlflow.sklearn.load_model(model_path)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

In [0]:
def score_in_parallel(pdf):
  """Score one pandas batch with the module-level `loaded_model`.

  Intended as the function passed to `groupby(...).applyInPandas(...)`.

  Args:
    pdf: pandas DataFrame holding at least the six feature columns listed
      below. Any other columns are passed through untouched.

  Returns:
    The same DataFrame object (modified in place) with two extra columns,
    `prob_class_0` and `prob_class_1`, taken from columns 0 and 1 of
    `loaded_model.predict_proba`.
  """
  # Feature columns, in the order they are handed to the model.
  # NOTE(review): presumably this must match the training-time order — confirm.
  feature_names = [
      'unemploymentrate_pred', 'HousePriceIndex_pred', 'CPIInflationRate_pred',
      'credit_score', 'remaining_balance_den', 'curr_LTV',
  ]
  class_probs = loaded_model.predict_proba(pdf[feature_names])

  # Only the first two probability columns are kept (assumes a binary
  # classifier — TODO confirm).
  for class_index, column_name in enumerate(('prob_class_0', 'prob_class_1')):
    pdf[column_name] = class_probs[:, class_index]
  return pdf

### 4. Score on the Scenarios Data in parallel

In [0]:
# Score every scenario as its own pandas batch. applyInPandas loads all rows
# of a group into memory on one worker, so each scenario must fit in memory.
# The schema lists the input columns plus the two probability columns added
# by score_in_parallel.
result = (
    df.groupby("scenario")
    .applyInPandas(
        score_in_parallel,
        schema="date date, unemploymentrate_pred double, CPIInflationRate_pred double, HousePriceIndex_pred double, loan_id string, credit_score int, remaining_balance_den double, scenario string, curr_LTV double, prob_class_0 double, prob_class_1 double",
    )
)

In [0]:
# Alternative partitioning: score one loan per pandas batch instead of one
# scenario per batch. To use it, pass `result_by_loan` (instead of `result`)
# to calculate_ECL and re-run the cells below.
# The output schema must name every column score_in_parallel returns. That
# includes `scenario`, which the ECL aggregations group on; it was missing here.
result_by_loan = df.groupby("loan_id").applyInPandas(
    score_in_parallel, schema="date date, unemploymentrate_pred double, CPIInflationRate_pred double, HousePriceIndex_pred double, loan_id string, credit_score int, remaining_balance_den double, scenario string, curr_LTV double, prob_class_0 double, prob_class_1 double")

### 5. Calculate ECL by Scenarios

In [0]:
def calculate_ECL(data, LGD):
    """Append an `ECL` column to a scored Spark DataFrame.

    The column is computed per row as
    `remaining_balance_den * prob_class_1 * LGD`.

    Args:
        data: DataFrame containing `remaining_balance_den` and `prob_class_1`.
        LGD: multiplier applied to every row (presumably the loss-given-default
            fraction — confirm).

    Returns:
        `data` with the additional `ECL` column.
    """
    expected_loss = col('remaining_balance_den') * col('prob_class_1') * LGD
    return data.withColumn('ECL', expected_loss)

In [0]:
# Single flat LGD multiplier applied to every loan in every scenario.
# NOTE(review): 0.20 is hard-coded here — confirm the value and whether it
# belongs in the shared config.
LGD = 0.20
# Add the per-row `ECL` column to the scenario-scored results.
predicted = calculate_ECL(result, LGD)


#### 5.1 Cumulative ECL over time

In [0]:
# Running-total window: within each scenario, ordered by date, covering every
# row from the first one up to and including the current row.
running_window = (
    Window.partitionBy("scenario")
    .orderBy("date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Total ECL per (date, scenario)
ecl_by_date_scenario = (
    predicted.withColumn("date", col("date").cast("date"))
    .groupBy("date", "scenario")
    .agg(sum("ECL").alias("ECL"))
)

# Add the cumulative ECL per scenario and sort chronologically for display
aggregated_df = (
    ecl_by_date_scenario
    .withColumn("cumulative_ecl", sum("ECL").over(running_window))
    .orderBy("date")
)

# Display the result
display(aggregated_df)

date,scenario,ECL,cumulative_ecl
2021-09-01,Predicted by Databricks,431961210.0169278,431961210.0169278
2021-09-01,Supervisory Baseline,445362776.4754792,445362776.4754792
2021-09-01,Supervisory Severely Adverse,436164312.41357046,436164312.41357046
2021-10-01,Predicted by Databricks,441816064.1674464,873777274.1843741
2021-10-01,Supervisory Baseline,457589902.0429542,902952678.5184332
2021-10-01,Supervisory Severely Adverse,454701495.4674267,890865807.8809972
2021-11-01,Predicted by Databricks,452403470.65633583,1326180744.84071
2021-11-01,Supervisory Baseline,469677920.9592435,1372630599.4776769
2021-11-01,Supervisory Severely Adverse,466720642.7895648,1357586450.670562
2021-12-01,Predicted by Databricks,463362615.4552068,1789543360.2959168


Databricks visualization. Run in Databricks to view.

#### 5.2 ECL by month

In [0]:
# Total ECL per date and scenario (no running total), sorted by date
grouped = (
    predicted
    .groupBy('date', 'scenario')
    .agg(sum('ECL').alias('ECL'))
    .orderBy('date')
)
display(grouped)

date,scenario,ECL
2021-09-01,Supervisory Severely Adverse,436164312.41357046
2021-09-01,Supervisory Baseline,445362776.4754792
2021-09-01,Predicted by Databricks,431961210.0169278
2021-10-01,Supervisory Severely Adverse,454701495.4674267
2021-10-01,Supervisory Baseline,457589902.0429542
2021-10-01,Predicted by Databricks,441816064.1674464
2021-11-01,Supervisory Severely Adverse,466720642.7895648
2021-11-01,Supervisory Baseline,469677920.9592435
2021-11-01,Predicted by Databricks,452403470.65633583
2021-12-01,Supervisory Severely Adverse,478848582.9450377


Databricks visualization. Run in Databricks to view.

In [0]:
# Persist the per-date / per-scenario ECL (with running totals) as a Delta
# table, replacing any previous contents of the table.
(
    aggregated_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{output_schema}.ECL_scenario_results")
)