In [1]:
!pip install d3rlpy --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/721.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/721.7 kB[0m [31m24.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.1/201.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.8 MB/s[0m 

In [2]:
# =========================================
# 🚀 Offline Reinforcement Learning: Loan Approval
# =========================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from d3rlpy.algos import DiscreteCQLConfig
from d3rlpy.dataset import MDPDataset
from tensorflow.keras.models import load_model
import tensorflow as tf

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [3]:
# =========================================
# 1️⃣ Reward Function
# =========================================
def make_rewards(loan_amnt_arr, int_rate_arr, outcomes, actions):
    """
    Per-loan reward for a vector of approve/deny decisions.

    Args:
        loan_amnt_arr: loan principal per row (array-like, cast to float32).
        int_rate_arr: interest rate per row expressed in percent; it is
            divided by 100 before use.
        outcomes: 0 where the loan was fully paid, 1 where it defaulted.
        actions: 0 = deny, 1 = approve.

    Returns:
        float32 array holding, per row:
          * approve + fully paid -> loan_amnt * int_rate / 100
          * approve + defaulted  -> -loan_amnt
          * anything else (deny, or an outcome other than 0/1) -> 0
    """
    decision = np.array(actions, dtype=np.int32)
    outcome = np.array(outcomes, dtype=np.int32)
    principal = np.array(loan_amnt_arr, dtype=np.float32)
    rate_frac = np.array(int_rate_arr, dtype=np.float32) / 100.0

    approved = decision == 1
    # The two cases are mutually exclusive; rows matching neither keep the default of 0.
    cases = [approved & (outcome == 0), approved & (outcome == 1)]
    payoffs = [principal * rate_frac, -principal]
    return np.select(cases, payoffs, default=np.float32(0.0)).astype(np.float32)

In [6]:

# =========================================
# 2️⃣ Load Processed Data
# =========================================
import warnings

processed_df = pd.read_csv("/content/drive/MyDrive/loan_data_processed.csv")

# Model inputs: every numeric column except the label.
num_cols = processed_df.select_dtypes(include=[np.number]).columns.tolist()
assert 'target' in num_cols, "❌ 'target' column missing in processed data!"
num_cols.remove('target')

X = processed_df[num_cols].values.astype('float32')
y = processed_df['target'].values.astype(int)

# The reward function needs loan_amnt and int_rate in their original units.
if 'loan_amnt' not in processed_df.columns or 'int_rate' not in processed_df.columns:
    raise ValueError("❌ loan_amnt and int_rate columns must exist for reward computation!")

loan_amnt = processed_df['loan_amnt'].values
int_rate = processed_df['int_rate'].values

# Presence of the columns does not prove they are unscaled. make_rewards pays
# +loan_amnt * int_rate / 100 on repayment and -loan_amnt on default, so a
# non-positive loan amount or a negative rate (typical of standardized features)
# flips reward signs. Warn instead of raising so the notebook still runs.
_loan_check = np.asarray(loan_amnt, dtype=np.float32)
_rate_check = np.asarray(int_rate, dtype=np.float32)
if (_loan_check <= 0).any() or (_rate_check < 0).any():
    warnings.warn(
        "loan_amnt contains non-positive values and/or int_rate contains negative "
        "values; these columns look scaled. Rewards computed from them will not be "
        "in real currency units and may have the wrong sign. Load the raw "
        "loan_amnt / int_rate (percent) columns for reward computation."
    )

# Stratified 80/20 split; the reward inputs are split alongside X and y so rows stay aligned.
X_train, X_test, y_train, y_test, loan_train, loan_test, int_train, int_test = train_test_split(
    X, y, loan_amnt, int_rate, test_size=0.2, random_state=42, stratify=y
)

print(f"✅ Data split: Train {len(X_train)}, Test {len(X_test)}")


✅ Data split: Train 1078479, Test 269620


In [7]:
# =========================================
# 3️⃣ Build Offline Dataset (Augment Deny Actions)
# =========================================
# Every logged row is treated as an approved loan (action = 1). Each row is its own
# one-step episode, hence terminal = True everywhere.
actions_logged = np.ones((X_train.shape[0],), dtype=int)  # all historical approvals
rewards_logged = make_rewards(loan_train, int_train, y_train, actions_logged)
terminals = np.ones_like(rewards_logged, dtype=bool)

# Synthetic "deny" counterpart for every applicant: same observation, action 0, reward 0.
obs_aug = np.concatenate([X_train, X_train], axis=0)
actions_aug = np.concatenate([actions_logged, np.zeros_like(actions_logged)], axis=0)
rewards_aug = np.concatenate([rewards_logged, np.zeros_like(rewards_logged)], axis=0)
terminals_aug = np.concatenate([terminals, np.ones_like(terminals)], axis=0)

# All four arrays must describe the same transitions, row for row.
assert len(obs_aug) == len(actions_aug) == len(rewards_aug) == len(terminals_aug), \
    "augmented arrays are misaligned"

# Standardize rewards to stabilize training (epsilon avoids division by zero).
reward_mean, reward_std = rewards_aug.mean(), rewards_aug.std() + 1e-8
rewards_norm = (rewards_aug - reward_mean) / reward_std

# copy=False: the arrays that already have the target dtype are passed through
# without duplicating them (the observation matrix alone is rows x features float32).
dataset = MDPDataset(
    observations=obs_aug.astype(np.float32, copy=False),
    actions=actions_aug.astype(np.int32),
    rewards=rewards_norm.astype(np.float32, copy=False),
    terminals=terminals_aug.astype(bool, copy=False)
)

print(f"✅ Offline dataset ready: {dataset.transition_count} transitions")



[2m2025-10-30 09:48.54[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(80,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
[2m2025-10-30 09:48.54[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2025-10-30 09:49.02[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m2[0m
✅ Offline dataset ready: 2156958 transitions


In [9]:
# =========================================
# 4️⃣ Configure and Train Offline RL (Discrete CQL)
# =========================================
from d3rlpy.models.encoders import DefaultEncoderFactory
from d3rlpy.models.q_functions import QRQFunctionFactory
import torch  # d3rlpy trains with PyTorch, so GPU availability must be asked of torch

# Single source of truth: used by the config and by the steps-per-epoch estimate.
BATCH_SIZE = 512

config = DiscreteCQLConfig(
    learning_rate=1e-4,
    batch_size=BATCH_SIZE,
    encoder_factory=DefaultEncoderFactory(),
    q_func_factory=QRQFunctionFactory()   # Quantile regression for stability
    # NOTE: alpha_threshold is not a valid argument for DiscreteCQLConfig.
)

# Pick the device from the framework that will actually run the model. Asking
# TensorFlow can disagree with what PyTorch is able to use.
device = "cuda" if torch.cuda.is_available() else "cpu"
cql = config.create(device=device)

# One "epoch" is roughly one pass over the transitions; never let it drop to 0 steps,
# which would make the epoch bookkeeping in fit() meaningless on tiny datasets.
steps_per_epoch = max(1, dataset.transition_count // BATCH_SIZE)
n_epochs = 30
n_steps = n_epochs * steps_per_epoch

print("🚀 Starting CQL training ...")
cql.fit(
    dataset,
    n_steps=n_steps,
    n_steps_per_epoch=steps_per_epoch,
    show_progress=True,
    # NOTE: verbose is not a valid argument for fit().
)
print("✅ CQL training completed.")

🚀 Starting CQL training ...
[2m2025-10-30 09:53.13[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(80,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)[0m
[2m2025-10-30 09:53.13[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2025-10-30 09:53.13[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2025-10-30 09:53.13[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DiscreteCQL_20251030095313[0m
[2m2025-10-30 09:53.13[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [80], 'action_size': 2, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 512, 'gamma': 0.99, 'observation_scaler': {'typ

Epoch 1/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 09:54.18[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=1 step=4212[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009505764541462956, 'time_algorithm_update': 0.005917527227320223, 'loss': 3.0844543419445793, 'td_loss': 2.3498225343702863, 'conservative_loss': 0.7346318037393312, 'time_step': 0.015521183652415914}[0m [36mstep[0m=[35m4212[0m
[2m2025-10-30 09:54.18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_4212.d3[0m


Epoch 2/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 09:55.24[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=2 step=8424[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009510648329826276, 'time_algorithm_update': 0.005906505179427854, 'loss': 2.9975397919940585, 'td_loss': 2.263198749631898, 'conservative_loss': 0.73434104173951, 'time_step': 0.015513129383750112}[0m [36mstep[0m=[35m8424[0m
[2m2025-10-30 09:55.24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_8424.d3[0m


Epoch 3/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 09:56.30[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=3 step=12636[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009523530461509683, 'time_algorithm_update': 0.005904209749651091, 'loss': 2.982226348330832, 'td_loss': 2.250770377410896, 'conservative_loss': 0.7314559703397389, 'time_step': 0.015522322140754346}[0m [36mstep[0m=[35m12636[0m
[2m2025-10-30 09:56.30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_12636.d3[0m


Epoch 4/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 09:57.36[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=4 step=16848[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009647740314715495, 'time_algorithm_update': 0.005949564865398498, 'loss': 2.97592445811768, 'td_loss': 2.2458756748597506, 'conservative_loss': 0.7300487846447419, 'time_step': 0.015697215244188156}[0m [36mstep[0m=[35m16848[0m
[2m2025-10-30 09:57.36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_16848.d3[0m


Epoch 5/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 09:58.42[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=5 step=21060[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009536473160116082, 'time_algorithm_update': 0.005896254652246111, 'loss': 2.9769535402626732, 'td_loss': 2.2474617601742883, 'conservative_loss': 0.7294917795223388, 'time_step': 0.015527852070637239}[0m [36mstep[0m=[35m21060[0m
[2m2025-10-30 09:58.42[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_21060.d3[0m


Epoch 6/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 09:59.48[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=6 step=25272[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009463715304223448, 'time_algorithm_update': 0.005875946562967182, 'loss': 2.9632944761574667, 'td_loss': 2.234307576873024, 'conservative_loss': 0.7289869028505324, 'time_step': 0.015433306707615866}[0m [36mstep[0m=[35m25272[0m
[2m2025-10-30 09:59.48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_25272.d3[0m


Epoch 7/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:00.53[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=7 step=29484[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009462349038970889, 'time_algorithm_update': 0.005880623857746323, 'loss': 2.9680772563557567, 'td_loss': 2.2389397998990495, 'conservative_loss': 0.729137457022753, 'time_step': 0.015442480332264986}[0m [36mstep[0m=[35m29484[0m
[2m2025-10-30 10:00.53[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_29484.d3[0m


Epoch 8/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:02.00[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=8 step=33696[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009616854976498277, 'time_algorithm_update': 0.005967537392238606, 'loss': 2.965244644218021, 'td_loss': 2.235844206209989, 'conservative_loss': 0.7294004367485798, 'time_step': 0.015687637519293023}[0m [36mstep[0m=[35m33696[0m
[2m2025-10-30 10:02.00[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_33696.d3[0m


Epoch 9/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:03.06[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=9 step=37908[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009610333334007154, 'time_algorithm_update': 0.0059306033429710045, 'loss': 2.9604580416242396, 'td_loss': 2.231249955106891, 'conservative_loss': 0.7292080865881042, 'time_step': 0.01564267133375858}[0m [36mstep[0m=[35m37908[0m
[2m2025-10-30 10:03.06[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_37908.d3[0m


Epoch 10/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:04.13[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=10 step=42120[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00978110013184724, 'time_algorithm_update': 0.005980694565677914, 'loss': 2.9603095813128117, 'td_loss': 2.2310608844471793, 'conservative_loss': 0.7292486965684583, 'time_step': 0.015862967100804575}[0m [36mstep[0m=[35m42120[0m
[2m2025-10-30 10:04.13[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_42120.d3[0m


Epoch 11/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:05.19[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=11 step=46332[0m [36mepoch[0m=[35m11[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009542983198664014, 'time_algorithm_update': 0.0059137279265060495, 'loss': 2.961698774909928, 'td_loss': 2.2324812058483205, 'conservative_loss': 0.7292175709437101, 'time_step': 0.01555066519653016}[0m [36mstep[0m=[35m46332[0m
[2m2025-10-30 10:05.19[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_46332.d3[0m


Epoch 12/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:06.25[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=12 step=50544[0m [36mepoch[0m=[35m12[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009524702459771862, 'time_algorithm_update': 0.005912564362329409, 'loss': 2.9578973247341507, 'td_loss': 2.2285582313397216, 'conservative_loss': 0.7293390935076381, 'time_step': 0.015534307259326873}[0m [36mstep[0m=[35m50544[0m
[2m2025-10-30 10:06.25[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_50544.d3[0m


Epoch 13/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:07.30[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=13 step=54756[0m [36mepoch[0m=[35m13[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009446750282675226, 'time_algorithm_update': 0.0059202190025126715, 'loss': 2.954076751967554, 'td_loss': 2.2246191992662343, 'conservative_loss': 0.7294575531117031, 'time_step': 0.015459295634643757}[0m [36mstep[0m=[35m54756[0m
[2m2025-10-30 10:07.30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_54756.d3[0m


Epoch 14/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:08.36[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=14 step=58968[0m [36mepoch[0m=[35m14[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009445694154031143, 'time_algorithm_update': 0.005897819429834117, 'loss': 2.9638511669828227, 'td_loss': 2.2343410127862566, 'conservative_loss': 0.7295101544371357, 'time_step': 0.015438543142642957}[0m [36mstep[0m=[35m58968[0m
[2m2025-10-30 10:08.36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_58968.d3[0m


Epoch 15/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:09.41[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=15 step=63180[0m [36mepoch[0m=[35m15[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009464014516143943, 'time_algorithm_update': 0.005880149737608965, 'loss': 2.95618550823285, 'td_loss': 2.226857902062924, 'conservative_loss': 0.7293276069482394, 'time_step': 0.015438200288575146}[0m [36mstep[0m=[35m63180[0m
[2m2025-10-30 10:09.41[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_63180.d3[0m


Epoch 16/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:10.47[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=16 step=67392[0m [36mepoch[0m=[35m16[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009463514584308563, 'time_algorithm_update': 0.0058999299889842555, 'loss': 2.9590245271340394, 'td_loss': 2.2298389777385497, 'conservative_loss': 0.7291855489426529, 'time_step': 0.015454903457239483}[0m [36mstep[0m=[35m67392[0m
[2m2025-10-30 10:10.47[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_67392.d3[0m


Epoch 17/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:11.52[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=17 step=71604[0m [36mepoch[0m=[35m17[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009434286968434073, 'time_algorithm_update': 0.0059170262199974245, 'loss': 2.9507386856778735, 'td_loss': 2.221498932792024, 'conservative_loss': 0.7292397519660245, 'time_step': 0.01544405478923975}[0m [36mstep[0m=[35m71604[0m
[2m2025-10-30 10:11.52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_71604.d3[0m


Epoch 18/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:12.58[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=18 step=75816[0m [36mepoch[0m=[35m18[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00946414878225734, 'time_algorithm_update': 0.0059201249256659325, 'loss': 2.9471689605022315, 'td_loss': 2.217719998369869, 'conservative_loss': 0.7294489599813876, 'time_step': 0.01547723051942425}[0m [36mstep[0m=[35m75816[0m
[2m2025-10-30 10:12.58[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_75816.d3[0m


Epoch 19/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:14.03[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=19 step=80028[0m [36mepoch[0m=[35m19[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00945536555954081, 'time_algorithm_update': 0.005899959366772137, 'loss': 2.944363537676993, 'td_loss': 2.2150680779674907, 'conservative_loss': 0.7292954581387249, 'time_step': 0.015447910185213442}[0m [36mstep[0m=[35m80028[0m
[2m2025-10-30 10:14.03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_80028.d3[0m


Epoch 20/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:15.09[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=20 step=84240[0m [36mepoch[0m=[35m20[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009430668689133661, 'time_algorithm_update': 0.005906573218158168, 'loss': 2.9528897305612665, 'td_loss': 2.2232821526577218, 'conservative_loss': 0.729607576148802, 'time_step': 0.015427887043495576}[0m [36mstep[0m=[35m84240[0m
[2m2025-10-30 10:15.09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_84240.d3[0m


Epoch 21/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:16.14[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=21 step=88452[0m [36mepoch[0m=[35m21[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0094299988302863, 'time_algorithm_update': 0.005897563067596523, 'loss': 2.9515121900684362, 'td_loss': 2.221989299893266, 'conservative_loss': 0.729522891307262, 'time_step': 0.015419362050968471}[0m [36mstep[0m=[35m88452[0m
[2m2025-10-30 10:16.14[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_88452.d3[0m


Epoch 22/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:17.20[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=22 step=92664[0m [36mepoch[0m=[35m22[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009544430861332704, 'time_algorithm_update': 0.005967800490423712, 'loss': 2.9460201550413063, 'td_loss': 2.216581157441379, 'conservative_loss': 0.7294389993405184, 'time_step': 0.01560654095661493}[0m [36mstep[0m=[35m92664[0m
[2m2025-10-30 10:17.20[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_92664.d3[0m


Epoch 23/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:18.26[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=23 step=96876[0m [36mepoch[0m=[35m23[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009486560128692888, 'time_algorithm_update': 0.005919826959046657, 'loss': 2.943212130074261, 'td_loss': 2.213929661573508, 'conservative_loss': 0.7292824700290774, 'time_step': 0.01549836848875736}[0m [36mstep[0m=[35m96876[0m
[2m2025-10-30 10:18.26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_96876.d3[0m


Epoch 24/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:19.31[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=24 step=101088[0m [36mepoch[0m=[35m24[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009491374916047679, 'time_algorithm_update': 0.005942597914511888, 'loss': 2.944846249968238, 'td_loss': 2.215240249436805, 'conservative_loss': 0.7296060020880595, 'time_step': 0.015529435867371157}[0m [36mstep[0m=[35m101088[0m
[2m2025-10-30 10:19.31[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_101088.d3[0m


Epoch 25/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:20.37[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=25 step=105300[0m [36mepoch[0m=[35m25[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009451798790427242, 'time_algorithm_update': 0.005913335260389424, 'loss': 2.9409387252609274, 'td_loss': 2.211638442301682, 'conservative_loss': 0.7293002836385004, 'time_step': 0.015459264049276441}[0m [36mstep[0m=[35m105300[0m
[2m2025-10-30 10:20.37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_105300.d3[0m


Epoch 26/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:21.43[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=26 step=109512[0m [36mepoch[0m=[35m26[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009469476916737933, 'time_algorithm_update': 0.005929954880662793, 'loss': 2.938414937382297, 'td_loss': 2.208871046022812, 'conservative_loss': 0.729543887595279, 'time_step': 0.015492902522073512}[0m [36mstep[0m=[35m109512[0m
[2m2025-10-30 10:21.43[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_109512.d3[0m


Epoch 27/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:22.48[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=27 step=113724[0m [36mepoch[0m=[35m27[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009484686403193025, 'time_algorithm_update': 0.0059567688197491855, 'loss': 2.9419270779615565, 'td_loss': 2.212432403538415, 'conservative_loss': 0.7294946742108744, 'time_step': 0.015533825214545384}[0m [36mstep[0m=[35m113724[0m
[2m2025-10-30 10:22.48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_113724.d3[0m


Epoch 28/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:23.55[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=28 step=117936[0m [36mepoch[0m=[35m28[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00967027562522707, 'time_algorithm_update': 0.006027212453137549, 'loss': 2.9372969163618876, 'td_loss': 2.2081768614331088, 'conservative_loss': 0.7291200540655586, 'time_step': 0.015798817955643817}[0m [36mstep[0m=[35m117936[0m
[2m2025-10-30 10:23.55[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_117936.d3[0m


Epoch 29/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:25.02[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=29 step=122148[0m [36mepoch[0m=[35m29[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009598955469593363, 'time_algorithm_update': 0.005988405923671306, 'loss': 2.9347925457576287, 'td_loss': 2.2053945562665165, 'conservative_loss': 0.7293979893212984, 'time_step': 0.015690218689095262}[0m [36mstep[0m=[35m122148[0m
[2m2025-10-30 10:25.02[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_122148.d3[0m


Epoch 30/30:   0%|          | 0/4212 [00:00<?, ?it/s]

[2m2025-10-30 10:26.08[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030095313: epoch=30 step=126360[0m [36mepoch[0m=[35m30[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009487925261853427, 'time_algorithm_update': 0.005953567546543799, 'loss': 2.938644832072661, 'td_loss': 2.209187679962102, 'conservative_loss': 0.7294571518416871, 'time_step': 0.015535646863812395}[0m [36mstep[0m=[35m126360[0m
[2m2025-10-30 10:26.08[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030095313/model_126360.d3[0m
✅ CQL training completed.


In [10]:
# =========================================
# 5️⃣ Evaluate RL Policy vs Baselines
# =========================================
# Actions chosen by the trained policy for each held-out applicant, scored with the
# same reward function used to build the training set.
policy_actions = cql.predict(X_test)
policy_rewards = make_rewards(loan_test, int_test, y_test, policy_actions)

total_reward = policy_rewards.sum()
avg_reward = policy_rewards.mean()

print(
    "\n🎯 RL Policy Evaluation:",
    f"Avg Reward per Loan: {avg_reward:.4f}",
    f"Total Expected Reward: {total_reward:.2f}",
    f"Approval Rate (RL policy): {policy_actions.mean():.2f}",
    sep="\n",
)

# Reference points: approve everyone vs. deny everyone on the same test rows.
always_approve_rewards = make_rewards(
    loan_test, int_test, y_test, np.full_like(policy_actions, 1)
)
always_deny_rewards = make_rewards(
    loan_test, int_test, y_test, np.full_like(policy_actions, 0)
)

print(
    "\n🧩 Baseline Comparisons:",
    f"Always-Approve Avg Reward: {always_approve_rewards.mean():.4f}",
    f"Always-Deny Avg Reward: {always_deny_rewards.mean():.4f}",
    sep="\n",
)



🎯 RL Policy Evaluation:
Avg Reward per Loan: 0.0691
Total Expected Reward: 18621.95
Approval Rate (RL policy): 0.56

🧩 Baseline Comparisons:
Always-Approve Avg Reward: -0.0253
Always-Deny Avg Reward: 0.0000


In [11]:
# =========================================
# 6️⃣ Deep Learning Baseline Policy
# =========================================
# The DL baseline is optional: any failure is reported and the notebook carries on.
# Loading and evaluation are handled separately so the message names the real cause
# instead of labelling every error "model not found".
try:
    dl_model = load_model("/content/drive/MyDrive/nn_best_model.h5")
except Exception as e:
    dl_model = None
    print(f"⚠️ Skipping DL baseline (could not load model, {type(e).__name__}):", e)

if dl_model is not None:
    try:
        dl_probs = dl_model.predict(X_test, verbose=0).reshape(-1)
        # One score per applicant is required; a multi-output head would silently
        # misalign scores and loans after the flatten above.
        if dl_probs.shape[0] != X_test.shape[0]:
            raise ValueError(
                f"expected {X_test.shape[0]} predictions, got {dl_probs.shape[0]}"
            )

        # Approve if predicted default probability is below threshold
        threshold = 0.5
        dl_actions = (dl_probs < threshold).astype(int)

        dl_rewards = make_rewards(loan_test, int_test, y_test, dl_actions)
    except Exception as e:
        print(f"⚠️ Skipping DL baseline (evaluation failed, {type(e).__name__}):", e)
    else:
        print("\n🤖 Deep Learning Policy Baseline:")
        print(f"Threshold: {threshold}")
        print(f"DL Avg Reward: {dl_rewards.mean():.4f}")
        print(f"DL Total Reward: {dl_rewards.sum():.2f}")
        print(f"DL Approval Rate: {dl_actions.mean():.2f}")




🤖 Deep Learning Policy Baseline:
Threshold: 0.5
DL Avg Reward: 0.0098
DL Total Reward: 2652.09
DL Approval Rate: 0.58


In [12]:
# =========================================
# 7️⃣ Summary
# =========================================
# Side-by-side table: one row per policy, columns = mean reward, summed reward, approval share.
print("\n📊 Summary of Policies:")
print(f"{'Policy':<25}{'Avg Reward':>15}{'Total Reward':>20}{'Approve %':>15}")
print('-' * 75)

# The DL baseline cell may have been skipped; any figure whose source variable
# does not exist is shown as NaN.
dl_avg = dl_rewards.mean() if 'dl_rewards' in locals() else np.nan
dl_total = dl_rewards.sum() if 'dl_rewards' in locals() else np.nan
dl_rate = dl_actions.mean() if 'dl_actions' in locals() else np.nan

summary_rows = (
    ('RL (CQL) Policy', avg_reward, total_reward, policy_actions.mean()),
    ('DL Baseline', dl_avg, dl_total, dl_rate),
    ('Always Approve', always_approve_rewards.mean(), always_approve_rewards.sum(), 1.00),
    ('Always Deny', always_deny_rewards.mean(), always_deny_rewards.sum(), 0.00),
)
for row_label, row_avg, row_total, row_rate in summary_rows:
    print(f"{row_label:<25}{row_avg:>15.4f}{row_total:>20.2f}{row_rate:>15.2f}")



📊 Summary of Policies:
Policy                        Avg Reward        Total Reward      Approve %
---------------------------------------------------------------------------
RL (CQL) Policy                   0.0691            18621.95           0.56
DL Baseline                       0.0098             2652.09           0.58
Always Approve                   -0.0253            -6834.06           1.00
Always Deny                       0.0000                0.00           0.00
