In [19]:
# import packages
import sys
import os
sys.path.append(os.path.abspath('.'))

from pages.A_Explore_Preprocess_Dataset import load_dataset

In [20]:
# load dataset
filepath = "mortgage_3000_onehot.csv"
df = load_dataset(filepath)
df.head()



Unnamed: 0.1,Unnamed: 0,pages_viewed,time_on_site_sec,propensity_score,bounced,started_application,used_calculator,completed_application,user_type_New,user_type_Returning,...,device_type_Tablet,entry_page_Calculator,entry_page_Contact,entry_page_Home,entry_page_Learn,entry_page_Rates,geo_region_Midwest,geo_region_Northeast,geo_region_South,geo_region_West
0,0,3,260,0.28,0,1,1,1,1,0,...,0,0,0,0,0,1,0,1,0,0
1,1,1,740,0.93,1,1,0,1,0,1,...,0,0,0,0,1,0,0,1,0,0
2,2,10,516,0.2,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,1,0
3,3,5,458,0.75,1,0,1,0,1,0,...,0,0,1,0,0,0,1,0,0,0
4,4,5,755,0.65,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1


In [21]:
import numpy as np
import pandas as pd
from pages.B_Train_Model import split_dataset, LinearRegression
from pages.C_Test_Model import rmse, mae, r2, compute_eval_metrics

# split dataset
target = 'propensity_score'
features = [col for col in df.select_dtypes(include='number').columns if col != target]

X = df[features].values
Y = df[[target]].values

X_train, X_val, y_train, y_val = split_dataset(X, Y, number=30)

# train 
model = LinearRegression(learning_rate=0.01, num_iterations=200)
model.fit(X_train, y_train)

def test(model):
    # predict
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # evaluate
    metrics = ['mean_absolute_error', 'root_mean_squared_error', 'r2_score']
    train_metrics = compute_eval_metrics(X_train, y_train, model, metrics)
    val_metrics = compute_eval_metrics(X_val, y_val, model, metrics)

    print("Train Metrics:")
    print(train_metrics)

    print("\nValidation Metrics:")
    print(val_metrics)

test(model)



Train Metrics:
{'mean_absolute_error': 0.25376739393787084, 'root_mean_squared_error': 0.2921705908162253, 'r2_score': 0.00591988776325314}

Validation Metrics:
{'mean_absolute_error': 0.25152548471012465, 'root_mean_squared_error': 0.29078815153043114, 'r2_score': -0.010938289155578484}


In [22]:
# grid search for best learning rate - on normal Linear Regression
from pages.B_Train_Model import LinearRegression

learning_rates = [0.0001, 0.001, 0.01, 0.1, 0.5]
best_rmse = float('inf')
best_lr = None
best_model = None

for lr in learning_rates:
    model = LinearRegression(learning_rate=lr, num_iterations=200)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"Learning Rate: {lr}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_lr = lr
        best_model = model

print(f"\nBest Learning Rate: {best_lr}, RMSE: {best_rmse:.4f}")



Learning Rate: 0.0001, RMSE: 0.5553




Learning Rate: 0.001, RMSE: 0.4371




Learning Rate: 0.01, RMSE: 0.2908




Learning Rate: 0.1, RMSE: 0.2910




Learning Rate: 0.5, RMSE: 154.0592

Best Learning Rate: 0.01, RMSE: 0.2908


In [23]:
# grid search to choose l1 and l2 penalty
from pages.B_Train_Model import LassoRegression

l1_penalties = [0.01, 0.1, 0.5, 1.0]
best_rmse = float('inf')
best_l1 = None
best_lasso_model = None

for l1 in l1_penalties:
    model = LassoRegression(learning_rate=best_lr, num_iterations=200, l1_penalty=l1)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"L1 Penalty: {l1}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_l1 = l1
        best_lasso_model = model

print(f"\nBest L1 Penalty: {best_l1}, RMSE: {best_rmse:.4f}")



L1 Penalty: 0.01, RMSE: 0.2908




L1 Penalty: 0.1, RMSE: 0.2908




L1 Penalty: 0.5, RMSE: 0.2907




L1 Penalty: 1.0, RMSE: 0.2907

Best L1 Penalty: 1.0, RMSE: 0.2907


In [24]:
from pages.B_Train_Model import RidgeRegression

l2_penalties = [0.01, 0.1, 0.5, 1.0]
best_rmse = float('inf')
best_l2 = None
best_ridge_model = None

for l2 in l2_penalties:
    model = RidgeRegression(learning_rate=best_lr, num_iterations=200, l2_penalty=l2)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"L2 Penalty: {l2}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_l2 = l2
        best_ridge_model = model

print(f"\nBest L2 Penalty: {best_l2}, RMSE: {best_rmse:.4f}")



L2 Penalty: 0.01, RMSE: 0.2908




L2 Penalty: 0.1, RMSE: 0.2908




L2 Penalty: 0.5, RMSE: 0.2908




L2 Penalty: 1.0, RMSE: 0.2908

Best L2 Penalty: 0.01, RMSE: 0.2908


In [25]:
from pages.B_Train_Model import LassoRegression

lasso_model = LassoRegression(learning_rate=0.01, num_iterations=200, l1_penalty=best_l1)
lasso_model.fit(X_train, y_train)

test(lasso_model)



Train Metrics:
{'mean_absolute_error': 0.25379701592179515, 'root_mean_squared_error': 0.29218071983712485, 'r2_score': 0.005850960683751794}

Validation Metrics:
{'mean_absolute_error': 0.2514341392381936, 'root_mean_squared_error': 0.29067597980260507, 'r2_score': -0.01015849931743329}


In [26]:
from pages.B_Train_Model import RidgeRegression

ridge_model = RidgeRegression(learning_rate=best_lr, num_iterations=200, l2_penalty=best_l2)
ridge_model.fit(X_train, y_train)

test(ridge_model)



Train Metrics:
{'mean_absolute_error': 0.25376736170538194, 'root_mean_squared_error': 0.2921705244010992, 'r2_score': 0.005920339704349353}

Validation Metrics:
{'mean_absolute_error': 0.2515255592108344, 'root_mean_squared_error': 0.29078817303829113, 'r2_score': -0.01093843870169886}


In [27]:
!pip install -q matplotlib

In [27]:
# plot loss curves
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(model.cost_history, label='Linear Regression', linewidth=2)
plt.plot(lasso_model.cost_history, label='Lasso Regression', linestyle='--')
plt.plot(ridge_model.cost_history, label='Ridge Regression', linestyle=':')
plt.title('Loss Curves Over Iterations')
plt.xlabel('Iteration')
plt.ylabel('Loss (RMSE)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# save plot as image
plt.savefig('loss_curves.png', dpi=300)

  plt.show()


In [31]:
np.array(ridge_model.cost_history)-np.array(lasso_model.cost_history)

array([ 0.00000000e+00,  0.00000000e+00, -5.19776763e-06, -1.02085657e-05,
       -1.50403291e-05, -1.96937496e-05, -2.41733539e-05, -2.84828753e-05,
       -3.26248207e-05, -3.66032630e-05, -4.04212783e-05, -4.40814271e-05,
       -4.75880047e-05, -5.09421550e-05, -5.41493481e-05, -5.72110118e-05,
       -6.01290325e-05, -6.29085404e-05, -6.55521772e-05, -6.80597092e-05,
       -7.04373943e-05, -7.26879434e-05, -7.48118593e-05, -7.68065456e-05,
       -7.86889693e-05, -8.04526534e-05, -8.21034226e-05, -8.36407641e-05,
       -8.50665560e-05, -8.63901551e-05, -8.76082232e-05, -8.87274715e-05,
       -8.97487097e-05, -9.06746492e-05, -9.15009701e-05, -9.22444176e-05,
       -9.29006968e-05, -9.34697594e-05, -9.39596593e-05, -9.43704574e-05,
       -9.47048141e-05, -9.49622744e-05, -9.51470986e-05, -9.52679626e-05,
       -9.53195876e-05, -9.53110059e-05, -9.52415658e-05, -9.51137697e-05,
       -9.49300882e-05, -9.46848652e-05, -9.43967429e-05, -9.40599423e-05,
       -9.36767829e-05, -

Hyperparameter Search (Learning Rate, L1 Penalty, L2 Penalty)

In [None]:
# grid search for best learning rate - on normal Linear Regression
from pages.B_Train_Model import LinearRegression

learning_rates = [0.0001, 0.001, 0.01, 0.1, 0.5]
best_rmse = float('inf')
best_lr = None
best_model = None

for lr in learning_rates:
    model = LinearRegression(learning_rate=lr, num_iterations=200)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"Learning Rate: {lr}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_lr = lr
        best_model = model

print(f"\nBest Learning Rate: {best_lr}, RMSE: {best_rmse:.4f}")



Learning Rate: 0.0001, RMSE: 0.5603




Learning Rate: 0.001, RMSE: 0.4926




Learning Rate: 0.01, RMSE: 0.3000




Learning Rate: 0.1, RMSE: 0.2973




Learning Rate: 0.5, RMSE: 0.2987

Best Learning Rate: 0.1, RMSE: 0.2973


In [None]:
# grid search to choose l1 and l2 penalty
from pages.B_Train_Model import LassoRegression

l1_penalties = [0.01, 0.1, 0.5, 1.0]
best_rmse = float('inf')
best_l1 = None
best_lasso_model = None

for l1 in l1_penalties:
    model = LassoRegression(learning_rate=best_lr, num_iterations=200, l1_penalty=l1)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"L1 Penalty: {l1}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_l1 = l1
        best_lasso_model = model

print(f"\nBest L1 Penalty: {best_l1}, RMSE: {best_rmse:.4f}")



L1 Penalty: 0.01, RMSE: 0.2973




L1 Penalty: 0.1, RMSE: 0.2973




L1 Penalty: 0.5, RMSE: 0.2972




L1 Penalty: 1.0, RMSE: 0.2972

Best L1 Penalty: 1.0, RMSE: 0.2972


In [None]:
from pages.B_Train_Model import RidgeRegression

l2_penalties = [0.01, 0.1, 0.5, 1.0]
best_rmse = float('inf')
best_l2 = None
best_ridge_model = None

for l2 in l2_penalties:
    model = RidgeRegression(learning_rate=best_lr, num_iterations=200, l2_penalty=l2)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"L2 Penalty: {l2}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_l2 = l2
        best_ridge_model = model

print(f"\nBest L2 Penalty: {best_l2}, RMSE: {best_rmse:.4f}")



L2 Penalty: 0.01, RMSE: 0.2973




L2 Penalty: 0.1, RMSE: 0.2973




L2 Penalty: 0.5, RMSE: 0.2973




L2 Penalty: 1.0, RMSE: 0.2973

Best L2 Penalty: 0.01, RMSE: 0.2973
