In [None]:
# import packages
import sys
import os
sys.path.append(os.path.abspath('.'))

from pages.A_Explore_Preprocess_Dataset import load_dataset



In [5]:
# load dataset
filepath = "mortgage_3000_onehot.csv"
df = load_dataset(filepath)
df.head()



Unnamed: 0.1,Unnamed: 0,pages_viewed,time_on_site_sec,propensity_score,bounced,started_application,used_calculator,completed_application,user_type_New,user_type_Returning,...,device_type_Tablet,entry_page_Calculator,entry_page_Contact,entry_page_Home,entry_page_Learn,entry_page_Rates,geo_region_Midwest,geo_region_Northeast,geo_region_South,geo_region_West
0,0,3,260,0.28,0,1,1,1,1,0,...,0,0,0,0,0,1,0,1,0,0
1,1,1,740,0.93,1,1,0,1,0,1,...,0,0,0,0,1,0,0,1,0,0
2,2,10,516,0.2,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,1,0
3,3,5,458,0.75,1,0,1,0,1,0,...,0,0,1,0,0,0,1,0,0,0
4,4,5,755,0.65,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1


In [30]:
import numpy as np
import pandas as pd
from pages.B_Train_Model import split_dataset, LinearRegression
from pages.C_Test_Model import rmse, mae, r2, compute_eval_metrics

# split dataset
target = 'propensity_score'
features = [col for col in df.select_dtypes(include='number').columns if col != target]

X = df[features].values
Y = df[[target]].values

X_train, X_val, y_train, y_val = split_dataset(X, Y, number=30)

# train 
model = LinearRegression(learning_rate=0.01, num_iterations=100)
model.fit(X_train, y_train)

def test(model):
    # predict
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # evaluate
    metrics = ['mean_absolute_error', 'root_mean_squared_error', 'r2_score']
    train_metrics = compute_eval_metrics(X_train, y_train, model, metrics)
    val_metrics = compute_eval_metrics(X_val, y_val, model, metrics)

    print("Train Metrics:")
    print(train_metrics)

    print("\nValidation Metrics:")
    print(val_metrics)

test(model)



Train Metrics:
{'mean_absolute_error': np.float64(0.2558542804767965), 'root_mean_squared_error': np.float64(0.2975249502695802), 'r2_score': np.float64(-0.04105128076024411)}

Validation Metrics:
{'mean_absolute_error': np.float64(0.2626073968755234), 'root_mean_squared_error': np.float64(0.3065801758117401), 'r2_score': np.float64(-0.09839256801770313)}


In [31]:
from pages.B_Train_Model import LassoRegression

lasso_model = LassoRegression(learning_rate=best_lr, num_iterations=100, l1_penalty=best_l1)
lasso_model.fit(X_train, y_train)

test(lasso_model)



Train Metrics:
{'mean_absolute_error': np.float64(0.25205056055468983), 'root_mean_squared_error': np.float64(0.29011264614816434), 'r2_score': np.float64(0.01017444778167731)}

Validation Metrics:
{'mean_absolute_error': np.float64(0.2576217091830716), 'root_mean_squared_error': np.float64(0.29695211877652916), 'r2_score': np.float64(-0.030486495018826387)}


In [32]:
from pages.B_Train_Model import RidgeRegression

ridge_model = RidgeRegression(learning_rate=best_lr, num_iterations=100, l2_penalty=best_l2)
ridge_model.fit(X_train, y_train)

test(ridge_model)



Train Metrics:
{'mean_absolute_error': np.float64(0.2520297846334558), 'root_mean_squared_error': np.float64(0.2901099086712768), 'r2_score': np.float64(0.010193127503668076)}

Validation Metrics:
{'mean_absolute_error': np.float64(0.25779069486346773), 'root_mean_squared_error': np.float64(0.29714378613951625), 'r2_score': np.float64(-0.03181717668830775)}


In [27]:
!pip install -q matplotlib

In [33]:
# plot loss curves
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(model.cost_history, label='Linear Regression', linewidth=2)
plt.plot(lasso_model.cost_history, label='Lasso Regression', linestyle='--')
plt.plot(ridge_model.cost_history, label='Ridge Regression', linestyle=':')
plt.title('Loss Curves Over Iterations')
plt.xlabel('Iteration')
plt.ylabel('Loss (RMSE)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# save plot as image
plt.savefig('loss_curves.png', dpi=300)

  plt.show()


In [35]:
model.cost_history

[np.float64(0.5765203089802359),
 np.float64(0.567924023095117),
 np.float64(0.5595442730662304),
 np.float64(0.551376899622248),
 np.float64(0.5434178063749351),
 np.float64(0.5356629579105084),
 np.float64(0.5281083779525789),
 np.float64(0.5207501475986528),
 np.float64(0.5135844036320781),
 np.float64(0.5066073369112101),
 np.float64(0.4998151908374281),
 np.float64(0.4932042599034709),
 np.float64(0.48677088832336446),
 np.float64(0.48051146874499623),
 np.float64(0.474422441046147),
 np.float64(0.46850029121452563),
 np.float64(0.46274155031206504),
 np.float64(0.45714279352342874),
 np.float64(0.45170063928835896),
 np.float64(0.44641174851715787),
 np.float64(0.44127282388825),
 np.float64(0.43628060922642037),
 np.float64(0.4314318889599688),
 np.float64(0.4267234876546656),
 np.float64(0.42215226962204566),
 np.float64(0.41771513859923737),
 np.float64(0.41340903749719715),
 np.float64(0.4092309482139069),
 np.float64(0.40517789150880523),
 np.float64(0.4012469269344566),
 np

Hyperparameter Search (Learning Rate, L1 Penalty, L2 Penalty)

In [22]:
# grid search for best learning rate - on normal Linear Regression
from pages.B_Train_Model import LinearRegression

learning_rates = [0.0001, 0.001, 0.01, 0.1, 0.5]
best_rmse = float('inf')
best_lr = None
best_model = None

for lr in learning_rates:
    model = LinearRegression(learning_rate=lr, num_iterations=100)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"Learning Rate: {lr}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_lr = lr
        best_model = model

print(f"\nBest Learning Rate: {best_lr}, RMSE: {best_rmse:.4f}")



Learning Rate: 0.0001, RMSE: 0.5603




Learning Rate: 0.001, RMSE: 0.4926




Learning Rate: 0.01, RMSE: 0.3000




Learning Rate: 0.1, RMSE: 0.2973




Learning Rate: 0.5, RMSE: 0.2987

Best Learning Rate: 0.1, RMSE: 0.2973


In [19]:
# grid search to choose l1 and l2 penalty
from pages.B_Train_Model import LassoRegression

l1_penalties = [0.01, 0.1, 0.5, 1.0]
best_rmse = float('inf')
best_l1 = None
best_lasso_model = None

for l1 in l1_penalties:
    model = LassoRegression(learning_rate=best_lr, num_iterations=100, l1_penalty=l1)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"L1 Penalty: {l1}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_l1 = l1
        best_lasso_model = model

print(f"\nBest L1 Penalty: {best_l1}, RMSE: {best_rmse:.4f}")



L1 Penalty: 0.01, RMSE: 0.2973




L1 Penalty: 0.1, RMSE: 0.2973




L1 Penalty: 0.5, RMSE: 0.2972




L1 Penalty: 1.0, RMSE: 0.2972

Best L1 Penalty: 1.0, RMSE: 0.2972


In [20]:
from pages.B_Train_Model import RidgeRegression

l2_penalties = [0.01, 0.1, 0.5, 1.0]
best_rmse = float('inf')
best_l2 = None
best_ridge_model = None

for l2 in l2_penalties:
    model = RidgeRegression(learning_rate=best_lr, num_iterations=100, l2_penalty=l2)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    current_rmse = rmse(y_val, y_val_pred)

    print(f"L2 Penalty: {l2}, RMSE: {current_rmse:.4f}")

    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_l2 = l2
        best_ridge_model = model

print(f"\nBest L2 Penalty: {best_l2}, RMSE: {best_rmse:.4f}")



L2 Penalty: 0.01, RMSE: 0.2973




L2 Penalty: 0.1, RMSE: 0.2973




L2 Penalty: 0.5, RMSE: 0.2973




L2 Penalty: 1.0, RMSE: 0.2973

Best L2 Penalty: 0.01, RMSE: 0.2973
