In [6]:
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


In [5]:
# ================================================
# 2. Load datasets (train + test)
# ================================================



# Directory holding the processed arrays/CSVs. The original absolute path is
# kept as the default so existing runs behave exactly as before; setting the
# ACIT4510_DATA_DIR environment variable points the notebook at another
# location without editing this cell. data_path stays a plain string because
# later cells interpolate it into file names.
data_path = os.environ.get(
    "ACIT4510_DATA_DIR",
    r'C:\Users\sacii\OneDrive\Desktop\Oslomet\ACIT4510 Statistical learning\ACIT4510_statistical_learning_project\data\processed',
)

# os.path.join supplies the platform's separator instead of a literal "\\".
X_train_encoded = np.load(os.path.join(data_path, "X_train_encoded.npy"))
X_test_encoded = np.load(os.path.join(data_path, "X_test_encoded.npy"))

y_train = pd.read_csv(os.path.join(data_path, "y_train.csv"))
y_test = pd.read_csv(os.path.join(data_path, "y_test.csv"))

# Fail fast when features and targets are misaligned, rather than letting the
# mismatch surface later inside model fitting or scoring.
assert X_train_encoded.shape[0] == y_train.shape[0], "train features/targets row count mismatch"
assert X_test_encoded.shape[0] == y_test.shape[0], "test features/targets row count mismatch"
assert X_train_encoded.shape[1] == X_test_encoded.shape[1], "train/test feature count mismatch"

print(X_train_encoded.shape)
print(X_test_encoded.shape)
print(y_train.shape)
print(y_test.shape)


(1920, 14)
(600, 14)
(1920, 1)
(600, 1)


In [7]:
# The encoded matrices carry no column labels, so positional names
# (x0, x1, ...) are generated from the number of training columns.
n_features = X_train_encoded.shape[1]
feature_names = [f"x{col_idx}" for col_idx in range(n_features)]

# Wrap both splits in DataFrames that share the same generated column names.
X_train_df, X_test_df = (
    pd.DataFrame(matrix, columns=feature_names)
    for matrix in (X_train_encoded, X_test_encoded)
)


In [8]:
# ============================================
# 4. Compute VIF on training features
#    (diagnostic only, we do not drop features)
# ============================================
def compute_vif(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the variance inflation factor (VIF) for each design column.

    ``add_constant`` is applied to ``df`` first, and a VIF is then computed
    for every column of the resulting frame (so any intercept column it adds
    is reported as its own row).

    Parameters
    ----------
    df : pd.DataFrame
        Feature matrix, one column per feature.

    Returns
    -------
    pd.DataFrame
        Two columns: ``feature`` (column name) and ``VIF`` (its variance
        inflation factor), in the column order of the augmented frame.
    """
    df_const = add_constant(df)
    # Materialise the design matrix once instead of on every loop iteration.
    design = df_const.values
    # A constant column and perfectly collinear columns make the underlying
    # R^2 / VIF arithmetic divide by zero. The resulting 0 / inf values are the
    # diagnostic and are kept as-is; only the NumPy division warnings that
    # accompany them are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        vif_values = [
            variance_inflation_factor(design, col_idx)
            for col_idx in range(design.shape[1])
        ]
    return pd.DataFrame({"feature": list(df_const.columns), "VIF": vif_values})

# Run the diagnostic on the training features and show the leading rows.
vif_results = compute_vif(X_train_df)
print("\nVIF results (first 15 rows):", vif_results.head(15), sep="\n")


VIF results (first 15 rows):
   feature        VIF
0    const   0.000000
1       x0   2.404673
2       x1   1.005894
3       x2   1.009855
4       x3   1.003273
5       x4  21.983721
6       x5        inf
7       x6        inf
8       x7        inf
9       x8        inf
10      x9        inf
11     x10        inf
12     x11        inf
13     x12        inf
14     x13        inf


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


In [9]:
# ============================================
# 5. y as numpy arrays
# ============================================
# Convert both target frames to NumPy arrays in one pass. The frames have a
# single column (see the shapes printed at load time), so each array is 2-D.
y_train_arr, y_test_arr = (target.to_numpy() for target in (y_train, y_test))

In [10]:
# ============================================
# 6. Define models
# ============================================
# Candidate linear estimators, keyed by the label used in the results table.
# Insertion order is the order in which they are fitted and reported.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0, random_state=None),
    Lasso=Lasso(alpha=0.01, max_iter=10000, random_state=None),
    ElasticNet=ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10000, random_state=None),
)


In [11]:
# ============================================
# 7. Fit and evaluate models
# ============================================
def _rmse(y_true, y_pred):
    """Root mean squared error: square root of scikit-learn's MSE."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# One record per model; turned into a table by the comparison cell.
results = []

for name, model in models.items():
    model.fit(X_train_df, y_train_arr)

    # Predict on both splits so train and test scores can be compared.
    y_train_pred = model.predict(X_train_df)
    y_test_pred = model.predict(X_test_df)

    results.append({
        "model": name,
        "mae_train": mean_absolute_error(y_train_arr, y_train_pred),
        "rmse_train": _rmse(y_train_arr, y_train_pred),
        "r2_train": r2_score(y_train_arr, y_train_pred),
        "mae_test": mean_absolute_error(y_test_arr, y_test_pred),
        "rmse_test": _rmse(y_test_arr, y_test_pred),
        "r2_test": r2_score(y_test_arr, y_test_pred),
    })


In [14]:
# ============================================
# 8. Make comparison table
# ============================================
# Build the comparison table in one chain, indexed by model label.
results_df = pd.DataFrame(results).set_index("model")

print("\nModel comparison (train vs test):", results_df, sep="\n")



Model comparison (train vs test):
                  mae_train  rmse_train  r2_train  mae_test  rmse_test  \
model                                                                    
LinearRegression   2.192279    2.541052  0.015424  2.227267   2.574803   
Ridge              2.192331    2.541054  0.015422  2.227348   2.574835   
Lasso              2.195753    2.542294  0.014461  2.224045   2.571237   
ElasticNet         2.194344    2.541738  0.014892  2.226127   2.573392   

                   r2_test  
model                       
LinearRegression -0.017045  
Ridge            -0.017071  
Lasso            -0.014230  
ElasticNet       -0.015931  


In [13]:

# ============================================
# 9. Save results to processed folder
# ============================================
# File names are declared once so the files written and the names reported
# below cannot drift apart.
comparison_csv = "linear_models_comparison.csv"
vif_csv = "vif_results_encoded_features.csv"

# results_df is indexed by model label, so its index is written; vif_results
# has only a positional index, which is left out of the file.
# os.path.join supplies the platform's separator instead of a literal "\\".
results_df.to_csv(os.path.join(data_path, comparison_csv), index=True)
vif_results.to_csv(os.path.join(data_path, vif_csv), index=False)

print("\nSaved:")
print(f"- {comparison_csv}")
print(f"- {vif_csv}")


Saved:
- linear_models_comparison.csv
- vif_results_encoded_features.csv
