In [1]:
import joblib

# Load the cleaned train/test feature matrices and the matching target
# vectors from the .joblib files stored under ../data.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
X_train_clean = joblib.load("../data/X_train_clean.joblib")
X_test_clean  = joblib.load("../data/X_test_clean.joblib")
y_train        = joblib.load("../data/y_train.joblib")
y_test         = joblib.load("../data/y_test.joblib")

# Last expression of the cell: display the (rows, columns) of both feature sets.
X_train_clean.shape, X_test_clean.shape


((16512, 13), (4128, 13))

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


MAE


In [4]:
def mae_manual(actual_test_target, predicted_test_target):
    """Return the mean absolute error (MAE) between targets and predictions.

    MAE is the average of ``|predicted - actual|`` and is expressed in the
    same units as the target. The absolute differences are deliberately NOT
    divided by the actual values: that would yield the mean absolute
    *percentage* error (MAPE), a different metric that is undefined whenever
    an actual value is zero.

    Parameters
    ----------
    actual_test_target : array-like
        Ground-truth target values.
    predicted_test_target : array-like
        Model predictions, in the same order as ``actual_test_target``.

    Returns
    -------
    numpy.float64
        The mean absolute error.
    """
    # Coerce to float arrays so that (a) values are compared by position
    # (two pandas Series would otherwise be aligned on their index labels),
    # (b) unsigned/small integer dtypes cannot wrap around on subtraction,
    # and (c) plain Python sequences are accepted.
    actual = np.asarray(actual_test_target, dtype=float)
    predicted = np.asarray(predicted_test_target, dtype=float)
    return np.mean(np.abs(predicted - actual))



RMSE


In [5]:
def RMSE_manual(actual_test_target, predicted_test_target):
    """Return the root mean squared error (RMSE) between targets and predictions.

    RMSE is ``sqrt(mean((actual - predicted) ** 2))`` and is expressed in the
    same units as the target.

    Parameters
    ----------
    actual_test_target : array-like
        Ground-truth target values.
    predicted_test_target : array-like
        Model predictions, in the same order as ``actual_test_target``.

    Returns
    -------
    numpy.float64
        The root mean squared error.
    """
    # Coerce to float arrays: values are then compared by position (two
    # pandas Series would otherwise be aligned on their index labels),
    # unsigned/small integer dtypes cannot wrap around when subtracted or
    # squared, and plain Python sequences are accepted.
    actual = np.asarray(actual_test_target, dtype=float)
    predicted = np.asarray(predicted_test_target, dtype=float)

    squared_error = (actual - predicted) ** 2
    # Named `mse` so the local does not shadow sklearn's `mean_squared_error`.
    mse = np.mean(squared_error)
    return np.sqrt(mse)

R^2


In [6]:
def r2_score_manual(actual_targets,predicted_targets):
    """Return the coefficient of determination (R^2) of the predictions.

    ``R^2 = 1 - RSS / TSS`` where ``RSS`` is the sum of squared residuals and
    ``TSS`` is the sum of squared deviations of the targets from their mean.
    1.0 is a perfect fit, 0.0 matches always predicting the mean, and the
    score is negative when the model does worse than that.

    Parameters
    ----------
    actual_targets : array-like
        Ground-truth target values.
    predicted_targets : array-like
        Model predictions, in the same order as ``actual_targets``.

    Returns
    -------
    float
        The R^2 score. When the targets are constant (``TSS == 0``) the ratio
        is undefined; 1.0 is returned for perfect predictions and 0.0
        otherwise, instead of ``nan``/``-inf``. Empty input yields ``nan``.
    """
    # Positional comparison on float arrays (no pandas index alignment, no
    # integer wraparound, plain sequences accepted).
    actual = np.asarray(actual_targets, dtype=float)
    predicted = np.asarray(predicted_targets, dtype=float)

    if actual.size == 0:
        # Nothing to score; keep the score undefined rather than reporting a fit.
        return np.nan

    residual_sum_of_squares = np.sum((actual - predicted) ** 2)
    total_sum_of_squares = np.sum((actual - np.mean(actual)) ** 2)

    if total_sum_of_squares == 0:
        # Constant targets: avoid 0/0 and x/0 by returning a finite score.
        return 1.0 if residual_sum_of_squares == 0 else 0.0

    return 1 - (residual_sum_of_squares / total_sum_of_squares)


Cross-Validation

In [10]:
from sklearn.model_selection import KFold
# 5-fold splitter: rows are shuffled before splitting, and the fixed seed keeps
# the fold assignment identical on every run.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [14]:
from sklearn.linear_model import LinearRegression
def _rows_at(data, positions):
    """Return the rows of ``data`` at the given integer positions."""
    if hasattr(data, "iloc"):
        # pandas objects: select by position, not by index label.
        return data.iloc[positions]
    return np.asarray(data)[positions]


def kfold_cross_validation_rmse(model_class, feature_matrix, target_vector, num_folds = 5, random_state = 42):
    """Estimate a model's RMSE with shuffled K-fold cross-validation.

    For each fold a fresh ``model_class()`` is fitted on the training part and
    scored with ``RMSE_manual`` on the held-out part. The per-fold RMSE is
    printed as it is computed.

    Parameters
    ----------
    model_class : callable
        Zero-argument factory (typically an estimator class) returning an
        object with ``fit(X, y)`` and ``predict(X)``.
    feature_matrix : pandas.DataFrame or array-like
        Feature rows, one per sample.
    target_vector : pandas.Series or array-like
        Targets, in the same row order as ``feature_matrix``.
    num_folds : int, default 5
        Number of folds passed to ``KFold`` as ``n_splits``.
    random_state : int, default 42
        Seed for the fold shuffling, so the split is reproducible.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold, in fold order.

    Raises
    ------
    ValueError
        If ``feature_matrix`` and ``target_vector`` have different lengths.
    """
    if len(feature_matrix) != len(target_vector):
        raise ValueError(
            "feature_matrix and target_vector must have the same number of rows: "
            f"{len(feature_matrix)} != {len(target_vector)}"
        )

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    rmse_scores = []

    for fold_number, (train_idx, validation_idx) in enumerate(kf.split(feature_matrix), start=1):
        X_train_fold = _rows_at(feature_matrix, train_idx)
        y_train_fold = _rows_at(target_vector, train_idx)

        X_validation_fold = _rows_at(feature_matrix, validation_idx)
        y_validation = _rows_at(target_vector, validation_idx)

        # A new, unfitted model per fold so nothing carries over between folds.
        model = model_class()
        model.fit(X_train_fold, y_train_fold)

        validation_predictions = model.predict(X_validation_fold)

        fold_rmse = RMSE_manual(y_validation, validation_predictions)
        rmse_scores.append(fold_rmse)

        print(f"Fold {fold_number} RMSE: {fold_rmse:,.2f}")

    return np.array(rmse_scores)



In [15]:
# Cross-validate a default LinearRegression on the training split (5 folds).
lr_rmse_scores = kfold_cross_validation_rmse(
    LinearRegression,
    X_train_clean,
    y_train,
    num_folds=5,
)

# Summarize: the per-fold scores, then their mean and standard deviation.
print("\nLR CV RMSE scores:", lr_rmse_scores)
print("Mean RMSE:", lr_rmse_scores.mean())
print("Std RMSE :", lr_rmse_scores.std())


Fold 1 RMSE: 69,793.50
Fold 2 RMSE: 67,748.61
Fold 3 RMSE: 66,466.61
Fold 4 RMSE: 69,972.41
Fold 5 RMSE: 69,039.68

LR CV RMSE scores: [69793.50229311 67748.61211153 66466.60638207 69972.41192577
 69039.682064  ]
Mean RMSE: 68604.16295529754
Std RMSE : 1324.9472102500572


Linear Regression is a stable, low-variance baseline with RMSE ≈ 68.6k (standard deviation ≈ 1.3k across the 5 folds).