In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

from data_processor import DataProcessor  # Assuming this is available

# Read the training CSV and hand it to the project's DataProcessor.
raw_frame = pd.read_csv("../data/train.csv")
dp = DataProcessor(raw_frame)

# numerical_data() presumably returns only the numeric columns -- confirm
# against data_processor.
df = dp.numerical_data()

# Separate the feature matrix from the target column.
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

# Define RMSE and Normalized RMSE scorers
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def normalized_rmse(y_true, y_pred):
    """Return the RMSE divided by the range (max - min) of ``y_true``.

    Args:
        y_true: Observed target values. Any array-like is accepted (list,
            tuple, NumPy array, pandas Series).
        y_pred: Predicted values, passed through to ``rmse``.

    Returns:
        ``rmse(y_true, y_pred) / (max(y_true) - min(y_true))``.

    Note:
        The range is taken from the ``y_true`` passed in, so when used as a
        cross-validation scorer each fold is normalized by its own range.
        A constant ``y_true`` makes the denominator zero; that case is not
        guarded here.
    """
    error = rmse(y_true, y_pred)
    # Plain lists/tuples have no .max()/.min(); coerce so they work as well.
    observed = np.asarray(y_true)
    return error / (observed.max() - observed.min())

# Wrap both error metrics as scikit-learn scorers. They are losses, hence
# greater_is_better=False; code consuming the scores negates them again.
rmse_scorer = make_scorer(
    rmse,
    greater_is_better=False,
)
normalized_rmse_scorer = make_scorer(
    normalized_rmse,
    greater_is_better=False,
)

# ---- Linear Regression ----
print("---- Linear Regression ----")

# Plain linear regression baseline
linear_model = LinearRegression()

# 10-fold cross-validation. Both metrics are evaluated in a single
# cross_validate pass, so every fold is fitted once rather than once per
# metric; folds and scores are the same as two cross_val_score calls.
linear_cv_results = cross_validate(
    linear_model,
    X,
    y,
    cv=10,
    scoring={'rmse': rmse_scorer, 'normalized_rmse': normalized_rmse_scorer},
)
linear_rmse_scores = linear_cv_results['test_rmse']
linear_normalized_rmse_scores = linear_cv_results['test_normalized_rmse']

# The scorers were created with greater_is_better=False, so the per-fold
# scores are negated errors; flip the sign to report positive values.
mean_linear_rmse = -linear_rmse_scores.mean()
mean_linear_normalized_rmse = -linear_normalized_rmse_scores.mean()

print(f"10-fold Cross-Validated RMSE (Linear): {mean_linear_rmse}")
print(f"10-fold Cross-Validated Normalized RMSE (Linear): {mean_linear_normalized_rmse}")

# Train on full data to evaluate feature importance.
# (Cross-validation works on clones, so linear_model is still unfitted here.)
linear_model.fit(X, y)

# NOTE(review): each coefficient is expressed in its feature's own units, so
# magnitudes are only comparable across features if X is on a common scale --
# confirm whether DataProcessor standardizes the columns. The sort is by
# signed value, so strongly negative coefficients end up at the bottom.
linear_feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': linear_model.coef_
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Linear Regression - Sorted by Coefficient):")
print(linear_feature_importance)


# ---- Polynomial Regression with Ridge ----
print("\n---- Polynomial Regression with Ridge ----")

degree = 2  # You can adjust this degree as needed

# Pipeline: polynomial feature expansion followed by Ridge regression.
# NOTE(review): there is no scaling step, and the Ridge penalty depends on
# feature scale -- confirm whether X is already standardized upstream.
poly_model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=1.0))

# 10-fold cross-validation. Both metrics are evaluated in a single
# cross_validate pass, so the pipeline is fitted once per fold rather than
# once per metric; folds and scores are the same as two cross_val_score calls.
poly_cv_results = cross_validate(
    poly_model,
    X,
    y,
    cv=10,
    scoring={'rmse': rmse_scorer, 'normalized_rmse': normalized_rmse_scorer},
)
poly_rmse_scores = poly_cv_results['test_rmse']
poly_normalized_rmse_scores = poly_cv_results['test_normalized_rmse']

# The scorers were created with greater_is_better=False, so the per-fold
# scores are negated errors; flip the sign to report positive values.
mean_poly_rmse = -poly_rmse_scores.mean()
mean_poly_normalized_rmse = -poly_normalized_rmse_scores.mean()

print(f"10-fold Cross-Validated RMSE (Polynomial with Ridge): {mean_poly_rmse}")
print(f"10-fold Cross-Validated Normalized RMSE (Polynomial with Ridge): {mean_poly_normalized_rmse}")


