In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("train.csv")

# Display first few rows to understand data structure (optional)
print("First few rows of the dataset:")
print(df.head())

# Target variable: SalePrice
y = df['SalePrice']

# Selecting features - Update the list based on your dataset's columns
X = df[['GrLivArea', 'TotalBsmtSF', 'YearBuilt', 'LotArea']]

# Split into training and testing sets (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _tune(pipeline, param_grid, X_fit, y_fit):
    """Run a 5-fold grid search over `param_grid` for `pipeline`.

    Candidates are ranked with scikit-learn's 'neg_mean_squared_error'
    scorer (negated MSE, so a larger score means a smaller error).

    Parameters:
        pipeline: unfitted estimator/pipeline to tune.
        param_grid: mapping of parameter name -> list of candidate values.
        X_fit, y_fit: training features and targets used for cross-validation.

    Returns:
        The fitted GridSearchCV object (refit on all of `X_fit`/`y_fit`).
    """
    search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    search.fit(X_fit, y_fit)
    return search


# --- Regular Linear Regression ---
# LinearRegression must be imported from sklearn.linear_model alongside Ridge and Lasso;
# without that import this cell fails with NameError on a fresh kernel.
linear_model = make_pipeline(StandardScaler(), LinearRegression())
linear_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_linear = linear_model.predict(X_test)
rmse_linear = _rmse(y_test, y_pred_linear)
print(f"Regular Linear Regression RMSE: {rmse_linear}")

# --- Ridge Regression with Hyperparameter Tuning ---
ridge_model = make_pipeline(StandardScaler(), Ridge())
ridge_param_grid = {'ridge__alpha': [0.01, 0.1, 1, 10, 100]}  # Adjust these values as needed

ridge_grid_search = _tune(ridge_model, ridge_param_grid, X_train, y_train)
best_ridge_model = ridge_grid_search.best_estimator_

# Predict and evaluate Ridge
y_pred_ridge = best_ridge_model.predict(X_test)
rmse_ridge = _rmse(y_test, y_pred_ridge)
print(f"Best Ridge alpha: {ridge_grid_search.best_params_['ridge__alpha']}")
print(f"Ridge Regression RMSE after tuning: {rmse_ridge}")

# --- Lasso Regression with Hyperparameter Tuning ---
# max_iter is raised above the library default to give the solver more iterations.
lasso_model = make_pipeline(StandardScaler(), Lasso(max_iter=10000))
lasso_param_grid = {'lasso__alpha': [0.01, 0.1, 1, 10, 100]}  # Adjust these values as needed

lasso_grid_search = _tune(lasso_model, lasso_param_grid, X_train, y_train)
best_lasso_model = lasso_grid_search.best_estimator_

# Predict and evaluate Lasso
y_pred_lasso = best_lasso_model.predict(X_test)
rmse_lasso = _rmse(y_test, y_pred_lasso)
print(f"Best Lasso alpha: {lasso_grid_search.best_params_['lasso__alpha']}")
print(f"Lasso Regression RMSE after tuning: {rmse_lasso}")



First few rows of the dataset:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  Sa

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Load the dataset
df = pd.read_csv("train.csv")

# Display first few rows to understand data structure (optional)
print("First few rows of the dataset:")
print(df.head())

# Target variable: SalePrice
y = df['SalePrice']

# Selecting features - Update the list based on your dataset's columns
X = df[['GrLivArea', 'TotalBsmtSF', 'YearBuilt', 'LotArea']]

# Split into training and testing sets (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Polynomial Regression with Ridge Regularization and Hyperparameter Tuning ---
# Define the pipeline with PolynomialFeatures, StandardScaler, and Ridge
poly_ridge_model = make_pipeline(PolynomialFeatures(), StandardScaler(), Ridge())

# Define parameter grid: try different degrees for PolynomialFeatures and different alphas for Ridge
param_grid = {
    'polynomialfeatures__degree': [1, 2, 3],  # Test degrees 1, 2, and 3
    'ridge__alpha': [0.01, 0.1, 1, 10, 100]  # Test different alpha values for Ridge
}

# Define the scoring metric as (negated) RMSE.
# GridSearchCV always picks the candidate with the HIGHEST score. A scorer built with
# make_scorer(mean_squared_error, squared=False) keeps the default greater_is_better=True,
# so the search would select the parameters with the LARGEST RMSE, i.e. the worst model.
# The built-in 'neg_root_mean_squared_error' scorer negates RMSE so that maximising the
# score minimises the error; it also avoids the `squared=` keyword, which newer
# scikit-learn releases no longer accept.
rmse_scorer = 'neg_root_mean_squared_error'

# Set up GridSearchCV to search for best hyperparameters
grid_search = GridSearchCV(poly_ridge_model, param_grid, cv=5, scoring=rmse_scorer, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and best parameters from GridSearchCV
best_poly_ridge_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict and evaluate the best model on the held-out test set
y_pred_best = best_poly_ridge_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))

# Display the results
print(f"Best parameters: {best_params}")
print(f"Polynomial Regression with Ridge RMSE after tuning: {rmse_best}")


First few rows of the dataset:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  Sa

  _data = np.array(data, dtype=dtype, copy=copy,
