In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load the California Housing dataset as pandas objects
housing = fetch_california_housing(as_frame=True)

# Feature matrix (X) and regression target (y)
X, y = housing.data, housing.target

# Features and target side by side in a single DataFrame
df = pd.concat([X, y], axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: ordinary least-squares linear regression on the raw features
model = LinearRegression()

# Fit on the TRAINING split only. Fitting on the test split would leak the
# evaluation data into the model and make the metrics below meaningless.
model.fit(X_train, y_train)

# Predict the target for the held-out test set
y_pred = model.predict(X_test)

# Side-by-side view of the actual test values and the model predictions
comparisons = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})

# Model metrics with plain linear regression
print("Metrics with Linear Regression\n")
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2_l = r2_score(y_test, y_pred)
print(f"R2 Score: {r2_l}\nMean Absolute Error: {mae}\nMean Squared Error: {mse}\nRoot Mean Squared Error: {rmse}\n\n\n")

# Linear regression with standardized features

scaler = StandardScaler()
# Train: the pipeline fits the scaler on the training data and then the regressor
model_scaled = make_pipeline(scaler, LinearRegression())
model_scaled.fit(X_train, y_train)
# Predict: the pipeline applies the already-fitted scaler to the test features
y_pred_scaled = model_scaled.predict(X_test)



# Model metrics with scaling
print("Metrics with Scaling(Linear Regression)\n")
mae_scaled = mean_absolute_error(y_test, y_pred_scaled)
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
# RMSE must come from this model's own MSE, not the plain model's `mse`
rmse_scaled = np.sqrt(mse_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)
print(f"R2 Score: {r2_scaled}\nMean Absolute Error: {mae_scaled}\nMean Squared Error: {mse_scaled}\nRoot Mean Squared Error: {rmse_scaled}\n\n\n")


# Decision tree regressor, depth-limited to 5 levels and seeded for reproducibility
tree_model = DecisionTreeRegressor(max_depth=5, random_state=42)
tree_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_tree = tree_model.predict(X_test)

# Model metrics with decision trees
print("Metrics with Decision tree \n")
mae_tree = mean_absolute_error(y_test, y_pred_tree)
mse_tree = mean_squared_error(y_test, y_pred_tree)
# RMSE must come from this model's own MSE, not the linear model's `mse`
rmse_tree = np.sqrt(mse_tree)
r2_tree = r2_score(y_test, y_pred_tree)
print(f"R2 Score: {r2_tree}\nMean Absolute Error: {mae_tree}\nMean Squared Error: {mse_tree}\nRoot Mean Squared Error: {rmse_tree}\n\n\n")

# Random forest regressor with 500 trees.
# random_state is set so results are reproducible across runs, consistent with
# the other seeded models in this script.
forest_model = RandomForestRegressor(
    n_estimators=500,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='log2',
    random_state=42,
)
forest_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_forest = forest_model.predict(X_test)

# Model metrics with Random Forest
print("Metrics with Random Forest \n")
mae_forest = mean_absolute_error(y_test, y_pred_forest)
mse_forest = mean_squared_error(y_test, y_pred_forest)
# RMSE must come from this model's own MSE, not the linear model's `mse`
rmse_forest = np.sqrt(mse_forest)
r2_forest = r2_score(y_test, y_pred_forest)
print(f"R2 Score: {r2_forest}\nMean Absolute Error: {mae_forest}\nMean Squared Error: {mse_forest}\nRoot Mean Squared Error: {rmse_forest}\n\n\n")


# Gradient boosting regressor with fixed hyperparameters
gb_model = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=300,
    subsample=0.8,
    random_state=42,
)
gb_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_gb = gb_model.predict(X_test)


# Hyperparameter tuning for GradientBoostingRegressor: exhaustive grid search
# scored by R2 with 3-fold cross-validation on the training split
gb = GradientBoostingRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
}

grid_search = GridSearchCV(
    gb,
    param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)
print("Best Parameters: ", grid_search.best_params_)
print("Best R2 Score (CV)", grid_search.best_score_)

# Model metrics with Gradient Boosting.
# NOTE(review): these metrics evaluate `gb_model` (the fixed hyperparameters
# above), not `grid_search.best_estimator_`. The fixed values all lie inside the
# search grid and presumably came from an earlier tuning run - confirm.
print("Metrics with Gradient Boosting\n")
mae_gb = mean_absolute_error(y_test, y_pred_gb)
mse_gb = mean_squared_error(y_test, y_pred_gb)
# RMSE must come from this model's own MSE, not the linear model's `mse`
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print(f"R2 Score: {r2_gb}\nMean Absolute Error: {mae_gb}\nMean Squared Error: {mse_gb}\nRoot Mean Squared Error: {rmse_gb}\n\n\n\n\n\n\n")


# Using XGBoost
# Hyperparameter tuning
# Base estimator whose hyperparameters are searched
xgb = XGBRegressor(objective="reg:squarederror", random_state=42)
# Candidate values sampled by the randomized search
param_dist = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [3, 4, 5, 6, 8],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
}
# RandomizedSearchCV setup: 20 sampled configurations, 3-fold CV, scored by R2.
# The keyword names must be exactly `estimator`, `param_distributions` and
# `random_state`; misspelled keywords raise TypeError at construction.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="r2",
    random_state=42,
    n_jobs=-1,
    verbose=2
)
random_search.fit(X_train, y_train)

# Best hyperparameters found by the search
print("Best Parameters: \n\n\n\n\n", random_search.best_params_)


# Predict on the test set with the best estimator found by the search
best_xgb_model = random_search.best_estimator_
y_pred_xgb_best = best_xgb_model.predict(X_test)

# Extreme gradient boosting (XGBoost) regressor with fixed hyperparameters

xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
# Train on the training split
xgb_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_xgb = xgb_model.predict(X_test)

# Model metrics with XGBoost (the fixed-hyperparameter model above)
print("Metrics with Extreme Gradient Boosting\n")
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
# RMSE must come from this model's own MSE, not the linear model's `mse`
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"R2 Score: {r2_xgb}\nMean Absolute Error: {mae_xgb}\nMean Squared Error: {mse_xgb}\nRoot Mean Squared Error: {rmse_xgb}\n\n\n")




Metrics with Linear Regression

R2 Score: 0.596054650433006
Mean Absolute Error: 0.528942661428345
Mean Squared Error: 0.5293336127912477
Root Mean Squared Error: 0.727553168360394



Metrics with Scaling(Linear Regression)

R2 Score: 0.575787706032451
Mean Absolute Error: 0.5332001304956562
Mean Squared Error: 0.5558915986952442
Root Mean Squared Error: 0.727553168360394



Metrics with Decision tree 

R2 Score: 0.5997321244428706
Mean Absolute Error: 0.5222592972077786
Mean Squared Error: 0.5245146178314735
Root Mean Squared Error: 0.727553168360394



