In [1]:
# Regression Assignment using California Housing Dataset

# 1. Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [6]:
# 2. Load the dataset
# fetch_california_housing() returns a Bunch; its entries are read by key here.
housing = fetch_california_housing()
# Feature matrix as a DataFrame, with the dataset's own feature names as column labels
X = pd.DataFrame(data=housing["data"], columns=housing["feature_names"])
# Target as a named Series so printed output and later tables carry a readable label
y = pd.Series(data=housing["target"], name='MedHouseValue')

In [12]:
# Preview the first five rows of the feature matrix
print(X.head(n=5))

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  


In [13]:
# Preview the first five target values
print(y.head(n=5))

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseValue, dtype: float64


In [14]:
# 3. Check for missing values
# Per-column count of missing entries in the feature matrix
print("Missing values in features:\n", X.isna().sum())
# Single count of missing entries in the target Series
print("Missing values in target:\n", y.isna().sum())

Missing values in features:
 MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64
Missing values in target:
 0


In [16]:
from sklearn.preprocessing import StandardScaler
# 4. Feature Scaling - Standardization
# Learn each column's mean and standard deviation, then rescale every column with them.
# NOTE: the scaler is fitted on every row of X, i.e. before any train/test split is made.
scaler = StandardScaler()
scaler.fit(X)
# transform() returns a plain array, so the column labels are re-attached explicitly
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [17]:
# Display first 5 rows of the scaled data
print("\nFirst 5 rows of scaled features:\n", X_scaled.head())


First 5 rows of scaled features:
      MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude  
0  -1.327835  
1  -1.322844  
2  -1.332827  
3  -1.337818  
4  -1.337818  


📝 Explanation of Preprocessing Steps:
    1. Loading the Dataset: Used fetch_california_housing() from sklearn.datasets to load the data, which includes various housing-related features and the median house value (target variable).
    2. Conversion to pandas DataFrame: Converted the dataset to a DataFrame for easier analysis, visualization, and preprocessing.
    3. Handling Missing Values: Used isnull().sum() to check for missing values. (Note: This dataset does not contain missing values, but it’s good practice to always check.)
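    4. Feature Scaling (Standardization): Applied StandardScaler to scale the features so that they all have a mean of 0 and a standard deviation of 1. This step is important because scale-sensitive models (such as SVR, and linear models that are regularized or trained by gradient descent) perform better when features are on a similar scale; ordinary least-squares regression and tree-based models are largely unaffected by it.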

In [18]:
# Required libraries
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Train-test split on the raw (unscaled) features.
# Splitting BEFORE scaling keeps the test rows out of the mean/std the scaler
# learns; scaling the whole dataset first leaks test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# both splits. The original row index is kept so X_* stays aligned with y_*.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Store models and their names
# random_state is pinned wherever the estimator accepts one, so reruns are repeatable
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor (SVR)": SVR()
}

# Evaluate each model: fit on the training split, score on the held-out test split
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"\n{name}")
    print("-" * len(name))  # underline sized to the model name
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R-squared: {r2:.4f}")



Linear Regression
-----------------
Mean Squared Error: 0.5559
R-squared: 0.5758

Decision Tree Regressor
-----------------------
Mean Squared Error: 0.4943
R-squared: 0.6228

Random Forest Regressor
-----------------------
Mean Squared Error: 0.2555
R-squared: 0.8050

Gradient Boosting Regressor
---------------------------
Mean Squared Error: 0.2940
R-squared: 0.7756

Support Vector Regressor (SVR)
------------------------------
Mean Squared Error: 0.3552
R-squared: 0.7289


📝 Explanations and Suitability of Each Algorithm
1. Linear Regression
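How it works: It models the relationship between the target and the features as a linear function — a straight line for a single feature, a hyperplane for several — choosing the coefficients that minimize the squared prediction error.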

Suitability: Good baseline model; useful for identifying linear relationships. Assumes the features are linearly related to the target.

2. Decision Tree Regressor
How it works: Splits the data based on feature thresholds to minimize prediction error in each branch.

Suitability: Handles non-linear relationships and doesn’t require feature scaling. May overfit, but interpretable.

3. Random Forest Regressor
How it works: An ensemble of decision trees where each tree is trained on a random subset of the data and features.

Suitability: More accurate and robust than individual trees. Handles non-linear data and reduces overfitting.

4. Gradient Boosting Regressor
How it works: Builds models sequentially; each new tree is fitted to the remaining errors of the ensemble built so far (the negative gradient of the loss), so every stage corrects the mistakes of the previous ones.

Suitability: Often delivers state-of-the-art performance; great for structured/tabular data like this.

5. Support Vector Regressor (SVR)
How it works: Tries to fit the best line (or hyperplane) within a margin, using kernel tricks for non-linear data.

Suitability: Effective in high-dimensional spaces and can capture complex patterns, but slower on large datasets.



In [20]:
from sklearn.metrics import mean_absolute_error

# Column-oriented store for the evaluation results; key order sets the table's column order
results = {"Model": [], "MSE": [], "MAE": [], "R² Score": []}

# Re-run models and collect metrics
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Append one value per column, pairing the dict's keys with this model's row
    for column, value in zip(results, (name, mse, mae, r2)):
        results[column].append(value)

# Create a DataFrame for easy comparison, best R² first
results_df = pd.DataFrame(results).sort_values(by="R² Score", ascending=False)

# Display results
print("\nModel Evaluation Summary:\n")
print(results_df)



Model Evaluation Summary:

                            Model       MSE       MAE  R² Score
2         Random Forest Regressor  0.255498  0.327613  0.805024
3     Gradient Boosting Regressor  0.293999  0.371650  0.775643
4  Support Vector Regressor (SVR)  0.355198  0.397763  0.728941
1         Decision Tree Regressor  0.494272  0.453784  0.622811
0               Linear Regression  0.555892  0.533200  0.575788


Conclusion
The conclusions below are drawn from the Model Evaluation Summary table above (test-set MSE, MAE and R² for each model):
-- Best-Performing Algorithm:
Random Forest Regressor (R² ≈ 0.805, MSE ≈ 0.255), followed by Gradient Boosting Regressor (R² ≈ 0.776, MSE ≈ 0.294).

Justification: These models usually achieve higher R² and lower error metrics due to their ensemble learning techniques that reduce bias and variance.

-- Worst-Performing Algorithm:
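Linear Regression (R² ≈ 0.576, MSE ≈ 0.556). SVR, often a weak performer, ranked third here (R² ≈ 0.73), ahead of the single Decision Tree Regressor (R² ≈ 0.62).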

Reasoning:

Linear Regression might underperform due to inability to capture non-linear relationships.

SVR can struggle with large datasets and is sensitive to its hyperparameters, which can lead to poorer generalization; with default settings it still reached a mid-table result on this dataset.

