# <span style='color:brown'> Assignment 3 </span>

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Loading and Preprocessing

In [2]:
# Fetch the California housing dataset and assemble the features plus the
# target into a single DataFrame; the target column is named 'MedHouseValue'.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    MedHouseValue=data.target
)

## Check for missing values

In [3]:
# Count the null entries in every column (features and target) and print them.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

Missing values:
 MedInc           0
HouseAge         0
AveRooms         0
AveBedrms        0
Population       0
AveOccup         0
Latitude         0
Longitude        0
MedHouseValue    0
dtype: int64


## Missing values can distort model training and lead to errors or misleading results. In this dataset, there are no missing values, so no imputation was needed.

In [4]:
# Separate the predictors from the regression target.
y = df['MedHouseValue']                  # target column
X = df.drop(columns=['MedHouseValue'])   # every remaining column is a feature

### Split into train and test sets

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### To evaluate model performance, we need to separate a portion of the data for testing, ensuring the model is evaluated on unseen data to mimic real-world scenarios

# Feature scaling

In [8]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics are used in fitting.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

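### Scaling is necessary because some models, especially the Support Vector Regressor (SVR), are sensitive to the scale of features. Linear Regression is also fit on the scaled features so that its coefficients are comparable, while tree-based models such as Gradient Boosting are largely insensitive to feature scale.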

Features like population and median income have very different ranges, which can lead to biased training if not standardized.
StandardScaler transforms features to have zero mean and unit variance, making training more stable and efficient.

# Regression Algorithms

In [9]:
# Candidate regressors keyed by the display name used in the results table.
# Built from ordered (name, estimator) pairs, so iteration order matches the
# order listed here. Estimators with a random component are seeded with 42.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Decision Tree Regressor", DecisionTreeRegressor(random_state=42)),
    ("Random Forest Regressor", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting Regressor", GradientBoostingRegressor(random_state=42)),
    ("Support Vector Regressor", SVR()),
])

# Accumulates one metrics record per evaluated model.
results = list()

In [None]:
# Train every candidate model and record its test-set metrics.
#
# `results` is reset here so this cell is idempotent: re-running it rebuilds
# the list from scratch instead of appending a second copy of every model's
# row to the list created in an earlier cell.
results = []

# Models that are fit on the standardized features; all other models are fit
# on the original, unscaled features.
scaled_models = {"Support Vector Regressor", "Linear Regression"}

for name, model in models.items():
    # Use scaled features for SVR and Linear Regression
    if name in scaled_models:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    # Error and goodness-of-fit metrics on the held-out test split.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        "Model": name,
        "MSE": mse,
        "MAE": mae,
        "R²": r2
    })

## Convert results to DataFrame