In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [2]:
# Fetch the California housing bundle; its .data, .feature_names and .target
# attributes are used below.
california = fetch_california_housing()

# Build one DataFrame: a column per feature, with the target appended as 'MedPrice'
df = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
).assign(MedPrice=california.target)

In [3]:
# Quick overview of the frame: its dimensions, a five-row preview, and the
# total number of missing cells across all columns.
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:", df.head(5), sep="\n")
print("\nMissing Values:", df.isna().sum().sum())

Dataset Shape: (20640, 9)

First 5 rows:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedPrice  
0    -122.23     4.526  
1    -122.22     3.585  
2    -122.24     3.521  
3    -122.25     3.413  
4    -122.25     3.422  

Missing Values: 0


In [4]:
# Separate the target column ('MedPrice') from the predictor columns
y = df['MedPrice']
X = df.drop(columns='MedPrice')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# 2. Regression Algorithm Implementation

# Candidate regressors keyed by display name. Insertion order is the order in
# which they are later trained and reported. Estimators that accept a seed get
# random_state=42.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ("Support Vector Regressor", SVR()),
])


In [6]:
# Fit every candidate model on the scaled training split, score it on the
# scaled test split, and collect one metrics record per model.
results = []

for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained onto it
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Compare predictions with the true test targets
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One record per model; the keys become the comparison table's columns
    results.append(
        dict(zip(("Model", "MSE", "MAE", "R²"), (name, mse, mae, r2)))
    )

    # Report this model's metrics, one per line, rounded to four decimals
    print(
        f"\n{name} Results:",
        f"MSE: {mse:.4f}",
        f"MAE: {mae:.4f}",
        f"R²: {r2:.4f}",
        sep="\n",
    )


Linear Regression Results:
MSE: 0.5559
MAE: 0.5332
R²: 0.5758

Decision Tree Results:
MSE: 0.4940
MAE: 0.4539
R²: 0.6230

Random Forest Results:
MSE: 0.2552
MAE: 0.3274
R²: 0.8053

Gradient Boosting Results:
MSE: 0.2940
MAE: 0.3717
R²: 0.7756

Support Vector Regressor Results:
MSE: 0.3570
MAE: 0.3986
R²: 0.7276


In [9]:
# Turn the list of per-model metric records into a table and print it
# without the row index.
results_df = pd.DataFrame(data=results)
print("\nModel Comparison:", results_df.to_string(index=False), sep="\n")


Model Comparison:
                   Model      MSE      MAE       R²
       Linear Regression 0.555892 0.533200 0.575788
           Decision Tree 0.493969 0.453904 0.623042
           Random Forest 0.255170 0.327425 0.805275
       Gradient Boosting 0.293999 0.371650 0.775643
Support Vector Regressor 0.357004 0.398599 0.727563


In [10]:
# 3. Model Evaluation and Comparison

# Rank by MSE: the row with the highest MSE is the worst model, the row with
# the lowest MSE is the best.
worst_model = results_df.loc[results_df['MSE'].idxmax(), :]
best_model = results_df.loc[results_df['MSE'].idxmin(), :]

print(
    "\nBest Model:",
    f"Model: {best_model['Model']}, MSE: {best_model['MSE']:.4f}, R²: {best_model['R²']:.4f}",
    sep="\n",
)

print(
    "\nWorst Model:",
    f"Model: {worst_model['Model']}, MSE: {worst_model['MSE']:.4f}, R²: {worst_model['R²']:.4f}",
    sep="\n",
)


Best Model:
Model: Random Forest, MSE: 0.2552, R²: 0.8053

Worst Model:
Model: Linear Regression, MSE: 0.5559, R²: 0.5758
