In [43]:
from sklearn.datasets import  fetch_california_housing
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Load Datasets

In [44]:
# Import clean dataset versions
df_v1 = pd.read_csv("data/processed/california_housing_processed_v1.csv")
df_v2 = pd.read_csv("data/processed/california_housing_processed_v2.csv")

# Split data into features and target
X_v1 = df_v1.drop(columns=["median_house_value"])
y_v1 = df_v1["median_house_value"]

X_v2 = df_v2.drop(columns=["median_house_value"])
y_v2 = df_v2["median_house_value"]


In [45]:
datasets = [
    {
        "name": "v1",
        "X": X_v1,
        "y": y_v1
    },
    {
        "name": "v2",
        "X": X_v2,
        "y": y_v2
    }
]

## Train-Test Split

In [46]:
# Perform train-test split

for dataset in datasets:
    X = dataset["X"]
    y = dataset["y"]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    dataset["X_train"] = X_train
    dataset["X_test"] = X_test
    dataset["y_train"] = y_train
    dataset["y_test"] = y_test

## Scaling

In [47]:
# Apply MinMax and Standard Scaler to all dataset versions

# Create a list to store new scaled datasets
scaled_datasets = []

for dataset in datasets:
    # Get train and test datasets
    X_train = dataset["X_train"]
    X_test = dataset["X_test"]
    y_train = dataset["y_train"]
    y_test = dataset["y_test"]

    # Fit MinMax Scaler
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(X_train)

    # Transform datasets
    X_train_minmax = minmax_scaler.transform(X_train)
    X_test_minmax = minmax_scaler.transform(X_test)

    # Convert to DataFrame
    X_train_minmax = pd.DataFrame(X_train_minmax, columns = X_train.columns)
    X_test_minmax = pd.DataFrame(X_test_minmax, columns = X_test.columns)

    # Standard Scaler
    standard_scaler = StandardScaler()
    standard_scaler.fit(X_train)

    # Transform datasets
    X_train_standard = standard_scaler.transform(X_train)
    X_test_standard = standard_scaler.transform(X_test)

    # Convert to DataFrame
    X_train_standard = pd.DataFrame(X_train_standard, columns = X_train.columns)
    X_test_standard = pd.DataFrame(X_test_standard, columns = X_test.columns)

    # Save scaled datasets
    dataset["X_train_minmax"] = X_train_minmax
    dataset["X_test_minmax"] = X_test_minmax
    dataset["X_train_standard"] = X_train_standard
    dataset["X_test_standard"] = X_test_standard

    # Add new datasets to the scaled_datasets list
    scaled_datasets.append({
        "name": f"{dataset['name']}_minmax",
        "X_train": X_train_minmax,
        "X_test": X_test_minmax,
        "y_train": y_train,
        "y_test": y_test
    })
    
    scaled_datasets.append({
        "name": f"{dataset['name']}_standard",
        "X_train": X_train_standard,
        "X_test": X_test_standard,
        "y_train": y_train,
        "y_test": y_test
    })

# Now add all scaled datasets to the original list
datasets.extend(scaled_datasets)

## Train KNN Model

In [48]:
# List of models to evaluate
models = [
    {
        "name": "KNN",
        "model": KNeighborsRegressor(n_neighbors=10)
    },
    {
        "name": "Linear Regression",
        "model": LinearRegression()
    },
    {
        "name": "Random Forest",
        "model": RandomForestRegressor(n_estimators=100)
    }
]

results = []

for dataset in datasets:
    X_train = dataset["X_train"]
    X_test = dataset["X_test"]
    y_train = dataset["y_train"]
    y_test = dataset["y_test"]

    for model in models:
        model_name = model["name"]
        model_instance = model["model"]
        model_instance.fit(X_train, y_train)
        y_pred = model_instance.predict(X_test)
        r2_score = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        results.append({
            "model": model_name,
            "dataset": dataset["name"],
            "R2 score": r2_score,
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
results_df

TypeError: 'float' object is not callable