In [11]:
# The California Housing dataset contains 8 input features (or independent variables) and 1 target variable (or dependent variable).

# The 8 input features are:

# 1. MedInc: median income in block
# 2. HouseAge: median house age in block
# 3. AveRooms: average number of rooms per dwelling
# 4. AveBedrms: average number of bedrooms per dwelling
# 5. Population: block population
# 6. AveOccup: average house occupancy
# 7. Latitude: block latitude
# 8. Longitude: block longitude
# The target variable is:

# 1. MedHouseVal: median house value in block, expressed in units of $100,000 (e.g. 4.526 means $452,600)

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing


In [13]:
# Load the dataset
# The returned object is a scikit-learn Bunch; its `data` (features) and `target`
# arrays are what the train/test split below consumes.
housing = fetch_california_housing()


In [19]:
# Dump the whole Bunch to see which entries it holds (data, target, frame,
# target_names, feature_names, DESCR).
# NOTE(review): this prints the array reprs plus the entire DESCR text; printing
# housing.data.shape and housing.feature_names would give a more compact summary.
print(housing)

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]]), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset:\n\nCalifornia Housing dataset\n-

In [20]:
# Show the container type of the loaded dataset (displayed as the cell's result).
type(housing)


sklearn.utils._bunch.Bunch

In [14]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split identical from run to run.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Train a linear regression model
# This is the baseline model. The features are used exactly as loaded: X_train
# comes straight from housing.data, and no scaler is applied before fitting.

from sklearn.linear_model import LinearRegression


lin_reg = LinearRegression()
# fit() is the last expression, so the fitted estimator is the cell's result.
lin_reg.fit(X_train, y_train)

In [16]:
# Score the fitted linear model on the held-out test split and report the
# root-mean-squared error (same units as the target).

from sklearn.metrics import mean_squared_error

y_pred = lin_reg.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

print("RMSE:", rmse)

RMSE: 0.7455813830127752


# Gradient boosting

In [27]:

import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, accuracy_score


In [22]:
# Gradient-boosted regression trees: 100 boosting stages of depth-3 trees,
# learning rate 0.1, seeded for reproducibility.
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

In [23]:
# Fitting the model on the training data
# (the same X_train / y_train split the linear regression model was fitted on,
# so the two models can be compared on the same test set)

model.fit(X_train, y_train)

In [24]:
# Making predictions on the test data
# NOTE: this rebinds `y_pred`, which until now held the linear regression
# predictions; from here on it refers to the gradient boosting predictions.

y_pred = model.predict(X_test)

In [25]:
# Calculating the mean squared error (MSE) of the predictions
# NOTE: this rebinds `mse`, which until now held the linear regression MSE.
# The linear model's score was reported as RMSE (np.sqrt of its MSE), so take
# np.sqrt(mse) here as well before comparing the two models.

mse = mean_squared_error(y_test, y_pred)

In [26]:
# Printing the MSE value of the gradient boosting model, rounded to 4 decimals.
# The label ends with a colon so it matches the recorded cell output
# ("Mean Squared Error: 0.2940") and the "RMSE:" label used for the linear model.
# This is an MSE, not an RMSE, so it is not directly comparable to that RMSE figure.

print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 0.2940
