In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
dataset = fetch_california_housing()

In [3]:
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [4]:
print(dataset)

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]]), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset:\n\nCalifornia Housing dataset\n-

#Using Median Income,Population,Latitude and Longitude for prediction

In [128]:
x = dataset['data'][..., [0,4,6,7]]
print(x.shape)

(20640, 4)


In [129]:

y = dataset['target']

In [130]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [131]:
print(x_train.shape, x_test.shape)

(16512, 4) (4128, 4)


In [132]:
model = LinearRegression()

In [133]:
model.fit(x_train, y_train)

In [134]:
y_pred = model.predict(x_test)

In [135]:
# y = wx + b

w = model.coef_
b = model.intercept_
print(w, b)

[ 3.66642427e-01 -4.71693487e-05 -4.86782360e-01 -4.96317227e-01] -41.27694727260204


In [136]:
print(y_pred[0], w*x_test[0] + b)

2.2113159846730284 [-39.91424736 -41.31072053 -57.85675445  17.46219651]


In [137]:
r2_s = r2_score(y_test, y_pred)
mse_err = mean_squared_error(y_test, y_pred)

print(r2_s, mse_err)

0.5822504728971845 0.5222728516986671


#Using all features

In [68]:
x = dataset['data']
print(x.shape)

(20640, 8)


In [69]:
y = dataset['target']

In [70]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [71]:
print(x_train.shape, x_test.shape)

(16512, 8) (4128, 8)


In [72]:
model = LinearRegression()

In [73]:
model.fit(x_train, y_train)

In [74]:
y_pred = model.predict(x_test)

In [75]:
# y = wx + b

w = model.coef_
b = model.intercept_
print(w, b)

[ 4.31659325e-01  9.46917601e-03 -1.01735676e-01  6.12079580e-01
 -4.49413554e-06 -8.49422046e-03 -4.24042223e-01 -4.37963264e-01] -37.21285360887961


In [76]:
print(y_pred[0], w*x_test[0] + b)

2.0014339411139943 [-35.85040728 -36.8624941  -37.73271942 -36.5902572  -37.21673205
 -37.23804437 -51.60484667  14.60695976]


In [77]:
r2_s = r2_score(y_test, y_pred)
mse_err = mean_squared_error(y_test, y_pred)

print(r2_s, mse_err)

0.6100632215461942 0.5200328476392415
