In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot
import statistics as st
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.datasets import fetch_california_housing 

In [3]:
data = fetch_california_housing()

In [4]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [5]:
# create data frame
housing = pd.DataFrame(data.data,columns=data.feature_names)

In [7]:
housing["price"] = data.target

In [10]:
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [12]:
housing.tail()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24,0.894


In [14]:
housing.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [15]:
housing.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
price         0
dtype: int64

In [16]:
housing.shape

(20640, 9)

In [25]:
# divide dataset in to dependent and independent variables
x = housing.iloc[:,:-1]
y = housing.iloc[:,-1]

In [26]:
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [27]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: price, Length: 20640, dtype: float64

In [31]:
# divide data set in to train and test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(x,y,test_size=0.34,random_state=42)

In [32]:
X_train.shape

(13622, 8)

In [35]:
y_train.shape

(13622,)

In [36]:
X_test.shape

(7018, 8)

In [38]:
y_test.shape

(7018,)

In [39]:
# featuren scalling -- standardlazation
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [41]:
X_train = scaler.fit_transform(X_train)

In [42]:
X_train

array([[-0.3762727 , -0.04289048, -0.4547492 , ..., -0.08713642,
        -0.59648233, -0.04118759],
       [-1.07564067,  0.83031545, -0.13093175, ...,  0.00869875,
         1.78024685, -1.17350812],
       [ 2.07001714,  1.86228611,  0.45764952, ..., -0.04632021,
        -0.75118068,  0.58233693],
       ...,
       [-0.49617186,  0.59216838, -0.58992549, ...,  0.01573485,
        -0.76055634,  0.60228971],
       [ 0.96763869, -1.07486114,  0.38979205, ...,  0.00371206,
         0.90362287, -1.18348451],
       [-0.68482512,  1.86228611, -0.82752637, ..., -0.08028688,
         0.99269162, -1.41294153]])

In [44]:
X_test = scaler.transform(X_test)

In [45]:
X_test

array([[-1.15490763, -0.28103756, -0.5063446 , ...,  0.05854052,
         0.1910729 ,  0.28803336],
       [-0.70805854,  0.11587423, -0.16175057, ..., -0.0354617 ,
        -0.24020735,  0.06356453],
       [-0.20932849,  1.86228611, -0.59381508, ..., -0.13902544,
         1.0067551 , -1.42291792],
       ...,
       [-0.79899478, -0.2016552 , -0.22577182, ..., -0.00496635,
         1.67711461, -1.80202083],
       [-0.45175503,  0.03649188,  0.04881787, ..., -0.05127995,
        -1.33715927,  1.29066078],
       [-0.89844643,  0.83031545, -0.57489542, ..., -0.10902122,
         1.36771792, -0.9390629 ]])

##  LinearRegression

In [46]:
# select model
from sklearn.linear_model import LinearRegression

In [47]:
linear = LinearRegression()

In [48]:
linear.fit(X_train,y_train)

In [49]:
linear.coef_

array([ 8.45998800e-01,  1.17911863e-01, -3.01388380e-01,  3.49864216e-01,
       -8.34552304e-04, -4.16545694e-02, -8.95759508e-01, -8.71776442e-01])

In [50]:
linear.intercept_

2.0670440691528498

In [53]:
#predection
y_predict = linear.predict(X_test)

In [54]:
y_predict

array([0.72594074, 1.76643672, 2.70195784, ..., 1.46262049, 1.73698122,
       1.16985571])

In [55]:
#MSE & MAE
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [64]:
mse = mean_squared_error(y_test,y_predict)
print(mse*100)
mae = mean_absolute_error(y_test,y_predict)
print(mae*100)
print(np.sqrt(mae)*100)

53.82330283179093
53.02570867686387
72.81875354389408


In [67]:
## check accuracy R2 and ADjusted R square
score = r2_score(y_test,y_predict)
print(score*100)

59.70446953628195


In [73]:
# adjested r squre
1-(1-score)*(len(y)-1)/(len(y)-x.shape[1]-1)

0.5968884430029194

## Ridge Regression

In [75]:
from sklearn.linear_model import Ridge
ridge = Ridge()

In [76]:
ridge.fit(X_train,y_train)

In [88]:
y_predict = ridge.predict(X_test)

In [89]:
y_predict

array([0.726359  , 1.76622929, 2.70152823, ..., 1.46247378, 1.73700117,
       1.17021117])

In [90]:
mse = mean_squared_error(y_test,y_predict)
print(mse*100)
mae = mean_absolute_error(y_test,y_predict)
print(mae*100)
print(np.sqrt(mae)*100)

53.820912070920826
53.02522010600794
72.81841807263321


## Lasso Regression

In [91]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=20.0)

In [92]:
lasso.fit(X_train,y_train)

In [93]:
y_predict = lasso.predict(X_test)

In [94]:
y_predict

array([2.06704407, 2.06704407, 2.06704407, ..., 2.06704407, 2.06704407,
       2.06704407])

In [95]:
mse = mean_squared_error(y_test,y_predict)
print(mse*100)
mae = mean_absolute_error(y_test,y_predict)
print(mae*100)
print(np.sqrt(mae)*100)

133.5733795638561
91.31090295251018
95.5567386176978


## ElasticNet

In [96]:
from sklearn.linear_model import ElasticNet
elastic = ElasticNet(alpha=20.0)

In [97]:
elastic.fit(X_train,y_train)

In [98]:
y_predict = elastic.predict(X_test)

In [99]:
y_predict

array([2.06704407, 2.06704407, 2.06704407, ..., 2.06704407, 2.06704407,
       2.06704407])

In [100]:
mse = mean_squared_error(y_test,y_predict)
print(mse*100)
mae = mean_absolute_error(y_test,y_predict)
print(mae*100)
print(np.sqrt(mae)*100)

133.5733795638561
91.31090295251018
95.5567386176978
