## 1. Import Libraries

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

## 2. Load Dataset

In [None]:
# Load the California housing dataset; it replaces the Boston dataset, which
# is no longer available.
dataset = datasets.fetch_california_housing()
# Print the shapes of the input (feature) array and the target array.
print("Số chiều dữ liệu input: ", dataset.data.shape)
print("Số chiều dữ liệu target: ", dataset.target.shape)
print()
# Print the first 5 samples of the inputs and of the targets.
print("5 mẫu dữ liệu đầu tiên:")
print("input: ", dataset.data[:5])
print("target: ",dataset.target[:5])

Số chiều dữ liệu input:  (20640, 8)
Số chiều dữ liệu target:  (20640,)

5 mẫu dữ liệu đầu tiên:
input:  [[ 8.32520000e+00  4.10000000e+01  6.98412698e+00  1.02380952e+00
   3.22000000e+02  2.55555556e+00  3.78800000e+01 -1.22230000e+02]
 [ 8.30140000e+00  2.10000000e+01  6.23813708e+00  9.71880492e-01
   2.40100000e+03  2.10984183e+00  3.78600000e+01 -1.22220000e+02]
 [ 7.25740000e+00  5.20000000e+01  8.28813559e+00  1.07344633e+00
   4.96000000e+02  2.80225989e+00  3.78500000e+01 -1.22240000e+02]
 [ 5.64310000e+00  5.20000000e+01  5.81735160e+00  1.07305936e+00
   5.58000000e+02  2.54794521e+00  3.78500000e+01 -1.22250000e+02]
 [ 3.84620000e+00  5.20000000e+01  6.28185328e+00  1.08108108e+00
   5.65000000e+02  2.18146718e+00  3.78500000e+01 -1.22250000e+02]]
target:  [4.526 3.585 3.521 3.413 3.422]


In [10]:
# Display the whole loaded object: data, target, feature/target names and the
# DESCR text are all part of its repr.
dataset

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [11]:
# Shape of the input array: (number of samples, number of features).
dataset.data.shape

(20640, 8)

## 3. Data Splitting

In [13]:
from sklearn.model_selection import train_test_split

# Feature matrix and target vector used by every model below.
X = dataset.data
y = dataset.target

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across runs so both models are compared on the same data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## 4. Build Model

In [22]:
# Library model: scikit-learn's LinearRegression, fitted later in section 5.1.
regr = linear_model.LinearRegression()

In [34]:
# Hand-written ordinary least squares, used to cross-check the library model.
def linear_regression(X_train, y_train):
    """Fit a linear model ``y = X @ coef + intercept`` by least squares.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training inputs, one row per sample.
    y_train : array-like of shape (n_samples,) or (n_samples, n_targets)
        Training targets.

    Returns
    -------
    coef : numpy.ndarray
        Weights of the features (all entries of the solution except the first).
    intercept : float or numpy.ndarray
        Bias term (the first entry of the solution).
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    # Prepend a column of ones so the first weight acts as the bias (intercept).
    ones = np.ones((X_train.shape[0], 1))
    Xbar = np.concatenate((ones, X_train), axis=1)

    # Solve min ||Xbar @ w - y|| directly on the design matrix. This yields the
    # same minimum-norm solution as pinv(Xbar.T @ Xbar) @ Xbar.T @ y, but does
    # not form Xbar.T @ Xbar, whose condition number is the square of Xbar's
    # and makes the intercept unreliable for un-centred features.
    w, *_ = np.linalg.lstsq(Xbar, y_train, rcond=None)

    coef = w[1:]
    intercept = w[0]
    return coef, intercept

In [35]:
# Inference helper for the hand-written linear model.
def predict(X_test, coef, intercept):
    """Return the linear-model predictions for ``X_test``.

    Computes the dot product of ``X_test`` with ``coef`` and adds
    ``intercept`` to the result.
    """
    weighted_sum = np.dot(X_test, coef)
    return weighted_sum + intercept

## 5. Model Training

5.1. Model của thư viện

In [36]:
# Fit the library model on the training split.
regr.fit(X_train, y_train)

In [37]:
# Print the fitted feature weights and the intercept of the library model.
print("[w1, ..., w_n] = ", regr.coef_)
print("w0 = ", regr.intercept_)

[w1, ..., w_n] =  [ 4.48674910e-01  9.72425752e-03 -1.23323343e-01  7.83144907e-01
 -2.02962058e-06 -3.52631849e-03 -4.19792487e-01 -4.33708065e-01]
w0 =  -37.02327770606397


5.2. Model tự viết

In [38]:
# Fit the hand-written least-squares model on the training split and print its
# weights in the same format as the library model's, for side-by-side comparison.
coef, intercept = linear_regression(X_train, y_train)
print("[w1, ..., w_n] = ", coef)
print("w0 = ", intercept)

[w1, ..., w_n] =  [ 4.48674910e-01  9.72425756e-03 -1.23323343e-01  7.83144908e-01
 -2.02962050e-06 -3.52631849e-03 -4.19792484e-01 -4.33708062e-01]
w0 =  -37.02327744153445


## 6. Prediction

6.1. Model của thư viện

In [40]:
# Predict on the test split with the library model, then tabulate the true
# value, the prediction and their absolute difference for each sample.
y_pred_lib = regr.predict(X_test)
pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred_lib,
        'Difference': np.abs(y_test - y_pred_lib),
    }
)

Unnamed: 0,Actual,Predicted,Difference
0,0.47700,0.719123,0.242123
1,0.45800,1.764017,1.306017
2,5.00001,2.709659,2.290351
3,2.18600,2.838926,0.652926
4,2.78000,2.604657,0.175343
...,...,...,...
4123,2.63300,1.991746,0.641254
4124,2.66800,2.249839,0.418161
4125,5.00001,4.468770,0.531240
4126,0.72300,1.187511,0.464511


6.2. Model tự viết

In [41]:
# Predict on the test split with the hand-written model, then tabulate the
# true value, the prediction and their absolute difference for each sample.
# The table must be built from y_pred (this model), not y_pred_lib.
y_pred = predict(X_test, coef, intercept)
pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred,
        'Difference': np.abs(y_test - y_pred),
    }
)

Unnamed: 0,Actual,Predicted,Difference
0,0.47700,0.719123,0.242123
1,0.45800,1.764017,1.306017
2,5.00001,2.709659,2.290351
3,2.18600,2.838926,0.652926
4,2.78000,2.604657,0.175343
...,...,...,...
4123,2.63300,1.991746,0.641254
4124,2.66800,2.249839,0.418161
4125,5.00001,4.468770,0.531240
4126,0.72300,1.187511,0.464511


6.3. Đánh giá model của thư viện và model tự viết

In [42]:
# Root mean squared error of the library model's predictions on the test split.
loss_lib = math.sqrt(mean_squared_error(y_test, y_pred_lib))
print("RMSE: ",loss_lib)

RMSE:  0.745581383012775


In [43]:
# Root mean squared error of the hand-written model's predictions on the test
# split, for comparison with the library model's RMSE.
loss = math.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", loss)

RMSE:  0.745581382996425
