In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import random

# Load the dataset into a pandas DataFrame
df = pd.read_csv('D:/Day ta sai an/Khoa_teacher/practice-exercises/advertising/Housing.csv')

# Define the categorical columns
cat_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']

# Encode the categorical columns using label encoding
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Print the encoded DataFrame
print(df.head())
data = df.values

      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus  
0                 0  
1                 0  
2                 1  
3                 0  
4                 0  


In [5]:
data

array([[13300000,     7420,        4, ...,        2,        1,        0],
       [12250000,     8960,        4, ...,        3,        0,        0],
       [12250000,     9960,        3, ...,        2,        1,        1],
       ...,
       [ 1750000,     3620,        2, ...,        0,        0,        2],
       [ 1750000,     2910,        3, ...,        0,        0,        0],
       [ 1750000,     3850,        3, ...,        0,        0,        2]],
      dtype=int64)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[:,1:], data[:,1], test_size=0.33, random_state=42)
X_train.shape

(365, 12)

### **Vectorizing**

#### y = ax<sub>1</sub> + bx<sub>2</sub> + cx<sub>3</sub> ... +d
Giả sử ta có nhiều trọng số thì ta phải làm như thế nào để tối giản code lại. 

1. Ta gom các trọng số a,b,c lại thành một ma trận trọng số W:
$$
W=
\left(\begin{array}{cc} 
a\\ 
b\\
c
\end{array}\right)

, shape = (3,1)
$$ 

2. Ta gom các biến x1,x2,x3 lại thành một ma trận X: 
$$
X=
\left(\begin{array}{cc} 
x_{1_1} & x_{2_1} & x_{3_1}\\ 
x_{1_2} & x_{2_2} & x_{3_2}\\ 
x_{1_3} & x_{2_3} & x_{3_3}\\ 
... & ... & ...\\
x_{1_{12}} & x_{2_{12}} & x_{3_{12}}
\end{array}\right)

, shape = (12,3)
$$ 

3. Để tính y dự đoán ta có: $$\hat{y} = X*W + d$$
4. Tương tự như vậy, ta sẽ có ma trận của giá trị đạo hàm trọng số: 
$$
dW=
\left(\begin{array}{cc} 
dA\\ 
dB\\ 
dC\\ 
\end{array}\right)
=2*(y-\hat{y})*X
$$
$$dD =2*(y-\hat{y})
$$

5. Và cập nhật trọng số bằng cách: 
$$
W -= lr* dW
$$
$$
d -= lr* dD
$$

In [7]:
class VectorizedLinearRegression_GradientDescent:
    def __init__(self,lr=0.01,n_epoch=1000):
        self.lr=lr
        self.n_epoch=n_epoch
        self.W = np.random.rand(12,1) - 0.5
        self.d = random.random() - 0.5
    def fit(self, X,Y):
        N = len(X) # number of samples
        for i in range(self.n_epoch):
            y_hat = np.dot(X,self.W) + self.d
            f = y_hat - Y
            loss = np.sum(f**2)/N
            # Updating m and b
            self.W -= self.lr * (2 * X.T.dot(f) / N)
            self.d -= self.lr * (2 * f.sum()/N)

            print("Epoch",i+1,",loss",loss)

In [8]:
model = VectorizedLinearRegression_GradientDescent(lr=0.00001)
model.fit(X_train, np.array(y_train).reshape(-1,1))


Epoch 1 ,loss 42681775.66874281
Epoch 2 ,loss 15364620647703.94
Epoch 3 ,loss 5.530968824704111e+18
Epoch 4 ,loss 1.9910427234935604e+24
Epoch 5 ,loss 7.167372032672286e+29
Epoch 6 ,loss 2.580116501196666e+35
Epoch 7 ,loss 9.28792468062429e+40
Epoch 8 ,loss 3.343474793984679e+46
Epoch 9 ,loss 1.203586816474862e+52
Epoch 10 ,loss 4.332681757907493e+57
Epoch 11 ,loss 1.5596823559670838e+63
Epoch 12 ,loss 5.614557420644446e+68
Epoch 13 ,loss 2.0211330152649933e+74
Epoch 14 ,loss 7.275691313395245e+79
Epoch 15 ,loss 2.6191093652920515e+85
Epoch 16 ,loss 9.428291514692359e+90
Epoch 17 ,loss 3.3940041627894264e+96
Epoch 18 ,loss 1.2217764203706657e+102
Epoch 19 ,loss 4.398160844172101e+107
Epoch 20 ,loss 1.5832535714955187e+113
Epoch 21 ,loss 5.6994092768909915e+118
Epoch 22 ,loss 2.0516780565243148e+124
Epoch 23 ,loss 7.385647605078803e+129
Epoch 24 ,loss 2.6586915219444333e+135
Epoch 25 ,loss 9.570779688971893e+140
Epoch 26 ,loss 3.445297173394731e+146
Epoch 27 ,loss 1.240240920672246e+152

  loss = np.sum(f**2)/N
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  self.W -= self.lr * (2 * X.T.dot(f) / N)
