# 線性迴歸求解

## 範例1. 簡單線性迴歸
### $\begin{equation}y = wx + b\end{equation}$

In [11]:
# OLS 公式
#from IPython.display import Image
#Image('./formula/regression_wb.png')

In [12]:
# 使用 OLS 公式計算 w、b
# 載入套件
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd

# 載入資料集
df = pd.read_csv('../src/data/population.csv')

w = ((df['pop'] - df['pop'].mean()) * df['year']).sum() \
     / ((df['year'] - df['year'].mean())**2).sum()
b = df['pop'].mean() - w * df['year'].mean()

print(f'w={w}, b={b}')

w=0.06115935866155474, b=-116.35631056117153


In [13]:
# 使用 NumPy 的現成函數 polyfit()
coef = np.polyfit(df['year'], df['pop'], deg=1)
print(f'w={coef[0]}, b={coef[1]}')

w=0.06115935866155433, b=-116.35631056117064


## 矩陣計算

In [14]:
import numpy as np

X = df[['year']].values

# b = b * 1
one=np.ones((len(df), 1))

# 將 x 與 one 合併 
X = np.concatenate((X, one), axis=1)

y = df[['pop']].values

# 求解
w = np.linalg.inv(X.T @ X) @ X.T @ y
print(f'w={w[0, 0]}, b={w[1, 0]}')

w=0.06115935866154625, b=-116.35631056115469


## 以Scikit-Learn的房價資料集為例，求解線性迴歸

In [15]:
import numpy as np
from sklearn.datasets import fetch_california_housing

# 载入 California 房价数据集
data = fetch_california_housing()
X, y = data.data, data.target

# 添加一列全为1的列，用于截距项
one = np.ones((X.shape[0], 1))

# 将 x 与 one 合并
X = np.concatenate((X, one), axis=1)

# 使用正规方程求解线性回归参数
w = np.linalg.inv(X.T @ X) @ X.T @ y

print("Weights:", w)



URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

In [None]:
import numpy as np
from sklearn.datasets import load_boston

# 載入 Boston 房價資料集
X, y = load_boston(return_X_y=True)

# b = b * 1
one=np.ones((X.shape[0], 1))

# 將 x 與 one 合併 
X = np.concatenate((X, one), axis=1)

# 求解
w = np.linalg.inv(X.T @ X) @ X.T @ y
w

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


## 以Scikit-Learn的線性迴歸驗證

In [None]:
from sklearn.linear_model import LinearRegression

X, y = load_boston(return_X_y=True)

lr = LinearRegression()
lr.fit(X, y)

lr.coef_, lr.intercept_

(array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
        -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
         3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
        -5.24758378e-01]),
 36.45948838509001)

## 使用PyTorch 線性代數函數庫

In [None]:
import numpy as np
from sklearn.datasets import load_boston
import torch

# 載入 Boston 房價資料集
X, y = load_boston(return_X_y=True)

X_tensor = torch.from_numpy(X)

# b = b * 1
one=torch.ones((X.shape[0], 1))

# 將 x 與 one 合併 
X = torch.cat((X_tensor, one), axis=1)


# 求解
w = torch.linalg.inv(X.T @ X) @ X.T @ y
# w = (X.T @ X).inverse() @ X.T @ y # 也可以

w

tensor([-1.0801e-01,  4.6420e-02,  2.0559e-02,  2.6867e+00, -1.7767e+01,
         3.8099e+00,  6.9222e-04, -1.4756e+00,  3.0605e-01, -1.2335e-02,
        -9.5275e-01,  9.3117e-03, -5.2476e-01,  3.6459e+01],
       dtype=torch.float64)

In [39]:
import numpy as np
X=df[['year']].values
one=np.ones((len(df),1))
X=np.concatenate((one,X),axis=1)

y=df[['pop']].values

w=np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
print(f'b={w[0,0]},w={w[1,0]}')

b=-116.35631056116833,w=0.06115935866155193


In [42]:
import numpy as np
X=df[['year']].values
one=np.ones((len(df),1))
X=np.concatenate((X,one),axis=1)

y=df[['pop']].values

w=np.linalg.inv(X.T @ X) @ X.T@y
print(f'w={w[0,0]},b={w[1,0]}')

w=0.06115935866154625,b=-116.35631056115469


In [43]:
coef=np.polyfit(df['year'],df['pop'],deg=1)
coef

array([ 6.11593587e-02, -1.16356311e+02])

In [21]:
X

array([[1.950e+03, 1.000e+00],
       [1.951e+03, 1.000e+00],
       [1.952e+03, 1.000e+00],
       [1.953e+03, 1.000e+00],
       [1.954e+03, 1.000e+00],
       [1.955e+03, 1.000e+00],
       [1.956e+03, 1.000e+00],
       [1.957e+03, 1.000e+00],
       [1.958e+03, 1.000e+00],
       [1.959e+03, 1.000e+00],
       [1.960e+03, 1.000e+00],
       [1.961e+03, 1.000e+00],
       [1.962e+03, 1.000e+00],
       [1.963e+03, 1.000e+00],
       [1.964e+03, 1.000e+00],
       [1.965e+03, 1.000e+00],
       [1.966e+03, 1.000e+00],
       [1.967e+03, 1.000e+00],
       [1.968e+03, 1.000e+00],
       [1.969e+03, 1.000e+00],
       [1.970e+03, 1.000e+00],
       [1.971e+03, 1.000e+00],
       [1.972e+03, 1.000e+00],
       [1.973e+03, 1.000e+00],
       [1.974e+03, 1.000e+00],
       [1.975e+03, 1.000e+00],
       [1.976e+03, 1.000e+00],
       [1.977e+03, 1.000e+00],
       [1.978e+03, 1.000e+00],
       [1.979e+03, 1.000e+00],
       [1.980e+03, 1.000e+00],
       [1.981e+03, 1.000e+00],
       [

In [32]:
X = [[4, 1], [5, 2], [1, 3]]
one=np.ones((len(X),1))
X=np.concatenate((X,one),axis=1)
y = [1, 2, 3]

In [33]:
X

array([[4., 1., 1.],
       [5., 2., 1.],
       [1., 3., 1.]])

In [31]:
X = [[1, 1], [1, 2], [1, 3]]
X

[[1, 1], [1, 2], [1, 3]]

In [34]:
import numpy as np
def linear_regression_normal_equation(X: list[list[float]], y: list[float]) -> list[float]:
	X=np.array(X)
	one=np.ones((len(y),1))
	X=np.concatenate((X,one),axis=1)
	y=np.array(y)
	theta=np.linalg.inv(X.T@X)@X.T@y
	#theta = np.round(theta, 4).tolist()
	return theta

	
	# Your code here, make sure to round
    

In [None]:
lin