## Import thư viện

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mser
from sklearn.model_selection import KFold, train_test_split

## Đọc dữ liệu

In [2]:
# Source CSV for the wine dataset; fields in the file are separated by ';'.
url = (
    "https://raw.githubusercontent.com/ngochai-hcmus/Linear_Regression/main/wine.csv"
)
df = pd.read_csv(url, sep=";")
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1194,7.0,0.745,0.12,1.8,0.114,15.0,64,0.99588,3.22,0.59,9.5,6
1195,6.2,0.430,0.22,1.8,0.078,21.0,56,0.99633,3.52,0.60,9.5,6
1196,7.9,0.580,0.23,2.3,0.076,23.0,94,0.99686,3.21,0.58,9.5,6
1197,7.7,0.570,0.21,1.5,0.069,4.0,9,0.99458,3.16,0.54,9.8,6


## Tách dữ liệu

In [3]:
# Names of the 11 predictor columns, in the order used to build the design matrix.
x_cols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# Name of the target column, kept as a list so df[y_cols] stays two-dimensional.
y_cols = ["quality"]

In [4]:
# Design matrix: every row of df, restricted to the predictor columns, as a NumPy array.
X = df.loc[:, x_cols].to_numpy()
X

array([[ 7.4 ,  0.7 ,  0.  , ...,  3.51,  0.56,  9.4 ],
       [ 7.8 ,  0.88,  0.  , ...,  3.2 ,  0.68,  9.8 ],
       [ 7.8 ,  0.76,  0.04, ...,  3.26,  0.65,  9.8 ],
       ...,
       [ 7.9 ,  0.58,  0.23, ...,  3.21,  0.58,  9.5 ],
       [ 7.7 ,  0.57,  0.21, ...,  3.16,  0.54,  9.8 ],
       [ 7.7 ,  0.26,  0.26, ...,  3.15,  0.79, 10.9 ]])

In [5]:
# Target as a pandas Series (used later for train/test splitting) ...
y_label = df["quality"]
# ... and as a 2-D column array (used by the closed-form fit).
y = df.loc[:, y_cols].to_numpy()
y

array([[5],
       [5],
       [5],
       ...,
       [6],
       [6],
       [6]], dtype=int64)

## Câu a: Xây dựng mô hình sử dụng toàn bộ 11 đặc trưng
## $y=\theta_1x_1+\theta_2x_2+\dots+\theta_{11}x_{11}$

In [6]:
def fitting(A, b):
    """Return the ordinary-least-squares coefficients for ``A @ theta ≈ b``.

    Parameters
    ----------
    A : array-like of shape (n_samples, n_features)
        Design matrix. No intercept column is added here; include a column
        of ones in ``A`` if an intercept is wanted.
    b : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values.

    Returns
    -------
    numpy.ndarray
        Coefficients of shape (n_features,) or (n_features, n_targets),
        following the dimensionality of ``b``.
    """
    # Solve the least-squares problem directly instead of forming and
    # inverting A^T A: explicit inversion squares the condition number and
    # raises LinAlgError when columns are collinear, whereas lstsq returns
    # the same solution for full-rank A and the minimum-norm one otherwise.
    theta, _residuals, _rank, _singular_values = np.linalg.lstsq(A, b, rcond=None)
    return theta

In [7]:
# Fit on all 11 feature columns; X carries no column of ones, so the fitted model has no intercept term.
fitting(X,y)

array([[ 5.92516137e-03],
       [-1.10803754e+00],
       [-2.63046284e-01],
       [ 1.53222831e-02],
       [-1.73050274e+00],
       [ 3.80141908e-03],
       [-3.89899869e-03],
       [ 4.33858768e+00],
       [-4.58535475e-01],
       [ 7.29718662e-01],
       [ 3.08858648e-01]])

## Model y = 0.00592516137X1 - 1.10803754X2 - 0.263046284X3 + 0.0153222831X4 - 1.73050274X5 + 0.00380141908X6 - 0.00389899869X7 + 4.33858768X8 - 0.458535475X9 + 0.729718662X10 + 0.308858648X11

## Câu b: Xây dựng mô hình sử dụng 1 đặc trưng cho kết quả tốt nhất (Phương pháp Cross Validation)

In [8]:
# Score every feature on its own: fit a one-feature LinearRegression and
# record its mean 10-fold cross-validated MSE on the training split.

# The fold definition does not depend on the feature, so build it once.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)

cv = []
for feature in x_cols:
    # Feature-local names are used on purpose: the global `X` holds the full
    # 11-feature matrix and must not be overwritten by a single column.
    X_feature = df[[feature]].values  # shape (n_samples, 1)
    # Only the training part is cross-validated; the held-out part is not
    # used for choosing the feature.
    X_feat_train, X_feat_test, y_feat_train, y_feat_test = train_test_split(
        X_feature, y_label, random_state=42
    )
    y_feat_train = y_feat_train.values.reshape(-1, 1)
    cv_results = model_selection.cross_val_score(
        LinearRegression(),
        X_feat_train,
        y_feat_train,
        cv=kfold,
        scoring='neg_mean_squared_error',
    )
    # The scorer returns negated MSE, so flip the sign to report a positive error.
    error = -cv_results.mean()
    cv.append([feature, error])

pred = pd.DataFrame(cv, columns=['Feature', 'Error'])
pred

Unnamed: 0,Feature,Error
0,fixed acidity,0.655508
1,volatile acidity,0.571412
2,citric acid,0.634073
3,residual sugar,0.668661
4,chlorides,0.660162
5,free sulfur dioxide,0.669115
6,total sulfur dioxide,0.634714
7,density,0.649341
8,pH,0.66744
9,sulphates,0.642987


### alcohol có sai số nhỏ nhất => alcohol cho kết quả tốt nhất

## c. Xây dựng một mô hình riêng

### Xây dựng mô hình RidgeCV sử dụng thuật toán Ridge Regression

In [9]:
from sklearn.linear_model import Ridge, RidgeCV

In [10]:
# Rebuild the full feature matrix and target array from df, then split off a test set.
X = df.loc[:, x_cols].to_numpy()
y = df.loc[:, y_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_label,
    random_state=42,
)

In [11]:
# Candidate regularization strengths: 30 values log-spaced between 1e-10 and 1e10
alphas = np.logspace(-10, 10, 30)

# Create the RidgeCV model with the candidate alphas and fit it on the training split
model = RidgeCV(alphas=alphas)
model.fit(X_train, y_train)

# Evaluate on the held-out split. `mser` is sklearn's mean_squared_error,
# so the reported value is the MSE (not its square root).
y_pred = model.predict(X_test)
mse = mser(y_test, y_pred)
print("Error:", mse)

Error: 0.38449067034336926


### Tham khảo:
#### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
#### https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
#### https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html
#### https://medium.com/@rishabh_roy/predicting-best-quality-of-wine-using-linear-regression-and-pytorch-a5bda59e43c2