### **Polynomial Regression**

In [None]:
from sklearn.metrics import r2_score, mean_squared_error as mse, mean_absolute_error as mae
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from scipy.stats import pearsonr
# Read both match spreadsheets from the drive/MyDrive folder and discard
# incomplete rows in the same expression. Any row containing a missing value
# is removed; the zero-filling alternative (fillna(0)) is left disabled.
data9 = pd.read_excel('drive/MyDrive/data_match9.xlsx', engine='openpyxl').dropna()
data10 = pd.read_excel('drive/MyDrive/data_match10.xlsx', engine='openpyxl').dropna()

# Progress message shown once both frames are loaded and cleaned.
print('loading model...')



loading model...


In [None]:
def _features_and_target(frame):
    """Split one match DataFrame into (features, target) arrays.

    Features are the columns from position 6 up to, but not including, the
    last column; the target is the column at position 4.
    """
    return frame.iloc[:, 6:-1].values, frame.iloc[:, 4].values


X_9, y_9 = _features_and_target(data9)
X_10, y_10 = _features_and_target(data10)

# Stack both datasets row-wise into a single feature matrix / target vector.
X = np.concatenate([X_9, X_10], axis=0)
y = np.concatenate([y_9, y_10], axis=0)

# Split into a training part and a 20% validation part; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=0, test_size=0.2)

# One shape per line: train features, train target, val features, val target.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, sep='\n')


(129548, 18)
(129548,)
(32388, 18)
(32388,)


# **Linear Regression**

In [None]:
print('Training...')

# Fit an ordinary least-squares baseline on the raw (non-expanded) features.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Training-set metrics, all computed from a single set of predictions.
predicted_train = lr_model.predict(X_train)
corr_train, p_value_train = pearsonr(y_train, predicted_train)
# r2_score on the stored predictions yields the same value as lr_model.score.
print('Train R2 score: ', r2_score(y_train, predicted_train))
print('Train RMSE:', np.sqrt(mse(y_train, predicted_train)))
print('Train MAE:', mae(y_train, predicted_train))
print('Train Pearson r:', corr_train, p_value_train)

print('Validating...')

# Same metrics on the held-out validation split.
predicted_val = lr_model.predict(X_val)
corr_val, p_value_val = pearsonr(y_val, predicted_val)
print('Validate R2 score:', r2_score(y_val, predicted_val))
print('Validate RMSE:', np.sqrt(mse(y_val, predicted_val)))
print('Validate MAE:', mae(y_val, predicted_val))
print('Validate Pearson r:', corr_val, p_value_val)




Training...
Train R2 score:  0.11008462592671153
Train RMSE: 2.241021042464224
Train MAE: 0.6117372505098349
Train Pearson r: 0.33179003289235726 0.0
Validating...
Validate R2 score: 0.09374820351055368
Validate RMSE: 2.2104013250744132
Validate MAE: 0.6177014732489081
Validate Pearson r: 0.3073711985403972 0.0


# **Polynomial Regression**

In [None]:
print('Training...')

# Expand the features into all polynomial terms up to degree 3. The
# transformer is fitted on the training split only and reused for validation.
poly_model = PolynomialFeatures(degree=3)
X_train_poly = poly_model.fit_transform(X_train)

# Polynomial regression = plain linear regression on the expanded features.
pr_model = LinearRegression()
pr_model.fit(X_train_poly, y_train)
predicted_train = pr_model.predict(X_train_poly)

# r2_score on the stored predictions yields the same value as pr_model.score.
print('Train R2 score: ', r2_score(y_train, predicted_train))
print('Train RMSE:', np.sqrt(mse(y_train, predicted_train)))
corr_train, p_value_train = pearsonr(y_train, predicted_train)
print('Train MAE:', mae(y_train, predicted_train))
print('Train Pearson r:', corr_train, p_value_train)


print('Validating...')

# Apply the already-fitted expansion to the validation features, then predict.
X_val_poly = poly_model.transform(X_val)
predicted_val = pr_model.predict(X_val_poly)

print('Validate R2 score:', r2_score(y_val, predicted_val))
print('Validate RMSE:', np.sqrt(mse(y_val, predicted_val)))
corr_val, p_value_val = pearsonr(y_val, predicted_val)
print('Validate MAE:', mae(y_val, predicted_val))
print('Validate Pearson r:', corr_val, p_value_val)



Training...
Train R2 score:  0.21330405145152231
Train RMSE: 2.1070509456968556
Train MAE: 0.5905071015815896
Train Pearson r: 0.4618958102732197 0.0
Validating...
Validate R2 score: 0.15255869838676805
Validate RMSE: 2.1374772702281155
Validate MAE: 0.6142307477284974
Validate Pearson r: 0.40070625026619594 0.0


In [None]:
# Show the validation predictions currently held in `predicted_val`, i.e. the
# ones assigned by whichever model cell above ran last. NumPy abbreviates long
# arrays when printed, so only the first and last few values appear.
print(predicted_val)

[ 3.18684732e-01  2.01304652e+00 -2.57781428e-01 ... -6.91500325e+00
  4.60536779e-03  2.31223159e+00]


In [None]:
# Choose the polynomial degree by cross-validated grid search, then report
# train and validation metrics for the selected model.

# Feature expansion and linear regression are wrapped in one pipeline so the
# expansion is re-fitted inside every cross-validation fold.
pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('linear', LinearRegression())
])
# Candidate degrees to compare.
parameters = {
    'poly__degree': [1, 2, 3]
}

# 5-fold grid search. No `scoring` is given, so candidates are ranked by the
# pipeline's own score method (R^2 for a regressor).
grid_search = GridSearchCV(pipeline, parameters, cv=5)
grid_search.fit(X_train, y_train)

# Print the best degree found.
print('Bậc tối ưu:', grid_search.best_params_['poly__degree'])

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# re-fitted on all of X_train. Fitting it a second time would only repeat an
# expensive fit and produce the same model, so no extra fit is done here.
best_pr = grid_search.best_estimator_

# Performance on the training split.
y_pred_train = best_pr.predict(X_train)
rmse = np.sqrt(mse(y_train, y_pred_train))
r2 = r2_score(y_train, y_pred_train)
corr_train, p_value_train = pearsonr(y_train, y_pred_train)
print('Train R2 score:', r2)
print('Train RMSE:', rmse)
print('Train MAE:', mae(y_train, y_pred_train))
print('Train Pearson r:', corr_train, p_value_train)


# Performance on the validation split. `rmse` and `r2` are reassigned, so
# after this cell they hold the validation values.
y_pred_val = best_pr.predict(X_val)
rmse = np.sqrt(mse(y_val, y_pred_val))
r2 = r2_score(y_val, y_pred_val)
corr_val, p_value_val = pearsonr(y_val, y_pred_val)
print('Validate R2 score:', r2)
print('Validate RMSE:', rmse)
print('Validate MAE:', mae(y_val, y_pred_val))
print('Validate Pearson r:', corr_val, p_value_val)


Bậc tối ưu: 3
Train R2 score: 0.21330405145152231
Train RMSE: 2.1070509456968556
Train MAE: 0.5905071015815896
Train Pearson r: 0.4618958102732197 0.0
Validate R2 score: 0.15255869838676805
Validate RMSE: 2.1374772702281155
Validate MAE: 0.6142307477284974
Validate Pearson r: 0.40070625026619594 0.0


In [None]:
# Visualization of the results (currently disabled).
# NOTE(review): X_val has 18 feature columns, so it cannot be scattered
# directly against the 1-D y_val, and X_val_poly is not a usable x-axis for a
# line plot either. Pick a single feature, or plot predicted vs. actual
# values, before re-enabling this.
# plt.scatter(X_val, y_val, color='blue')
# plt.plot(X_val_poly, y_pred_val, color='red')
# plt.title('Polynomial Regression')
# plt.xlabel('Features')
# plt.ylabel('Estimation')
# plt.show()
# Display the hyper-parameter combination selected by the grid search.
print(grid_search.best_params_)

{'poly__degree': 3}


In [None]:
# Rebind every model/transformer name to None so this notebook no longer
# references those objects through these names.
# NOTE(review): `grid_search` is not cleared here and still holds its best
# estimator, and the expanded matrices X_train_poly / X_val_poly also remain
# bound, so memory may not actually be released by this cell alone.
pr_model = poly = poly_model = lr_model = best_pr = None
print()



