In [555]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import statsmodels.api as sm


In [557]:
df = sns.load_dataset("penguins")

In [560]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [537]:
penguins_X = df[["bill_length_mm", "flipper_length_mm"]]
penguins_Y = df[["body_mass_g"]]

In [539]:
ols_formula = "body_mass_g ~ bill_length_mm + flipper_length_mm"

In [541]:
ols_data = pd.concat([penguins_X, penguins_Y], axis =1)
OLS = ols(formula = ols_formula, data = ols_data)
model = OLS.fit()

In [543]:
model.summary()

0,1,2,3
Dep. Variable:,body_mass_g,R-squared:,0.76
Model:,OLS,Adj. R-squared:,0.759
Method:,Least Squares,F-statistic:,536.6
Date:,"Thu, 19 Sep 2024",Prob (F-statistic):,9.09e-106
Time:,16:21:06,Log-Likelihood:,-2527.7
No. Observations:,342,AIC:,5061.0
Df Residuals:,339,BIC:,5073.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5736.8972,307.959,-18.629,0.000,-6342.649,-5131.146
bill_length_mm,6.0475,5.180,1.168,0.244,-4.141,16.236
flipper_length_mm,48.1449,2.011,23.939,0.000,44.189,52.101

0,1,2,3
Omnibus:,5.475,Durbin-Watson:,2.099
Prob(Omnibus):,0.065,Jarque-Bera (JB):,5.408
Skew:,0.308,Prob(JB):,0.0669
Kurtosis:,3.025,Cond. No.,2980.0


**nhận xét:**
1. chỉ số R-squared của mô hình tuyến tính đa biến giải thích tốt hơn 0,760 có nghĩa là khoảng 76% sự biến động phụ thuộc (body_mass_g) được giải thích bởi mô hình cho thấy mô hình phù hợp tốt ngay cả khi đã điều chỉnh cho số lượng biến giải thích
2. chỉ số R-squared của mô hình tuến tính đơn biến có chỉ số là 0.354 có nghĩa mô hình giải thích được khoảng 35.4% sự biến thiên của biến phụ thuộc đây không phải là 1 giá trị cao cho thấy mô hình này chỉ giải thích một phần tương đối nhỏ sự biến động 

In [457]:
#one hot encoding
print(df['sex'].unique())
print(df['species'].unique())

['Male' 'Female' nan]
['Adelie' 'Chinstrap' 'Gentoo']


In [459]:
one_hot_encoded_data = pd.get_dummies(df, columns = ['sex', 'species'])
print(one_hot_encoded_data)

        island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0    Torgersen            39.1           18.7              181.0       3750.0   
1    Torgersen            39.5           17.4              186.0       3800.0   
2    Torgersen            40.3           18.0              195.0       3250.0   
3    Torgersen             NaN            NaN                NaN          NaN   
4    Torgersen            36.7           19.3              193.0       3450.0   
..         ...             ...            ...                ...          ...   
339     Biscoe             NaN            NaN                NaN          NaN   
340     Biscoe            46.8           14.3              215.0       4850.0   
341     Biscoe            50.4           15.7              222.0       5750.0   
342     Biscoe            45.2           14.8              212.0       5200.0   
343     Biscoe            49.9           16.1              213.0       5400.0   

     sex_Female  sex_Male  

In [461]:
ols_formulas = "body_mass_g ~ bill_length_mm + flipper_length_mm + sex_Female + sex_Male"

In [463]:
penguins_X = one_hot_encoded_data[["bill_length_mm", "flipper_length_mm", "sex_Female", "sex_Male"]]
penguins_Y = one_hot_encoded_data[["body_mass_g"]]

In [465]:
ols_datas = pd.concat([penguins_X, penguins_Y], axis = 1)
OLS = ols(formula = ols_formulas, data = ols_datas)
model = OLS.fit()

In [467]:
model.summary()

0,1,2,3
Dep. Variable:,body_mass_g,R-squared:,0.804
Model:,OLS,Adj. R-squared:,0.801
Method:,Least Squares,F-statistic:,344.9
Date:,"Thu, 19 Sep 2024",Prob (F-statistic):,1e-117
Time:,15:54:53,Log-Likelihood:,-2493.4
No. Observations:,342,AIC:,4997.0
Df Residuals:,337,BIC:,5016.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5285.4104,308.339,-17.142,0.000,-5891.922,-4678.899
sex_Female[T.True],-61.9025,122.510,-0.505,0.614,-302.882,179.077
sex_Male[T.True],298.0199,123.564,2.412,0.016,54.967,541.073
bill_length_mm,-4.3828,4.859,-0.902,0.368,-13.941,5.176
flipper_length_mm,47.5979,1.827,26.054,0.000,44.004,51.191

0,1,2,3
Omnibus:,0.541,Durbin-Watson:,1.781
Prob(Omnibus):,0.763,Jarque-Bera (JB):,0.658
Skew:,0.076,Prob(JB):,0.72
Kurtosis:,2.849,Cond. No.,3380.0


**nhận xét**
việc sử dụng kĩ thuật one hot encoding làm tang giá trị R-squared 

**kĩ thuật label encoding**
Label Encoding chỉ phù hợp khi các giá trị phân loại không có thứ tự cụ thể. Nếu các giá trị có thứ tự (ordinal), ta cần xem xét sử dụng Ordinal Encoding hoặc One-Hot Encoding tùy vào trường hợp cụ thể.
Khi áp dụng Label Encoding, mô hình học máy có thể hiểu các nhãn mã hóa này như các con số có thứ tự, dẫn đến ảnh hưởng đến độ chính xác của mô hình (đặc biệt với các thuật toán học có giám sát dựa trên khoảng cách như k-NN).

In [471]:
label_encoder = LabelEncoder()

In [473]:
df['sex_encoded'] = label_encoder.fit_transform(df['sex'])
df['species_encoded'] = label_encoder.fit_transform(df['species'])
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,sex_encoded,species_encoded
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1,0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,0
3,Adelie,Torgersen,,,,,,2,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0,0
...,...,...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,,2,2
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,0,2
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,1,2
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,0,2


In [475]:
ols_formulass = "body_mass_g ~ bill_length_mm + flipper_length_mm + sex_encoded"

In [477]:
penguins_X = df[["bill_length_mm", "flipper_length_mm", "sex_encoded"]]
penguins_Y = df[["body_mass_g"]]

In [479]:
ols_datass = pd.concat([penguins_X, penguins_Y], axis = 1)
OLS = ols(formula = ols_formulass, data = ols_datass)
model = OLS.fit()

In [481]:
model.summary()

0,1,2,3
Dep. Variable:,body_mass_g,R-squared:,0.79
Model:,OLS,Adj. R-squared:,0.789
Method:,Least Squares,F-statistic:,424.9
Date:,"Thu, 19 Sep 2024",Prob (F-statistic):,2.69e-114
Time:,15:54:56,Log-Likelihood:,-2504.5
No. Observations:,342,AIC:,5017.0
Df Residuals:,338,BIC:,5032.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5475.9003,290.589,-18.844,0.000,-6047.490,-4904.310
bill_length_mm,-0.0270,4.924,-0.005,0.996,-9.713,9.659
flipper_length_mm,47.4561,1.885,25.181,0.000,43.749,51.163
sex_encoded,265.1100,37.834,7.007,0.000,190.690,339.530

0,1,2,3
Omnibus:,0.25,Durbin-Watson:,1.875
Prob(Omnibus):,0.883,Jarque-Bera (JB):,0.14
Skew:,0.043,Prob(JB):,0.932
Kurtosis:,3.048,Cond. No.,3000.0


In [483]:
ols_formulasss = "body_mass_g ~ bill_length_mm + flipper_length_mm + sex_encoded + species_encoded"

In [485]:
penguins_X = df[["bill_length_mm", "flipper_length_mm", "sex_encoded", "species_encoded"]]
penguins_Y = df[["body_mass_g"]]

In [487]:
ols_datasss = pd.concat([penguins_X, penguins_Y], axis = 1)
OLS = ols(formula = ols_formulasss, data = ols_datasss)
model = OLS.fit()

In [489]:
model.summary()

0,1,2,3
Dep. Variable:,body_mass_g,R-squared:,0.799
Model:,OLS,Adj. R-squared:,0.797
Method:,Least Squares,F-statistic:,335.3
Date:,"Thu, 19 Sep 2024",Prob (F-statistic):,4.5700000000000004e-116
Time:,15:54:57,Log-Likelihood:,-2497.2
No. Observations:,342,AIC:,5004.0
Df Residuals:,337,BIC:,5024.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3427.2790,605.443,-5.661,0.000,-4618.202,-2236.356
bill_length_mm,-11.8627,5.730,-2.070,0.039,-23.133,-0.592
flipper_length_mm,38.6996,2.937,13.175,0.000,32.922,44.477
sex_encoded,343.0298,42.291,8.111,0.000,259.842,426.218
species_encoded,204.9484,53.446,3.835,0.000,99.818,310.079

0,1,2,3
Omnibus:,2.307,Durbin-Watson:,1.969
Prob(Omnibus):,0.316,Jarque-Bera (JB):,2.228
Skew:,-0.074,Prob(JB):,0.328
Kurtosis:,3.367,Cond. No.,6410.0
