In [1]:
# 선형회귀 - 단순회귀분석
# 연속형 종속변수와 독립변수간 선형관계 확인
# 종속변수와 독립변수가 각각 하나 (두개면 다중회귀)
# 설명력과 더불어 오차평가 지표로 모델의 성능평가 (실제와 오차의 확인)

In [3]:
# statmodels - ols
# 선형회귀 분석을 위한 statmodels 의 함수
# ols 함수 내 종속변수와 독립변수를 선언
# ols 함수의 fit 메서드로 모델 피팅
# 변수명에 온점 등 특정 특수문자가 있으면 오류발생 ex ) Sepal.Length
# 모델 객체의 predict 메서드로 예측

In [1]:
import pandas as pd
from statsmodels.formula.api import ols

In [2]:
df = pd.read_csv("iris.csv")
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [4]:
# model = ols(formula = "Sepal.Length ~ Sepal.Width", data = df).fit()
df.columns = ["SL", "SW", "PL", "PW", "species"]
df.head(2)

Unnamed: 0,SL,SW,PL,PW,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [5]:
model = ols(formula = "SL ~ SW", data = df).fit()
model.summary() # 나쁜모델

0,1,2,3
Dep. Variable:,SL,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,2.074
Date:,"Thu, 26 Jan 2023",Prob (F-statistic):,0.152
Time:,18:13:35,Log-Likelihood:,-183.0
No. Observations:,150,AIC:,370.0
Df Residuals:,148,BIC:,376.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.5262,0.479,13.628,0.000,5.580,7.473
SW,-0.2234,0.155,-1.440,0.152,-0.530,0.083

0,1,2,3
Omnibus:,4.389,Durbin-Watson:,0.952
Prob(Omnibus):,0.111,Jarque-Bera (JB):,4.237
Skew:,0.36,Prob(JB):,0.12
Kurtosis:,2.6,Cond. No.,24.2


In [6]:
model = ols(formula = "PL ~ PW", data = df).fit()
model.summary() # 좋은모델

0,1,2,3
Dep. Variable:,PL,R-squared:,0.927
Model:,OLS,Adj. R-squared:,0.927
Method:,Least Squares,F-statistic:,1882.0
Date:,"Thu, 26 Jan 2023",Prob (F-statistic):,4.6800000000000005e-86
Time:,18:14:20,Log-Likelihood:,-101.18
No. Observations:,150,AIC:,206.4
Df Residuals:,148,BIC:,212.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0836,0.073,14.850,0.000,0.939,1.228
PW,2.2299,0.051,43.387,0.000,2.128,2.332

0,1,2,3
Omnibus:,2.438,Durbin-Watson:,1.43
Prob(Omnibus):,0.295,Jarque-Bera (JB):,1.966
Skew:,0.211,Prob(JB):,0.374
Kurtosis:,3.369,Cond. No.,3.7


In [7]:
# y = 2.2299 x + 1.0836 의 모델임 
# F-statistc : F분포 + adj
# R-squared : R 검정 (이론참조) + adj
# Coef ,P>[t] 이렇게 46정도만 보면됨


In [8]:
model.predict(df.iloc[:6, :])

0    1.529546
1    1.529546
2    1.529546
3    1.529546
4    1.529546
5    1.975534
dtype: float64

In [9]:
df["pred"] = model.predict(df)
df.head()

Unnamed: 0,SL,SW,PL,PW,species,pred
0,5.1,3.5,1.4,0.2,setosa,1.529546
1,4.9,3.0,1.4,0.2,setosa,1.529546
2,4.7,3.2,1.3,0.2,setosa,1.529546
3,4.6,3.1,1.5,0.2,setosa,1.529546
4,5.0,3.6,1.4,0.2,setosa,1.529546


In [10]:
# sklearn - LinearRegression()
# 선형회귀분석을 위한 함수
# fit_intercept 로 절편 적합여부 설정
# fit 으로 학습데이터 할당
# coef_ , intercept_ 로 계수와 절편확인가능
# predict 예측

In [None]:
# ols 로 하는거보다 조금더 강력함
# 최적화 하는 면이 좀 있음 ~ 문제에서 조정하라고 하면 이거써야됨 

In [11]:
from sklearn.linear_model import LinearRegression

In [None]:
# model = LinearRegression().fit(X = df["PL"], Y = df["PW"]) > 2차원 어레이를 넣어줘야함
# model

In [14]:
model = LinearRegression().fit(X = df[["PL"]], y = df[["PW"]]) 
model

LinearRegression()

In [15]:
model.coef_ # 기울기

array([[0.41575542]])

In [16]:
model.intercept_ # 절편

array([-0.36307552])

In [17]:
model.predict(df[["PL"]])

array([[0.21898206],
       [0.21898206],
       [0.17740652],
       [0.2605576 ],
       [0.21898206],
       [0.34370869],
       [0.21898206],
       [0.2605576 ],
       [0.21898206],
       [0.2605576 ],
       [0.2605576 ],
       [0.30213314],
       [0.21898206],
       [0.09425544],
       [0.13583098],
       [0.2605576 ],
       [0.17740652],
       [0.21898206],
       [0.34370869],
       [0.2605576 ],
       [0.34370869],
       [0.2605576 ],
       [0.0526799 ],
       [0.34370869],
       [0.42685977],
       [0.30213314],
       [0.30213314],
       [0.2605576 ],
       [0.21898206],
       [0.30213314],
       [0.30213314],
       [0.2605576 ],
       [0.2605576 ],
       [0.21898206],
       [0.2605576 ],
       [0.13583098],
       [0.17740652],
       [0.21898206],
       [0.17740652],
       [0.2605576 ],
       [0.17740652],
       [0.17740652],
       [0.17740652],
       [0.30213314],
       [0.42685977],
       [0.21898206],
       [0.30213314],
       [0.218

In [18]:
# sklearn - mean_absolute_error 
# MAE (Mean Absolute Error) 연산을 위한 함수

# sklearn - mean_squared_error
# MSE (Mean Squared Error) 연산을 위한 함수
# 해당 결과에 제곱근을 하면 RMSE 계산가능

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [20]:
mean_absolute_error(y_true = df["PL"], y_pred= df["PW"])

2.558666666666667

In [21]:
mean_squared_error(y_true = df["PL"], y_pred=df["PW"])

7.645466666666667

In [22]:
mean_squared_error(y_true = df["PL"], y_pred=df["PW"]) ** 0.5 # RMSE

2.76504370067937

In [None]:
# 문제 1

In [31]:
from sklearn.model_selection import train_test_split
df = pd.read_csv("bike.csv")
df.head(2)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [35]:
df_train , df_test = train_test_split(df, train_size=0.7, random_state=123)
df_train.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4046,2011-09-19 15:00:00,3,0,1,2,24.6,30.305,60,15.0013,44,143,187
9262,2012-09-09 07:00:00,3,0,0,1,22.14,25.76,73,11.0014,20,50,70


In [36]:
model = ols(formula = "registered ~ temp", data = df_train).fit() # registerd : 종속, temp : 독립
model.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,902.3
Date:,"Thu, 26 Jan 2023",Prob (F-statistic):,1.92e-187
Time:,18:33:20,Log-Likelihood:,-48650.0
No. Observations:,7620,AIC:,97300.0
Df Residuals:,7618,BIC:,97320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.5151,4.559,6.036,0.000,18.579,36.452
temp,6.3391,0.211,30.038,0.000,5.925,6.753

0,1,2,3
Omnibus:,2097.525,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5337.402
Skew:,1.502,Prob(JB):,0.0
Kurtosis:,5.79,Cond. No.,60.1


In [37]:
# 문제 2

In [39]:
model = ols(formula = "casual ~ atemp", data = df_train).fit() # registerd : 종속, temp : 독립
model.summary()

0,1,2,3
Dep. Variable:,casual,R-squared:,0.219
Model:,OLS,Adj. R-squared:,0.219
Method:,Least Squares,F-statistic:,2138.0
Date:,"Thu, 26 Jan 2023",Prob (F-statistic):,0.0
Time:,18:34:32,Log-Likelihood:,-39689.0
No. Observations:,7620,AIC:,79380.0
Df Residuals:,7618,BIC:,79400.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-29.2974,1.498,-19.554,0.000,-32.234,-26.360
atemp,2.7672,0.060,46.243,0.000,2.650,2.885

0,1,2,3
Omnibus:,4125.373,Durbin-Watson:,1.973
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34148.771
Skew:,2.494,Prob(JB):,0.0
Kurtosis:,12.092,Cond. No.,74.1


In [40]:
pred = model.predict(df_test)
pred[:4]

6495    31.499001
7050    12.626390
558     10.537120
5085    33.588271
dtype: float64

In [42]:
mean_squared_error(y_pred=pred, y_true = df_test["casual"]) ** 0.5

44.46237010271433

In [44]:
# 문제 3

In [45]:
df_s2 = df.loc[df["season"] == 2, :]
df_s4 = df.loc[df["season"] == 4, :]

In [46]:
df_s2_train , df_s2_test = train_test_split(df_s2, train_size=0.7, random_state=123)
df_s4_train , df_s4_test = train_test_split(df_s4, train_size=0.7, random_state=123)

In [49]:
model_s2 = ols(formula="casual ~ atemp", data = df_s2_train).fit()
model_s4 = ols(formula="casual ~ atemp", data = df_s4_train).fit()
pred_s2 = model_s2.predict(df_s2_test)
pred_s4 = model_s4.predict(df_s4_test)
RMSE_s2 = mean_squared_error(y_pred = pred_s2, y_true= df_s2_test["casual"]) ** 0.5
RMSE_s4 = mean_squared_error(y_pred = pred_s4, y_true= df_s4_test["casual"]) ** 0.5

In [52]:
abs(RMSE_s2 - RMSE_s4)

8.648423450414171