### 데이터 불러오기

In [2]:
import pandas as pd

# auto.csv from the ralbu85/DataScience_2022S GitHub repository; read over HTTP each run.
AUTO_CSV_URL = 'https://raw.githubusercontent.com/ralbu85/DataScience_2022S/master/data/auto.csv'
df = pd.read_csv(AUTO_CSV_URL)

In [3]:
# Bare expression: renders the loaded frame as this cell's output.
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,8,307.0,130.0,3504.0,12.0,70,1,18.0
1,8,350.0,165.0,3693.0,11.5,70,1,15.0
2,8,318.0,150.0,3436.0,11.0,70,1,18.0
3,8,304.0,150.0,3433.0,12.0,70,1,16.0
4,8,302.0,140.0,3449.0,10.5,70,1,17.0
...,...,...,...,...,...,...,...,...
387,4,140.0,86.0,2790.0,15.6,82,1,27.0
388,4,97.0,52.0,2130.0,24.6,82,2,44.0
389,4,135.0,84.0,2295.0,11.6,82,1,32.0
390,4,120.0,79.0,2625.0,18.6,82,1,28.0


### 데이터 나누기

In [4]:
# One-hot encode the categorical `origin` column, then split into target and features.
# The encoded frame gets its own name instead of overwriting `df`: re-running this cell
# then still finds the `origin` column (the in-place version raises KeyError on a second
# run), and `df` keeps matching what the load cell displayed.
df_encoded = pd.get_dummies(df, columns=['origin'])
y = df_encoded['mpg']                  # regression target
X = df_encoded.drop(columns=['mpg'])   # every remaining column is a feature

### 교차검증

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate

# Baseline: 5-fold cross-validation of ordinary least squares on X / y.
# shuffle=True without a seed draws a different partition on every run, so the
# reported scores could never be reproduced; random_state pins the fold assignment.
reg = LinearRegression()
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
result = cross_validate(
    estimator=reg,
    X=X, y=y,
    cv=kfold,
    scoring=['neg_mean_squared_error', 'r2'],  # MSE is reported negated (higher is better)
    return_train_score=True,                   # also emit train_* scores next to test_*
)
result

{'fit_time': array([0.00373125, 0.00226021, 0.00216913, 0.00210094, 0.00205207]),
 'score_time': array([0.00187898, 0.00154972, 0.0014739 , 0.00149202, 0.00149417]),
 'test_neg_mean_squared_error': array([-11.46283804, -13.85848519,  -8.10628732, -12.09270151,
        -11.10674263]),
 'train_neg_mean_squared_error': array([-10.59570757,  -9.9817114 , -11.356233  , -10.4559514 ,
        -10.65712132]),
 'test_r2': array([0.79122457, 0.81693436, 0.85345058, 0.79326894, 0.80055544]),
 'train_r2': array([0.82976112, 0.82429999, 0.81678614, 0.82945843, 0.82597343])}

### 하이퍼패러미터 변경(차수 K)

In [6]:
from sklearn.preprocessing import PolynomialFeatures

# Sweep the polynomial degree and compare mean train/test scores per degree.
# The splitter is seeded: an unseeded shuffling KFold reshuffles on every
# cross_validate call, so each degree would be scored on a different partition
# and the comparison between degrees would be confounded by fold noise.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for degree in range(1, 5):
    transformer = PolynomialFeatures(degree=degree, interaction_only=False)
    X_ = transformer.fit_transform(X)  # expand X with polynomial terms up to `degree`
    reg = LinearRegression()
    result = cross_validate(
        estimator=reg,
        X=X_, y=y,
        cv=kfold,
        scoring=['neg_mean_squared_error', 'r2'],
        return_train_score=True,
    )
    print(degree, 'train', result['train_neg_mean_squared_error'].mean(), result['train_r2'].mean())
    print(degree, 'test', result['test_neg_mean_squared_error'].mean(), result['test_r2'].mean())

1 train -10.607205009720055 0.8253061295023896
1 test -11.349723824804954 0.8103488711181356
2 train -6.910045047780294 0.8858171196693986
2 test -9.822785411964256 0.8346052081329598
3 train -6.104209438134539 0.8994430086541383
3 test -1381.6842878690616 -22.465862659250455
4 train -0.30514556589124686 0.9950371610706641
4 test -749203.7933400336 -15152.551470166432


### Regularization

In [7]:
from sklearn.linear_model import Ridge, Lasso

# L1-penalised regression with a small penalty, trained on all of X / y;
# the cell's output is the fitted coefficient vector.
reg = Lasso(alpha=0.01).fit(X, y)
reg.coef_

array([-0.44629638,  0.02286369, -0.01762971, -0.00671945,  0.07651421,
        0.77615517, -2.56966676,  0.        ,  0.16691861])

### Lambda의 크기에 따른 계수의 변화를 살펴보자

In [None]:
import matplotlib.pyplot as plt

# Trace how each coefficient changes as the Lasso penalty (lambda, sklearn's `alpha`) grows.
# The original cell opened a figure but never drew on it; the coefficient paths are
# now collected in the loop and plotted afterwards.
lambdas = [0, 0.0001, 0.001, 0.01, 0.1, 1]
coef_paths = []
for lam in lambdas:
    # alpha=0 is plain least squares; scikit-learn advises against Lasso(alpha=0)
    # for numerical reasons and recommends LinearRegression for that case.
    reg = LinearRegression() if lam == 0 else Lasso(alpha=lam)
    reg.fit(X, y)
    print(lam, reg.coef_)
    coef_paths.append(reg.coef_)

fig, ax = plt.subplots(figsize=(12, 8))
# Evenly spaced positions: the grid contains 0, which a log-scaled axis cannot show.
positions = list(range(len(lambdas)))
# zip(*coef_paths) regroups the per-lambda vectors into one series per feature.
for feature_name, path in zip(X.columns, zip(*coef_paths)):
    ax.plot(positions, path, marker='o', label=feature_name)
ax.axhline(0, color='gray', linewidth=0.8)  # reference line: coefficients shrunk to zero
ax.set_xticks(positions)
ax.set_xticklabels([str(lam) for lam in lambdas])
ax.set(xlabel='lambda (alpha)', ylabel='coefficient',
       title='Lasso coefficients vs. regularization strength')
ax.legend()
plt.show()

In [8]:
import warnings

# Hyperparameter sweep: Lasso regularization strength (lambda, sklearn's `alpha`).
# The splitter is seeded so every alpha is scored on the same five folds; an unseeded
# shuffling KFold reshuffles on each cross_validate call, which mixes fold noise
# into the comparison and makes the printed scores irreproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
lambdas = [0, 0.001, 0.01, 0.1, 0.5, 1, 10, 100]

# Silence warnings for this sweep only, instead of muting them for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for lam in lambdas:
        # alpha=0 is plain least squares; scikit-learn advises against Lasso(alpha=0)
        # for numerical reasons and recommends LinearRegression for that case.
        reg = LinearRegression() if lam == 0 else Lasso(alpha=lam)
        result = cross_validate(
            estimator=reg,
            X=X, y=y,
            cv=kfold,
            scoring=['neg_mean_squared_error', 'r2'],
            return_train_score=True,
        )
        print(lam, 'train', result['train_neg_mean_squared_error'].mean(), result['train_r2'].mean())
        print(lam, 'test', result['test_neg_mean_squared_error'].mean(), result['test_r2'].mean())

0 train -10.60616242975533 0.8250592933629616
0 test -11.398004469543324 0.8064949186926779
0.001 train -10.612301163709967 0.8250738228333434
0.001 test -11.318461679191559 0.8084836993816914
0.01 train -10.6349526681579 0.8249745194535784
0.01 test -11.128519863187346 0.8170716160642343
0.1 train -10.730618032336933 0.8230810275710005
0.1 test -11.54026281802042 0.8044197757052196
0.5 train -11.632795375132098 0.8085291119846959
0.5 test -11.919633747696839 0.8026985453480293
1 train -11.703533582925187 0.8071684673048398
1 test -11.930480164163303 0.800266443686732
10 train -18.012494493035586 0.7031497388155461
10 test -18.503062867254346 0.6885051489001641
100 train -18.67615687293585 0.6924784789017805
100 test -18.8093604943327 0.6873445261538974
