## 다중회귀

In [4]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mglearn

# 1. 데이터 불러오기
data = load_boston()
df1 = pd.DataFrame(data['data'], columns=data['feature_names'])
df1['target'] = data['target']
df1.head()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [None]:
# 상관계수 행렬
df1.corr()

In [None]:
# 산포도 그리기
sns.set_style()
sns.pairplot(df1, palette='dark')
plt.show() # 관계성있는 필드만 선택해서 그려야한다.

In [29]:
# 데이터 쪼개기
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], test_size=0.3, random_state=1)
Lin = LinearRegression()
Lin.fit(X_train, y_train)
y_pred = Lin.predict(X_test)

X_train.shape, y_train.shape

# score
print('------------------LinearRegression-------------------')
print('훈련셋 : ', Lin.score(X_train, y_train))
print('테스트셋 : ', Lin.score(X_test, y_test))

from sklearn.linear_model import Ridge
rid = Ridge(alpha=0.01) # 일반적으로 alpha의 값이 높아질수록 과대적합이 줄어든다
rid.fit(X_train, y_train)
print('------------------Ridge-------------------')
print('훈련셋 : ', rid.score(X_train, y_train))
print('테스트셋 : ', rid.score(X_test, y_test))

from sklearn.linear_model import Lasso
las = Lasso(alpha=0.01)
las.fit(X_train, y_train)
print('------------------Lasso-------------------')
print('훈련셋 : ', las.score(X_train, y_train))
print('테스트셋 : ', las.score(X_test, y_test))

from sklearn.linear_model import ElasticNet
ela = ElasticNet(alpha=0.01)
ela.fit(X_train, y_train)
print('------------------ElasticNet-------------------')
print('훈련셋 : ', ela.score(X_train, y_train))
print('테스트셋 : ', ela.score(X_test, y_test))

------------------LinearRegression-------------------
훈련셋 :  0.7103879080674731
테스트셋 :  0.7836295385076297
------------------Ridge-------------------
훈련셋 :  0.7103864838468883
테스트셋 :  0.7837991009786622
------------------Lasso-------------------
훈련셋 :  0.7099491416782446
테스트셋 :  0.7855180427558521
------------------ElasticNet-------------------
훈련셋 :  0.702944045303391
테스트셋 :  0.7886720224748982


## mglearn 모듈

In [1]:
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
Lin = LinearRegression()
Lin.fit(X_train, y_train)
y_pred = Lin.predict(X_test)

# score
print('------------------LinearRegression-------------------')
print('훈련셋 : ', Lin.score(X_train, y_train))
print('테스트셋 : ', Lin.score(X_test, y_test))

# L2 규제 : 가중치의 값을 0에 가깝게 규제를 가한다 (ridge)
from sklearn.linear_model import Ridge
rid = Ridge(alpha=0.01) # 일반적으로 alpha의 값이 높아질수록 과대적합이 줄어든다
rid.fit(X_train, y_train)
print('------------------Ridge-------------------')
print('훈련셋 : ', rid.score(X_train, y_train))
print('테스트셋 : ', rid.score(X_test, y_test))

# L1 규제 : 불필요한 피처 제거
from sklearn.linear_model import Lasso
las = Lasso(alpha=0.01)
las.fit(X_train, y_train)
print('------------------Lasso-------------------')
print('훈련셋 : ', las.score(X_train, y_train))
print('테스트셋 : ', las.score(X_test, y_test))

# 엘라스틱넷 : L1, L2규제 합침
from sklearn.linear_model import ElasticNet
ela = ElasticNet(alpha=0.01)
ela.fit(X_train, y_train)
print('------------------ElasticNet-------------------')
print('훈련셋 : ', ela.score(X_train, y_train))
print('테스트셋 : ', ela.score(X_test, y_test))

# RandomForest방식
from sklearn.ensemble import RandomForestRegressor
fore = RandomForestRegressor(max_depth=6)
fore.fit(X_train, y_train)
print('------------------RandomForestRegressor-------------------')
print('훈련셋 : ', fore.score(X_train, y_train))
print('테스트셋 : ', fore.score(X_test, y_test))

# GradiantBoost
from sklearn.ensemble import GradientBoostingRegressor
grd = GradientBoostingRegressor(max_depth=1)
grd.fit(X_train, y_train)
print('------------------GradientBoostingRegressor-------------------')
print('훈련셋 : ', grd.score(X_train, y_train))
print('테스트셋 : ', grd.score(X_test, y_test))

# SVM
from sklearn.svm import SVR
sv = SVR(kernel='linear')
sv.fit(X_train, y_train)
print('------------------SVR-------------------')
print('훈련셋 : ', sv.score(X_train, y_train))
print('테스트셋 : ', sv.score(X_test, y_test))

NameError: name 'mglearn' is not defined