# **1. 단순 선형 회귀 분석**
- 전복의 나이를 예측하는 선형회귀모델을 생성하세요.
- 전복의 ‘성별’, ‘키’, ‘지름’, ‘높이’, ‘전체무게’, ‘몸통무게’, ‘내장무게’, ‘껍질무게’를 이용해 ‘껍질의 고리 수’를 예측한 뒤, **예측된 ‘껍질의 고리 수’에 1.5를 더하면 전복의 나이**가 됩니다.

In [1]:
# 기본 모듈 불러오기
import numpy as np
import pandas as pd

**1) 데이터 load 및 변형**

In [20]:
# 데이터 로드
data = pd.read_csv("./abalone.csv")
data.head()
print(data.shape)

# 성별 M은 Male, F는 Female, I는 Infant 이므로 따로 열 만들기
for label in "MFI":
    data[label] = data["Sex"] == label
data.drop('Sex', axis=1, inplace=True)

(4177, 9)


**2) X, y 선택**
: y는 Rings열, X는 Rings열을 제외한 나머지를 선택하되 전부 실수가 되도록 한다.

In [15]:
data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,M,F,I
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,True,False,False
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,True,False,False
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,False,True,False
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,True,False,False
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,False,False,True


In [21]:
# X,y 데이터 선택
y = data['Rings']

X = data.drop(['Rings'], axis=1)
X = X.astype(float)

In [22]:
X.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,M,F,I
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,1.0,0.0,0.0
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,1.0,0.0,0.0
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0.0,1.0,0.0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,1.0,0.0,0.0
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0.0,0.0,1.0


 **3) train/test set 분리**

In [23]:
# 필요한 모듈 불러오기
from sklearn.model_selection import train_test_split

In [24]:
# train과 test set 분리 (train:test = 7:3 비율로)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=0)

**4) 선형회귀모델 생성, 모델 예측치 구하기**

In [26]:
#필요한 모듈 불러오기
from sklearn.linear_model import LinearRegression

In [27]:
#선형회귀모델 생성 및 훈련
lr=LinearRegression()
lr.fit(X_train, y_train)

In [40]:
# 모델 예측치 구하기
y_preds = lr.predict(X_test)
# 모델 예측치를 활용해 최종적으로 전복의 나이를 예측
predicted_age = y_preds+1.5

**5) 모델 평가: MSE, RMSE, R2 score, corr 구하기**

In [29]:
#필요한 모듈 불러오기
from sklearn.metrics import mean_squared_error, r2_score

- MSE, RMSE

In [31]:
#mse, rmse
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

print('MSE:{0:.3f}, RMSE: {1:.3f}'.format(mse, rmse))

MSE:5.007, RMSE: 2.238


- R2 score

In [32]:
#R2 score 측정

print('Variance score: {0:.3f}'.format(r2_score(y_test, y_preds)))

Variance score: 0.525


- 회귀 절편값

In [33]:
#회귀 절편 값
print('절편 값:', lr.intercept_)

절편 값: -72748926071972.25


- 회귀 계수 값

In [34]:
#회귀 계수 값
print('회귀 계수 값:', np.round(lr.coef_, 1))

회귀 계수 값: [-2.00000000e-01  1.13000000e+01  7.20000000e+00  9.10000000e+00
 -2.00000000e+01 -1.08000000e+01  9.70000000e+00  7.27489261e+13
  7.27489261e+13  7.27489261e+13]


- 상관계수

Hint: corr 함수 이용.

In [49]:
# 상관계수 구하기
data.corr(method='spearman')

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,M,F,I
Length,1.0,0.983319,0.888206,0.972633,0.95683,0.952658,0.947926,0.604385,0.238968,0.308737,-0.553044
Diameter,0.983319,1.0,0.895705,0.971324,0.950472,0.948391,0.954149,0.622895,0.243992,0.316493,-0.565927
Height,0.888206,0.895705,1.0,0.915985,0.874196,0.900587,0.921224,0.657716,0.237145,0.320853,-0.563194
Whole weight,0.972633,0.971324,0.915985,1.0,0.97706,0.975252,0.969426,0.630832,0.263649,0.321128,-0.590804
Shucked weight,0.95683,0.950472,0.874196,0.97706,1.0,0.947635,0.91773,0.53942,0.259933,0.292583,-0.558628
Viscera weight,0.952658,0.948391,0.900587,0.975252,0.947635,1.0,0.938143,0.614344,0.257687,0.330057,-0.593521
Shell weight,0.947926,0.954149,0.921224,0.969426,0.91773,0.938143,1.0,0.692475,0.253421,0.323888,-0.582995
Rings,0.604385,0.622895,0.657716,0.630832,0.53942,0.614344,0.692475,1.0,0.218739,0.283812,-0.507429
M,0.238968,0.243992,0.237145,0.263649,0.259933,0.257687,0.253421,0.218739,1.0,-0.512528,-0.522541
F,0.308737,0.316493,0.320853,0.321128,0.292583,0.330057,0.323888,0.283812,-0.512528,1.0,-0.464298


In [50]:
df = pd.DataFrame({'y_pred' : y_preds, 'y_true' : y_test})
cor = df['y_pred'].corr(df['y_true'])
cor

0.7250821752288097

# **2. Polynomial features**

In [38]:
# PolynomialFeatures 라이브러리 호출
from sklearn.preprocessing import PolynomialFeatures

In [39]:
# 임의 데이터 생성

X = np.arange(6).reshape(3, 2)

df =  pd.DataFrame(X)
df.columns = ['x_1','x_2']
df

Unnamed: 0,x_1,x_2
0,0,1
1,2,3
2,4,5


In [45]:
# 차원은 2로 설정
# fit_transform 메소드를 통해 데이터 변환
# PolynomialFeatures로 변환 된 데이터를 데이터 프레임 형태로 변환
poly = PolynomialFeatures(degree=2)
poly.fit(df)
poly_ftr = poly.transform(df)
polyDataFrame=pd.DataFrame(poly_ftr)

In [46]:
polyDataFrame

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,2.0,3.0,4.0,6.0,9.0
2,1.0,4.0,5.0,16.0,20.0,25.0


In [47]:
# df_poly의 컬럼을 1,x1,x2,x1^2,x1*x2,x2^2 로 변경
polyDataFrame.columns = ['1','x1','x2','x1^2','x1*x2','x2^2']
polyDataFrame

Unnamed: 0,1,x1,x2,x1^2,x1*x2,x2^2
0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,2.0,3.0,4.0,6.0,9.0
2,1.0,4.0,5.0,16.0,20.0,25.0
