In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Linear Regression

In [None]:
# Sample without an outlier
x1 = np.array([10, 9, 3, 2])
y1 = np.array([90, 80, 50, 30])

# Same sample with a single outlier (11, 40) appended
x2 = np.append(x1, 11)
y2 = np.append(y1, 40)

In [None]:
# Side-by-side scatter plots: left without the outlier, right with it.
panels = [
    (x1, y1, 'Without Outlier'),
    (x2, y2, 'With Outlier'),
]

plt.figure(figsize=(10, 5))
for position, (xs, ys, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(xs, ys)
    plt.title(heading)
plt.show()

In [None]:
def OLS(x, y):
    """Fit the simple linear regression ``y = w*x + b`` by ordinary least squares.

    Parameters
    ----------
    x, y : array-like
        Predictor and response samples of identical shape. Anything
        ``np.asarray`` can convert (list, tuple, ndarray, ...) is accepted.

    Returns
    -------
    w : float
        Slope: ``sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)``.
    b : float
        Intercept: ``mean(y) - w * mean(x)``.

    Raises
    ------
    ValueError
        If the inputs are empty, have different shapes, or ``x`` has zero
        variance (the slope is undefined in that case).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if x.shape != y.shape:
        raise ValueError(f'x and y must have the same shape, got {x.shape} and {y.shape}')
    if x.size == 0:
        raise ValueError('x and y must not be empty')

    x_dev = x - x.mean()
    sxx = np.sum(x_dev**2)
    # Without this guard a constant x divides by zero and yields nan/inf.
    if sxx == 0:
        raise ValueError('x has zero variance; the slope is undefined')

    w = np.sum(x_dev*(y - y.mean()))/sxx
    b = y.mean() - w*x.mean()
    return w, b

In [None]:
# Slope and intercept for the outlier-free sample; the trailing tuple is the cell output.
fit_clean = OLS(x1, y1)
w1, b1 = fit_clean
w1, b1

In [None]:
# Slope and intercept for the sample containing the outlier; the trailing tuple is the cell output.
fit_outlier = OLS(x2, y2)
w2, b2 = fit_outlier
w2, b2

In [None]:
# Fitted y values on each sample's own x grid (despite the names, these are predictions of y).
x1_pred = b1 + w1*x1
x2_pred = b2 + w2*x2

In [None]:
# Overlay each fitted line on its sample: left without the outlier, right with it.
fits = [
    (x1, y1, x1_pred, w1, b1, 'Without Outlier'),
    (x2, y2, x2_pred, w2, b2, 'With Outlier'),
]

plt.figure(figsize=(10, 5))
for position, (xs, ys, fitted, slope, intercept, heading) in enumerate(fits, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(xs, ys, label='Sample')
    plt.plot(xs, fitted, c='red', label=f'Y={slope:.2f}x+{intercept:.2f}')
    plt.title(heading)
    plt.legend()
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Cross-check the hand-written OLS against scikit-learn on the outlier-free sample.
# x1 is reshaped into a single-column 2-D array before fitting.
model1 = LinearRegression()
model1.fit(x1.reshape(-1, 1), y1)
model1.coef_, model1.intercept_

In [None]:
# Same scikit-learn cross-check for the sample that contains the outlier.
model2 = LinearRegression()
model2.fit(x2.reshape(-1, 1), y2)
model2.coef_, model2.intercept_

In [None]:
# Seed NumPy's global generator so the noisy sample -- and every cost curve,
# split and score derived from it below -- is identical on each re-run.
np.random.seed(0)

X = np.arange(50)
# Linear trend y = 2x plus Gaussian noise scaled by 10; the noise length follows X
# instead of repeating the literal sample count.
Y = (2*X) + 10*np.random.randn(X.size)
plt.plot(X, Y, 'b.')
plt.show()

In [None]:
# Candidate slopes for the intercept-free model h = w*X
w_range = np.arange(0.1, 4.1, 0.1)

# Mean squared error for each candidate slope. np.mean divides by the actual
# number of samples, so the cost stays correct if the size of X changes
# (the original hard-coded 1/50).
costs = [np.mean((w*X - Y)**2) for w in w_range]

plt.plot(w_range, costs, 'r.')
plt.show()

In [None]:
# Cost curve with the tangent line at every candidate slope drawn on one figure.
plt.plot(w_range, costs, 'r.')
for w, cost in zip(w_range, costs):
    # Derivative of the MSE cost mean((w*X - Y)**2) with respect to w.
    # np.mean replaces the hard-coded 2/50 * sum so it follows the size of X.
    gradient = 2*np.mean((w*X - Y)*X)
    # Tangent through (w, cost) with that slope
    plt.plot(w_range, gradient*(w_range - w) + cost)
# The viewport does not depend on the loop variable, so set it once.
plt.axis([0, 4, 0, 3000])
plt.show()

In [None]:
# One figure per candidate slope: the cost curve plus the tangent at that slope.
for w, cost in zip(w_range, costs):
    plt.plot(w_range, costs, 'r.')
    # Derivative of the MSE cost with respect to w; np.mean replaces the
    # hard-coded 2/50 * sum so it follows the size of X.
    gradient = 2*np.mean((w*X - Y)*X)
    plt.plot(w_range, gradient*(w_range - w) + cost)
    plt.axis([0, 4, 0, 3000])
    # show() inside the loop closes the current figure, giving one plot per slope
    plt.show()

In [None]:
# Visualise the noisy sample
plt.scatter(x=X, y=Y)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# 2. Split the data. X is reshaped into a single-column 2-D array, and
#    random_state is fixed so the split is repeatable.
split = train_test_split(X.reshape(-1, 1), Y, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.linear_model import LinearRegression

# 3. Create the model and train it on the training split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# Display the fitted regression coefficient(s) and intercept
lr.coef_, lr.intercept_

In [None]:
# 4. Prediction and evaluation
# Show the regression coefficients and intercept. A bare expression in the
# middle of a cell is evaluated and discarded, so print them explicitly.
print(f'coef : {lr.coef_}, intercept : {lr.intercept_}')

# score() reports R^2
train_sc = lr.score(X_train, y_train)
test_sc = lr.score(X_test, y_test)

print(f'train : {train_sc:.3f}, test : {test_sc:.3f}')

In [None]:
# Predictions for the held-out test split (displayed as the cell output)
y_pred = lr.predict(X=X_test)
y_pred

In [None]:
# If mglearn is not installed, uncomment the line below and run this cell.
# (%pip, unlike !pip, installs into the environment of the running kernel.)
# %pip install mglearn

### 회귀(일반 선형 회귀)

In [None]:
import mglearn

# Load mglearn's "wave" data set with 60 samples.
# NOTE: this rebinds X (previously the np.arange sample) and introduces y.
wave = mglearn.datasets.make_wave(n_samples=60)
X, y = wave
X.shape, y.shape

In [None]:
# Visualise the wave data set
plt.scatter(x=X, y=y)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# 2. Split the data (random_state fixed so the split is repeatable)
split = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.linear_model import LinearRegression

# 3. Create the model and train it on the training split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# Display the fitted regression coefficient(s) and intercept
lr.coef_, lr.intercept_

In [None]:
# 4. Prediction and evaluation
# Show the regression coefficients and intercept. A bare expression in the
# middle of a cell is evaluated and discarded, so print them explicitly.
print(f'coef : {lr.coef_}, intercept : {lr.intercept_}')

# score() reports R^2
train_sc = lr.score(X_train, y_train)
test_sc = lr.score(X_test, y_test)

print(f'train : {train_sc:.3f}, test : {test_sc:.3f}')

In [None]:
# Predictions for the held-out test split (displayed as the cell output)
y_pred = lr.predict(X=X_test)
y_pred

In [None]:
# Visualise the samples together with the fitted regression line
fitted_line = lr.predict(X)
plt.plot(X, y, 'o')
plt.plot(X, fitted_line)
plt.show()

### 선형회귀를 이용한 주택가격 예측

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# 1. Load the data and preview the first three rows
boston_df = pd.read_csv('../data/boston.csv')
boston_df.head(n=3)

In [None]:
boston_df.info()
# Correlation analysis: coefficient of every column with the target MEDV
# (values lie between -1 and 1), listed from largest to smallest
medv_corr = boston_df.corr()['MEDV']
medv_corr.sort_values(ascending=False)

In [None]:
# Heatmap of the full correlation matrix; seaborn takes the table-shaped data directly
corr_matrix = boston_df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f')
plt.show()

In [None]:
# 2. Split the data

# Build the feature table by dropping the target by name rather than by
# position, so MEDV can never end up in X if the column order changes.
X = boston_df.drop(columns='MEDV')
y = boston_df['MEDV']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=0)

In [None]:
# Create the model and train it on the training split
lr2 = LinearRegression()
lr2.fit(X=X_train, y=y_train)

In [None]:
# Show the regression coefficients and intercept. A bare expression in the
# middle of a cell is evaluated and discarded, so print them explicitly.
print(f'coef : {lr2.coef_}')
print(f'intercept : {lr2.intercept_}')

# Predictions on the test split (reused by the metrics cell below)
y_pred = lr2.predict(X_test)

# Check predictive performance; score() reports R^2
train_sc = lr2.score(X_train, y_train)
test_sc = lr2.score(X_test, y_test)
print(f'train : {train_sc:.3f}, test : {test_sc:.3f}')

In [None]:
# Regression metrics on the test split; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)
print(f'r2:{r2_sc:.3f}, mae:{mae:.3f}, mse:{mse:.3f}, rmse:{rmse:.3f}')

#### 일차 방정식만으로는 성능이 낮음 → 고차 방정식으로 변형 : PolynomialFeatures

In [None]:
# Helper that returns a data set with scaling and polynomial expansion applied
def convertDataSet(X, y, deg=2):
    """Min-max scale ``X``, expand it into polynomial features, and return it with ``y``.

    Parameters
    ----------
    X : feature table accepted by ``MinMaxScaler.fit_transform``
    y : target values; passed through unchanged
    deg : int, default 2
        Degree handed to ``PolynomialFeatures``.

    Returns
    -------
    tuple
        ``(X_sc_poly, y)`` where ``X_sc_poly`` is the scaled, expanded feature matrix.

    Notes
    -----
    Both transformers are fitted on exactly the ``X`` passed in.
    """
    # Normalise the features first ...
    scaled = MinMaxScaler().fit_transform(X)
    # ... then apply the polynomial expansion to the normalised features
    expander = PolynomialFeatures(degree=deg)
    expanded = expander.fit_transform(scaled)
    # Debug aid: expander.get_feature_names_out() lists the generated feature names
    return expanded, y

In [None]:
# 2. Split the data

# Drop the target by name so MEDV cannot end up among the features.
X = boston_df.drop(columns='MEDV')
y = boston_df['MEDV']

# Split the raw rows FIRST. Fitting the scaler / polynomial expansion on the
# whole data set before splitting lets test-set statistics (min/max) leak into
# training and inflates the test score.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y,
                                                            random_state=0)

# Fit the transformers on the training rows only, then apply them to both splits.
scaler = MinMaxScaler().fit(X_train_raw)
polyF = PolynomialFeatures(degree=2).fit(scaler.transform(X_train_raw))

X_train = polyF.transform(scaler.transform(X_train_raw))
X_test = polyF.transform(scaler.transform(X_test_raw))

In [None]:
# Create the model and train it on the polynomial training features
lr2 = LinearRegression()
lr2.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the test split (reused by the metrics cell below). The bare
# `y_pred` expression that followed was discarded mid-cell, so it is removed.
y_pred = lr2.predict(X_test)

# Check predictive performance; score() reports R^2
train_sc = lr2.score(X_train, y_train)
test_sc = lr2.score(X_test, y_test)
print(f'train : {train_sc:.3f}, test : {test_sc:.3f}')

In [None]:
# Regression metrics on the test split; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)
print(f'r2:{r2_sc:.3f}, mae:{mae:.3f}, mse:{mse:.3f}, rmse:{rmse:.3f}')

### Ridge를 적용한 과적합 개선

In [None]:
# Ridge regression with regularisation strength alpha = 1.0
ridge = Ridge(alpha=1.0)
ridge.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the Ridge model on the training and test splits
train_sc, test_sc = ridge.score(X_train, y_train), ridge.score(X_test, y_test)
print(f'train : {train_sc:.3f}, test : {test_sc:.3f}')

In [None]:
# Tune performance by varying alpha: refit Ridge per value and compare R^2.
# After the loop, `ridge` holds the model for the last alpha in the list.
alphaL = [0.01, 0.5, 1, 5, 10]

for a in alphaL:
    ridge = Ridge(alpha=a).fit(X_train, y_train)
    train_sc, test_sc = ridge.score(X_train, y_train), ridge.score(X_test, y_test)
    print(f'alpha={a}, train : {train_sc:.3f}, test : {test_sc:.3f}')

### Lasso를 적용한 과적합 개선

In [None]:
# Lasso regression with alpha = 0.01 and an iteration cap of 10000
lasso = Lasso(max_iter=10000, alpha=0.01)
lasso.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the Lasso model on the training and test splits
train_sc, test_sc = lasso.score(X_train, y_train), lasso.score(X_test, y_test)
print(f'train : {train_sc:.3f}, test : {test_sc:.3f}')

In [None]:
# Display the fitted Lasso coefficients
lasso.coef_