#### **To Do : Adjusted R-square를 이용해 후진 제거법 알고리즘 구현해보기**

In [10]:
# 필요 라이브러리 import

from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [11]:
# Load the diabetes dataset bundled with scikit-learn
data = datasets.load_diabetes()

In [12]:
# Pull the 'data' and 'target' entries out of the loaded dataset and
# print their shapes as a sanity check
x, y = data['data'], data['target']

print(x.shape, y.shape)

(442, 10) (442,)


In [13]:
# Split the data 6:2:2 into train / validation / test sets.
# The first call holds out 20% as the test set; the second call takes 25% of
# the remaining 80% (i.e. 20% of the whole) as the validation set.
# random_state is fixed in both calls so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=1)

print(x_train.shape, x_val.shape, x_test.shape)

(264, 10) (89, 10) (89, 10)


In [14]:
# Backward elimination driven by the validation adjusted R-square.
# Starting from the model that uses every variable, each round drops the one
# variable whose removal gives the highest validation adjusted R-square, and
# the search stops as soon as no removal beats the current best score.


def _adjusted_r2(r2, n_samples, n_features):
    """Return the adjusted R-square for `r2` with the given sample/predictor counts."""
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)


def _fit_and_score(variables):
    """Fit a LinearRegression on the given columns and score it on validation.

    Returns a (fitted model, validation adjusted R-square) pair.
    """
    model = LinearRegression()
    model.fit(x_train[:, variables], y_train)
    r2 = r2_score(y_val, model.predict(x_val[:, variables]))
    return model, _adjusted_r2(r2, y_val.shape[0], len(variables))


# Baseline: score the full model first, so that round 1 is compared against
# "remove nothing" rather than an arbitrary 0, and so that best_model is
# always a fitted model when the final test evaluation runs.
best_variables = list(range(x_train.shape[1]))
best_model, best_adj_r2 = _fit_and_score(best_variables)

round_no = 0
# Keep at least one variable: LinearRegression rejects an input with zero
# feature columns, so the last remaining variable is never a removal candidate.
while len(best_variables) > 1:
    round_no += 1
    print(f"============ round {round_no} ============")

    # One candidate per selected variable: the model trained without it.
    # Candidates are in the same order as best_variables, so the winning
    # position can be used directly as the index to pop.
    candidates = [
        _fit_and_score([v for v in best_variables if v != var])
        for var in best_variables
    ]
    adj_r2_of_this_round = [score for _, score in candidates]

    # Pick the removal with the highest adjusted R-square (first one on ties)
    max_var = int(np.argmax(adj_r2_of_this_round))

    # Compare with the best score before this round
    if adj_r2_of_this_round[max_var] > best_adj_r2:
        best_variables.pop(max_var)
        best_model, best_adj_r2 = candidates[max_var]

        print('best variables updated: ', best_variables)
        print('current best r2: ', best_adj_r2)

    # Stop once no removal improves the score
    else:
        print("no improvement")
        break

print('---------------------------------------------------')
print('final variables: ', sorted(best_variables))
print('final adj_r2: ', best_adj_r2)

print('---------------------------------------------------')
# best_model was fit on the columns in best_variables order, so the test
# matrix is sliced with the same (unsorted) list.
test_r2 = r2_score(y_test, best_model.predict(x_test[:, best_variables]))
test_adj_r2 = _adjusted_r2(test_r2, y_test.shape[0], len(best_variables))
print('test adjust R-square: ', test_adj_r2)


best variables updated:  [0, 1, 2, 3, 4, 5, 6, 8, 9]
current best r2:  0.44968750331058116
best variables updated:  [0, 1, 2, 3, 4, 5, 6, 8]
current best r2:  0.468224109131373
best variables updated:  [0, 1, 2, 3, 5, 6, 8]
current best r2:  0.47847527149248903
best variables updated:  [0, 1, 2, 3, 6, 8]
current best r2:  0.49917040260293344
best variables updated:  [1, 2, 3, 6, 8]
current best r2:  0.5052771304258854
no improvement
---------------------------------------------------
final variables:  [1, 2, 3, 6, 8]
final adj_r2:  0.5052771304258854
---------------------------------------------------
test adjust R-square:  0.3860227499500407
