(URL) https://zephyrus1111.tistory.com/65

#### Variable Selection with Python

* 전진 선택법 (Forward Selection)

* 후진 소거법 (Backward Elimination)

* 단계별 선택법 (Forward Stepwise Selection)

In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
 
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/surgical_unit.csv') ## load the data

  import pandas.util.testing as tm


In [None]:
## Print the column names, non-null counts, and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Blood_Clotting_Score  54 non-null     float64
 1   Prognostic_Index      54 non-null     int64  
 2   Enzyme_Test           54 non-null     int64  
 3   Liver_Test            54 non-null     float64
 4   Age                   54 non-null     int64  
 5   Gender                54 non-null     int64  
 6   Alc_Mod               54 non-null     int64  
 7   Alc_Heavy             54 non-null     int64  
 8   Survival_Time         54 non-null     int64  
 9   Log_ST                54 non-null     float64
dtypes: float64(3), int64(7)
memory usage: 4.3 KB


In [None]:
## (number of rows, number of columns) of the loaded frame.
df.shape

(54, 10)

In [None]:
## Display 10 randomly sampled rows (no seed is set, so the rows differ per run).
df.sample(10)

Unnamed: 0,Blood_Clotting_Score,Prognostic_Index,Enzyme_Test,Liver_Test,Age,Gender,Alc_Mod,Alc_Heavy,Survival_Time,Log_ST
50,6.6,77,46,1.95,50,0,1,0,405,6.005
27,11.2,76,90,5.59,58,1,0,1,1965,7.583
43,6.5,56,77,2.85,41,0,1,0,538,6.288
44,3.4,77,93,1.48,69,0,1,0,482,6.178
29,5.8,76,59,2.58,61,1,1,0,600,6.396
8,6.0,67,93,2.5,58,0,1,0,1056,6.962
39,5.4,52,88,1.81,40,1,0,0,705,6.558
33,5.8,72,93,3.3,39,1,0,1,1037,6.944
15,7.4,74,68,2.4,64,1,1,0,809,6.695
9,3.7,76,94,2.4,48,0,1,0,968,6.875


In [None]:
## Forward selection
## Start from an intercept-only model and, at each step, add the candidate
## variable whose coefficient has the smallest p-value, as long as that
## p-value is below `sl_enter`.
variables = df.columns[:-2].tolist() ## candidate explanatory variables (all but the last two columns)

y = df['Survival_Time'] ## response variable
selected_variables = [] ## variables selected so far, in order of entry
sl_enter = 0.05 ## a candidate enters only if its p-value is below this level

sv_per_step = [] ## snapshot of the selected variables after each step
adjusted_r_squared = [] ## adjusted R-squared after each step
steps = [] ## step numbers (1, 2, ...)
step = 0
## Stop explicitly once every candidate has been selected instead of relying
## on a NaN comparison against an empty p-value Series.
while len(selected_variables) < len(variables):
    ## Keep the original column order so that ties are broken deterministically
    ## (a set difference would yield an arbitrary order).
    remainder = [col for col in variables if col not in selected_variables]
    ## Explicit float dtype: without it pandas warns, and newer versions default
    ## an empty Series to object dtype, which breaks idxmin().
    pval = pd.Series(index=remainder, dtype=float) ## p-value of each candidate
    fits = {} ## fitted model per candidate, reused below for adjusted R-squared
    ## Fit one linear model per candidate: the already selected variables
    ## plus that single candidate.
    for col in remainder:
        X = df[selected_variables + [col]]
        X = sm.add_constant(X)
        model = sm.OLS(y, X).fit()
        fits[col] = model
        pval[col] = model.pvalues[col]

    min_pval = pval.min()
    if min_pval < sl_enter: ## the best candidate is significant: add it
        best = pval.idxmin()
        selected_variables.append(best)

        step += 1
        steps.append(step)
        ## The candidate fit used exactly these columns in this order, so its
        ## adjusted R-squared equals that of a refit on `selected_variables`.
        adj_r_squared = fits[best].rsquared_adj
        adjusted_r_squared.append(adj_r_squared)
        sv_per_step.append(selected_variables.copy())
    else:
        break

In [None]:
## Show the variables chosen by forward selection, in order of entry.
selected_variables

In [None]:
## Plot the adjusted R-squared recorded at each selection step.
fig = plt.figure(figsize=(10,10))
fig.set_facecolor('white')

font_size = 15
## One tick label per step: the step number followed by the variables
## selected as of that step, one per line.
tick_labels = [
    f'step {s}\n' + '\n'.join(chosen)
    for s, chosen in zip(steps, sv_per_step)
]
plt.xticks(steps, tick_labels, fontsize=12)
plt.plot(steps, adjusted_r_squared, marker='o')

plt.ylabel('Adjusted R Squared', fontsize=font_size)
plt.grid(True)
plt.show()

In [None]:
## Backward elimination
## Start from the model with every candidate variable and, at each step,
## drop the variable with the largest p-value while that p-value is at
## least `sl_remove`.
variables = df.columns[:-2].tolist() ## candidate explanatory variables (all but the last two columns)

y = df['Survival_Time'] ## response variable
## Work on a copy: removing from an alias would also shrink `variables`.
selected_variables = variables.copy() ## initially every variable is selected
sl_remove = 0.05 ## a variable is removed if its p-value is at or above this level

sv_per_step = [] ## snapshot of the remaining variables after each step
adjusted_r_squared = [] ## adjusted R-squared after each step
steps = [] ## step numbers (1, 2, ...)
step = 0
while len(selected_variables) > 0:
    X = sm.add_constant(df[selected_variables])
    ## Select the predictors' p-values by label; this leaves out the intercept
    ## without assuming it sits at position 0.
    p_vals = sm.OLS(y, X).fit().pvalues[selected_variables]
    max_pval = p_vals.max() ## largest p-value among the remaining variables
    if max_pval >= sl_remove: ## least significant variable is not significant: drop it
        remove_variable = p_vals.idxmax()
        selected_variables.remove(remove_variable)

        step += 1
        steps.append(step)
        adj_r_squared = sm.OLS(y, sm.add_constant(df[selected_variables])).fit().rsquared_adj
        adjusted_r_squared.append(adj_r_squared)
        sv_per_step.append(selected_variables.copy())
    else:
        break

In [None]:
## Show the variables that remain after backward elimination.
selected_variables