In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
def stepwise(X, y, alpha_in=0.1, alpha_out=0.15):
    """Select regressors by bidirectional (forward/backward) stepwise OLS.

    Every pass of the loop performs at most one forward step and one
    backward step:

    * forward  -- each column not yet selected is added, one at a time, to
      the current model; the column whose coefficient has the smallest
      p-value is kept if that p-value is below ``alpha_in``.
    * backward -- the current model is refitted and the selected column with
      the largest p-value is removed if that p-value exceeds ``alpha_out``.

    The loop stops on the first pass in which neither step changes the
    selection. Each add/drop is reported with ``print``.

    Args:
        X: DataFrame whose columns are all candidate explanatory variables.
        y: Response variable, passed to ``sm.OLS`` as the dependent variable.
        alpha_in: Significance level a variable must beat to enter the model.
        alpha_out: Significance level above which a variable is removed.

    Returns:
        list: Labels of the selected columns, in the order they were added.

    Note:
        Nothing in the loop prevents a variable from being added and dropped
        again on later passes; choosing ``alpha_in <= alpha_out`` avoids the
        simplest such cycle (a variable removed right after being added).
    """
    included = []
    while True:
        changed = False

        # Forward step. Candidates are taken in X's column order (instead of
        # from a set difference) so ties are broken reproducibly.
        excluded = [col for col in X.columns if col not in included]
        if excluded:
            # An explicit float dtype is required: without it recent pandas
            # builds an object Series, on which idxmin() is not supported.
            p_val = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                model = sm.OLS(
                    y, sm.add_constant(X[included + [new_column]])
                ).fit()
                p_val.loc[new_column] = model.pvalues.loc[new_column]
            min_pval = p_val.min()
            # A NaN minimum (no usable p-value) compares False and adds nothing.
            if min_pval < alpha_in:
                changed = True
                add_feature = p_val.idxmin()
                included.append(add_feature)
                print("Add {:20} with p_value   {:.6}".format(add_feature,min_pval))

        # Backward step. Skipped while nothing is selected: there would be
        # no coefficient to test.
        if included:
            model = sm.OLS(y, sm.add_constant(X[included])).fit()
            # Pick the p-values by label rather than by "everything after the
            # first row": add_constant does not prepend a constant when the
            # design already contains one, so position 0 may be a real feature.
            p_val = model.pvalues.loc[included]
            max_pval = p_val.max()
            if max_pval > alpha_out:
                changed = True
                drop_feature = p_val.idxmax()
                included.remove(drop_feature)
                print("Drop {:20} with p_value   {:.6}".format(drop_feature,max_pval))

        if not changed:
            break
    return included


In [None]:
# Forward stepwise model selection that maximizes the adjusted R-squared
def forward_selection(X, y):
    """Greedy forward selection of regressors by adjusted R-squared.

    Starting from no regressors, each pass fits one OLS model (with a
    constant added via ``sm.add_constant``) per remaining column, and adds
    the column whose model has the highest adjusted R-squared -- but only if
    that value is strictly greater than the best value reached so far.
    Selection stops when no remaining column improves the score or when no
    columns remain. Each addition is reported with ``print``.

    Args:
        X: DataFrame whose columns are all candidate explanatory variables.
        y: Response variable, passed to ``sm.OLS`` as the dependent variable.

    Returns:
        list: Labels of the selected columns, in the order they were added.
    """
    included = []
    # Score to beat; 0.0 is the adjusted R-squared of an intercept-only fit.
    current_R = 0.0
    while True:
        # Candidates in X's column order so ties are broken reproducibly.
        excluded = [col for col in X.columns if col not in included]
        if not excluded:
            # Every column has been selected (or X has none): stop here
            # instead of calling idxmax() on an empty Series, which raises.
            break

        # An explicit float dtype is required: without it recent pandas
        # builds an object Series, on which idxmax() is not supported.
        adj_R = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            adj_R.loc[new_column] = model.rsquared_adj

        best_R = adj_R.max()
        # Written as "not >" so a NaN best score also ends the search.
        if not best_R > current_R:
            break

        add_feature = adj_R.idxmax()
        included.append(add_feature)
        current_R = best_R
        print("Add {}".format(add_feature))
    return included
