In [1]:
import pandas
df = pandas.read_csv('Data/house-prices.csv')
df.head()

Unnamed: 0,Home,Price,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood
0,1,114300,1790,2,2,2,No,East
1,2,114200,2030,4,2,3,No,East
2,3,114800,1740,3,2,1,No,East
3,4,94700,1980,3,2,3,No,East
4,5,119800,2130,3,3,3,No,East


In [2]:
house = pandas.concat([df, pandas.get_dummies(df['Brick']), pandas.get_dummies(df['Neighborhood'])], axis = 1)
del house['No']
del house['West']
del house['Brick']
del house['Neighborhood']
del house['Home']
house.head()

Unnamed: 0,Price,SqFt,Bedrooms,Bathrooms,Offers,Yes,East,North
0,114300,1790,2,2,2,False,True,False
1,114200,2030,4,2,3,False,True,False
2,114800,1740,3,2,1,False,True,False
3,94700,1980,3,2,3,False,True,False
4,119800,2130,3,3,3,False,True,False


In [3]:
X = house[['SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Yes', 'East', 'North']]
Y = house['Price'].values

#### 最小二乘回归（OLS）每个值与平均值的差的平方的总和，越小拟合的就越好
#### sm.add_constant(X) 的作用是为独立变量矩阵添加一个常数列，以考虑回归模型中的截距项
#### est2.summary() 打印出包含回归模型的各种统计信息和摘要

In [None]:
import statsmodels.api as sm
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

#### predictorcols是参与拟合的自变量的所有元素
#### itertools.combinations(predictorcols, 1)对这些变量进行组合，1换成k，就是每种组合所含变量的个数

In [None]:
predictorcols = ['SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Yes', 'East', 'North']
import itertools
for variables in itertools.combinations(predictorcols, 1):
    print(variables)

#### 就是每种组合的准确率，aic越小，准确度越高

#### X[list(variables)]从X = house[['SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Yes', 'East', 'North']]选择数据
#### list(variables)为['SqFt', 'Bedrooms'],即x[['SqFt', 'Bedrooms']]

In [None]:
import itertools
AICs = {}
for k in range(1,len(predictorcols)+1):
    for variables in itertools.combinations(predictorcols, k):
        predictors  = X[list(variables)]
        predictors2 = sm.add_constant(predictors)
        est = sm.OLS(Y, predictors2)
        res = est.fit()
        AICs[variables] = res.aic    

#### 利用Counter(AICs)将每种组合的aic计数

In [None]:
from collections import Counter
c = Counter(AICs)
#c.most_common()
c.most_common()[::-10]