In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('ggplot')



from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import Imputer

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
import lightgbm as lgb


In [7]:
# The CSV headers are in Chinese; this maps the shared feature columns to the
# English names used by the rest of the notebook.
column_names = {
    '时间': 'time',
    '小区名': 'community',
    '小区房屋出租数量': 'house_num',
    '楼层': 'floor',
    '总楼层': 'total_floor',
    '房屋面积': 'area',
    '房屋朝向': 'orientation',
    '居住状态': 'status',
    '卧室数量': 'bedroom_num',
    '厅的数量': 'livingroom_num',
    '卫的数量': 'bathroom_num',
    '出租方式': 'method',
    '区': 'Urban',
    '位置': 'location',
    '地铁线路': 'underground',
    '地铁站点': 'underground_station',
    '距离': 'distance',
    '装修情况': 'Decoration',
}

# Training data additionally carries the target column, renamed to 'price'.
file = './train.csv'
train = pd.read_csv(file).rename(columns={**column_names, '月租金': 'price'})

# Test data keeps its 'id' column name as-is.
file1 = './test.csv'
test1 = pd.read_csv(file1).rename(columns=column_names)

# Keep the ids aside for the submission file, then remove them from the features.
id1 = test1.id
test1 = test1.drop(columns=['id'])

# Split the training columns by dtype: non-object columns vs. object columns.
quantity = [attr for attr, dtype in train.dtypes.items() if dtype != 'object']  # numeric variables
quality = [attr for attr, dtype in train.dtypes.items() if dtype == 'object']  # categorical variables
def encode(frame, test, feature):
    '''
    Ordinal-encode a categorical column by the rank of its mean price.

    Every distinct value of ``frame[feature]`` is ranked by the mean of
    ``frame['price']`` over the rows holding that value (rank 1 = lowest mean
    price). The rank is written to a new column named ``feature + '_E'`` in
    both ``frame`` and ``test``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Training data; must contain ``feature`` and ``'price'``.
    test : pandas.DataFrame
        Data to encode with the ranking learned from ``frame``; must contain
        ``feature``.
    feature : str
        Name of the categorical column to encode.

    Returns
    -------
    None
        Both ``frame`` and ``test`` are modified in place. Rows whose value
        matches none of the ranked values receive no rank.
    '''
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    # groupby(feature).mean() yields the mean price per distinct value; the
    # assignment aligns it with `ordering` on the value index.
    ordering['price_mean'] = frame[[feature, 'price']].groupby(feature).mean()['price']
    ordering = ordering.sort_values('price_mean')
    ordering['order'] = range(1, ordering.shape[0] + 1)
    ordering = ordering['order'].to_dict()
    for attr_v, score in ordering.items():
        frame.loc[frame[feature] == attr_v, feature + '_E'] = score
        # Encode the frame passed in as `test`, so the function works on its
        # arguments only and does not depend on any module-level variable.
        test.loc[test[feature] == attr_v, feature + '_E'] = score

# Rank-encode every object-dtype column listed in `quality`; encode() adds a
# '<name>_E' column to both frames. (With one-hot encoding these columns would
# go through pd.get_dummies() instead.)
for q in quality:
    encode(train, test1, q)
quality_encoded = [q + '_E' for q in quality]

# The encoded columns replace the raw categorical ones, so drop the originals.
train.drop(columns=quality, inplace=True)
test1.drop(columns=quality, inplace=True)
print(quality_encoded, '\n{} qualitative attributes have been encoded.'.format(len(quality_encoded)))

['orientation_E'] 
1 qualitative attributes have been encoded.


In [8]:
# Columns imputed with their own column mean, and columns imputed with 0.
mean_filled = ['house_num', 'Urban', 'location']
zero_filled = ['status', 'Decoration', 'method', 'distance', 'underground', 'underground_station']

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# `frame[col]` mutates a temporary selection, which does not update the frame
# when pandas Copy-on-Write is active.
for frame in (train, test1):
    for d in mean_filled:
        # Each frame is filled with its own column mean.
        frame[d] = frame[d].fillna(frame[d].mean())
    for d in zero_filled:
        frame[d] = frame[d].fillna(0)

# Test rows with no rank from encode() have NaN in the encoded column;
# fill those with the mean rank of the test set.
test1['orientation_E'] = test1['orientation_E'].fillna(test1['orientation_E'].mean())

In [9]:
class AverageWeight(BaseEstimator, RegressorMixin):
    """Regressor that blends several base regressors with fixed weights.

    Parameters
    ----------
    mod : sequence of estimators
        Base regressors. They are cloned in ``fit``, so the objects passed in
        are never fitted themselves.
    weight : sequence of float
        One multiplier per entry of ``mod``, in the same order. The weights
        are used as given (they are not normalised).

    Attributes
    ----------
    models_ : list
        Fitted clones of ``mod``, created by ``fit``.
    """

    def __init__(self, mod, weight):
        self.mod = mod
        self.weight = weight

    def fit(self, X, y):
        """Fit a fresh clone of every base model on ``(X, y)`` and return ``self``.

        Raises
        ------
        ValueError
            If ``weight`` and ``mod`` do not have the same length.
        """
        # Fail loudly on a mismatch rather than silently ignoring the surplus
        # models or weights when blending.
        if len(self.mod) != len(self.weight):
            raise ValueError(
                "weight must have one entry per model: got {} weights for {} models".format(
                    len(self.weight), len(self.mod)
                )
            )
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted sum of the base models' predictions as a list.

        Assumes every base model's ``predict`` returns a 1-D array with one
        value per row of ``X`` — TODO confirm for multi-output regressors.
        """
        # Rows = models, columns = samples.
        pred = np.array([model.predict(X) for model in self.models_])
        # Column vector so each model's row is scaled by its own weight.
        weights = np.asarray(self.weight, dtype=float).reshape(-1, 1)
        # For every sample: sum over models of prediction * weight.
        return list((pred * weights).sum(axis=0))

In [10]:
# Base learners for the weighted blend.
# random_state pins ExtraTrees' randomised tree construction so that re-running
# the notebook reproduces the same predictions.
etr = ExtraTreesRegressor(max_depth=1000, min_samples_split=4, n_jobs=-1, random_state=20)
xg = XGBRegressor(max_depth=6, n_estimators=19000, n_jobs=-1)

In [11]:
# Target is the 'price' column; the feature matrix is every other column.
# NOTE(review): despite the names, no scaling or log transform is applied to
# these in the cells visible here.
y_log = train['price']
X_scaled = train.drop(columns=['price'])

In [12]:
# Blend of the two base learners: ExtraTrees weighted 0.6, XGBoost weighted 0.4.
weight_avg = AverageWeight(
    mod=[etr, xg],
    weight=[0.6, 0.4],
)


In [13]:
# Fit clones of both base learners on the full training set.
weight_avg.fit(X=X_scaled, y=y_log)


AverageWeight(mod=[ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=1000,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=4,
          min_weight_fraction_leaf=0.0, n_estimators='warn',...,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)],
       weight=[0.6, 0.4])

In [14]:
# Blended predictions for the test set.
p = weight_avg.predict(test1)

# Build the submission frame directly from the ids and predictions. The ids are
# taken positionally (to_numpy) so they pair with `p` row by row, and `id1`
# itself is left unmodified.
sp = pd.DataFrame({'id': id1.to_numpy(), 'price': p})
sp.to_csv('try.csv', index=False)

# Read the file back to check what was actually written.
d = pd.read_csv('try.csv')
print(d)

          id      price
0          1   4.379525
1          2   5.960286
2          3  12.768639
3          4   5.534577
4          5   4.991634
5          6  10.937753
6          7   9.281098
7          8   3.871248
8          9   8.759528
9         10   6.007465
10        11   5.453805
11        12   9.290493
12        13   3.851166
13        14   7.313627
14        15   7.186250
15        16  10.938377
16        17   6.718289
17        18   5.578862
18        19   7.775946
19        20   5.455036
20        21   4.134382
21        22   6.072959
22        23   7.284374
23        24   6.007118
24        25   7.819614
25        26   4.927101
26        27   4.539472
27        28   2.579838
28        29   7.799630
29        30   2.975708
...      ...        ...
56249  57970   4.137321
56250  57971   6.363210
56251  57973   7.039272
56252  57974   8.506850
56253  57975   8.913214
56254  57976  13.055045
56255  57977   6.486172
56256  57978   5.235909
56257  57979   5.380841
56258  57980   9

In [31]:
# ExtraTrees-only predictions on the test set. AverageWeight.fit() trains
# clones of the base models, so `etr` itself has not been fitted at this point;
# use the fitted ExtraTrees clone (first entry of `mod`) instead.
etr_y_predict = weight_avg.models_[0].predict(test1)
print(etr_y_predict)

[ 4.24312394  5.76952462 12.13234295 ...  7.4339983  12.85186757
  4.03735144]


In [20]:
# define cross validation strategy
def rmse_cv(model, X, y):
    """Return the RMSE of ``model`` on each fold of a 5-fold cross-validation.

    ``cross_val_score`` reports the negated mean squared error per fold, so
    the sign is flipped before taking the square root.
    """
    neg_mse_per_fold = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(-neg_mse_per_fold)

In [22]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable. train_test_split comes from
# sklearn.model_selection and must be imported with the other sklearn imports.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_log, test_size=0.2, random_state=20
)

In [36]:


# Hold-out check of ExtraTrees on its own: fit on the training split only.
# Note that this rebinds the name `etr` to a new, fitted estimator.
etr = ExtraTreesRegressor(
    max_depth=None, min_samples_split=4, n_jobs=-1, random_state=20
).fit(X_train, y_train)
etr_y_predict = etr.predict(X_test)

# Mean squared error on the held-out rows (shown as the cell output).
mean_squared_error(y_test, etr_y_predict)

1.7255012351732348