# 任务概述
当对数据和特征做了一些常规处理后，可以采取一些模型对最初的目标进行模型拟合；但模型种类较多，且参数众多，需要选择合适的模型也是一件比较费时的事情。这也是本阶段的主要任务。
* 模型筛选
* 特征重构
* 模型选定

In [5]:
# 加载包
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')  #忽略警告
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest

In [3]:
# 加载数据
data_train = pd.read_csv('train_data.csv')
data_test = pd.read_csv('test_a.csv')
# 数据预览
data_train.info(),data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41440 entries, 0 to 41439
Data columns (total 51 columns):
ID                    41440 non-null int64
area                  41440 non-null float64
rentType              41440 non-null object
houseType             41440 non-null object
houseFloor            41440 non-null object
totalFloor            41440 non-null int64
houseToward           41440 non-null object
houseDecoration       41440 non-null object
communityName         41440 non-null object
city                  41440 non-null object
region                41440 non-null object
plate                 41440 non-null object
buildYear             41440 non-null object
saleSecHouseNum       41440 non-null int64
subwayStationNum      41440 non-null int64
busStationNum         41440 non-null int64
interSchoolNum        41440 non-null int64
schoolNum             41440 non-null int64
privateSchoolNum      41440 non-null int64
hospitalNum           41440 non-null int64
drugStoreNum       

(None, None)

## 清洗训练集

In [6]:
# 清洗目标数据中的异常值：利用数据本身规律清洗
def IF_drop(train):
    IForest = IsolationForest(contamination=0.01)
    IForest.fit(train["tradeMoney"].values.reshape(-1,1))
    y_pred = IForest.predict(train["tradeMoney"].values.reshape(-1,1))
    drop_index = train.loc[y_pred==-1].index
    print(drop_index)
    train.drop(drop_index,inplace=True)
    return train

data_train = IF_drop(data_train)


Int64Index([  108,   146,   228,   230,   255,   277,   310,   319,   348,
              369,
            ...
            38753, 38761, 39141, 39164, 39193, 39256, 39307, 39432, 41210,
            41241],
           dtype='int64', length=364)


In [10]:
# 1)清洗不符合业务逻辑的数据
def dropData(train):
    # 丢弃部分异常值
    train = train[train.area <= 200]
    train = train[(train.tradeMoney <=16000) & (train.tradeMoney >=700)]
    train.drop(train[(train['totalFloor'] == 0)].index, inplace=True)
    return train  

data_train = dropData(data_train)

In [11]:
# 2)清洗不符合业务逻辑的数据
def cleanData(data):
    data.drop(data[(data['region']=='RG00001') & (data['tradeMoney']<1000)&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['tradeMoney']>25000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>250)&(data['tradeMoney']<20000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>400)&(data['tradeMoney']>50000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>100)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00002') & (data['area']<100)&(data['tradeMoney']>60000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['area']<300)&(data['tradeMoney']>30000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<500)&(data['area']<50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<1500)&(data['area']>100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<2000)&(data['area']>300)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']>5000)&(data['area']<20)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['area']>600)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['tradeMoney']<1000)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']<200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']<2000)&(data['area']>180)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>50000)&(data['area']<200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['area']>200)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00007') & (data['area']>100)&(data['tradeMoney']<2500)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['area']>200)&(data['tradeMoney']>25000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['area']>400)&(data['tradeMoney']<15000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']<3000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']>7000)&(data['area']<75)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']>12500)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['area']>400)&(data['tradeMoney']>20000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008') & (data['tradeMoney']<2000)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['area']>300)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['area']>100)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00011') & (data['tradeMoney']<10000)&(data['area']>390)].index,inplace=True)
    data.drop(data[(data['region']=='RG00012') & (data['area']>120)&(data['tradeMoney']<5000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']<100)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']>400)&(data['tradeMoney']>50000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']>80)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['area']>300)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<1300)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<8000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<1000)&(data['area']>20)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']>25000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<20000)&(data['area']>250)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>30000)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']<50000)&(data['area']>600)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>50000)&(data['area']>350)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']>4000)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']<600)&(data['area']>100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['area']>165)].index,inplace=True)
    data.drop(data[(data['region']=='RG00012') & (data['tradeMoney']<800)&(data['area']<30)].index,inplace=True)
    data.drop(data[(data['region']=='RG00007') & (data['tradeMoney']<1100)&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['tradeMoney']>8000)&(data['area']<80)].index,inplace=True)
    data.loc[(data['region']=='RG00002')&(data['area']>50)&(data['rentType']=='合租'),'rentType']='整租'
    data.loc[(data['region']=='RG00014')&(data['rentType']=='合租')&(data['area']>60),'rentType']='整租'
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']>15000)&(data['area']<110)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']>20000)&(data['area']>110)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']<1500)&(data['area']<50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['rentType']=='合租')&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00015') ].index,inplace=True)
    data.reset_index(drop=True, inplace=True)
    return data

data_train = cleanData(data_train)

In [29]:
# 目标数据
target_train  = data_train.pop('tradeMoney')
target_test  = pd.read_csv('./评分文件/sub_a_913.csv')

KeyError: 'tradeMoney'

In [14]:
# 处理缺失和填补
def missing(data):
    # 删除无用特征
    data.drop(['city','ID'], axis = 1, inplace = True)
    # 填补缺失或不正常值
    buildYear_mode= data['buildYear'][data['buildYear'] != '暂无信息'].mode()
    data.loc[data['buildYear'] == '暂无信息','buildYear'] = buildYear_mode[0]
    data['buildYear'] = data['buildYear'].astype(int)
    data['pv'].fillna(data['pv'].mean(), inplace = True)
    data['uv'].fillna(data['uv'].mean(), inplace = True)
    return(data)
train,test = missing(data_train),missing(data_test)

## 特征拆分和融合

In [15]:
# 分类变量
# 将houseType转为'Room'，'Hall'，'Bath'
def housetype(data_train):
    def Room(x):
        Room = int(x.split('室')[0])
        return Room
    def Hall(x):
        Hall = int(x.split('室')[1].split('厅')[0])
        return(Hall)
    def Bash(x):
        Bash = int(x.split('室')[1].split('厅')[1].split('卫')[0])
    data_train['room'] = data_train['houseType'].apply(lambda x :Room(x))
    data_train['hall'] = data_train['houseType'].apply(lambda x :Room(x))
    data_train['bath'] = data_train['houseType'].apply(lambda x :Room(x))
    data_train['bath_room'] = (data_train['bath'] + 1) / (data_train['room'] + 1)
    return(data_train)

data_train, data_test = housetype(data_train), housetype(data_test)


In [16]:
# 填充租房类型
def renttype(data_train):
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['room'] <= 1), 'rentType'] = '整租'
    # print(data.loc[(data['rentType']=='未知方式')&(data['Room_Bath']>1),'rentType'])
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['bath_room'] > 1), 'rentType'] = '合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['room'] > 1) & (data_train['area'] < 50), 'rentType'] = '合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] / data_train['room'] < 20), 'rentType'] = '合租'
    # data.loc[(data['rentType']=='未知方式')&(data['area']>60),'rentType']='合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] <= 50) & (data_train['room'] == 2), 'rentType'] = '合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] > 60) & (data_train['room'] == 2), 'rentType'] = '整租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] <= 60) & (data_train['room'] == 3), 'rentType'] = '合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] > 60) & (data_train['room'] == 3), 'rentType'] = '整租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] >= 100) & (data_train['room'] > 3), 'rentType'] = '整租'
    return(data_train)

data_train, data_test = renttype(data_train), renttype(data_test)

In [17]:
data_train.loc[data_train['rentType'] == '--',['rentType', 'area', 'room', 'bath', 'bath_room']]

Unnamed: 0,rentType,area,room,bath,bath_room
3978,--,63.26,1,1,1.0
4003,--,49.0,2,2,1.0
4034,--,37.8,1,1,1.0
38122,--,30.0,1,1,1.0


In [21]:
# 对rentType中“--”填补
data_train.loc[(data_train['rentType'] == '--') & (data_train['area'] >60) & (data_train['room'] <= 1), 'rentType'] = '整租'
data_train.loc[(data_train['rentType'] == '--') & (data_train['area'] >50) & (data_train['room'] == 2), 'rentType'] = '整租'
data_train.loc[(data_train['rentType'] == '--')  & (data_train['room'] <= 1), 'rentType'] = '整租'

# 是否有必要将‘未知方式’细分全部填补
# data_train.loc[data_train['rentType'] == '未知方式',['rentType', 'area', 'room', 'bath', 'bath_room']]

In [22]:
# 处理tradeTime
def time(data_train):
    def month(x):
        month = int(x.split('/')[1])
        return(month)
    def day(x):
        day = int(x.split('/')[2])
        return(day)
    def year(x):
        year = int(x.split('/')[2])
        return(year)
    data_train['month'] = data_train['tradeTime'].apply(lambda x: month(x))
    data_train['day'] = data_train['tradeTime'].apply(lambda x: day(x))
    data_train['year'] = data_train['tradeTime'].apply(lambda x: year(x))
    return(data_train)

data_train, data_test = time(data_train), time(data_test)

In [23]:
# 其余特征融合（脑洞大开）
def otherca(data_train):
    data_train['pv/uv'] = data_train['pv'] / data_train['uv']
    data_train['房间总数'] = data_train['room'] + data_train['hall'] + data_train['bath']

    # 公共设施
    # 各交通工具权重自定（参照交通工具对租金影响程度而定）
    data_train['transportNum'] = 5*data_train['subwayStationNum'] / data_train['subwayStationNum'].mean() + data_train['busStationNum'] / data_train['busStationNum'].mean()
    # 教育设施权重（可类比，国际学校，私立学校，公立学校影响）
    data_train['all_SchoolNum'] = 2 * data_train['interSchoolNum'] / data_train['interSchoolNum'].mean() + data_train['schoolNum'] / data_train['schoolNum'].mean() + data_train['privateSchoolNum'] / data_train['privateSchoolNum'].mean()
    # 商场，同上
    data_train['all_mall'] = data_train['mallNum'] / data_train['mallNum'].mean() + data_train['superMarketNum'] / data_train['superMarketNum'].mean()
    # 医疗
    data_train['all_hospitalNum'] = 2 * data_train['hospitalNum'] / data_train['hospitalNum'].mean() + data_train['drugStoreNum'] / data_train['drugStoreNum'].mean()
    # 其他公共设施；体育馆，银行等
    data_train['otherNum'] = data_train['gymNum'] / data_train['gymNum'].mean() + data_train['bankNum'] / data_train['bankNum'].mean() + data_train['shopNum'] / data_train['shopNum'].mean() + 2 * data_train['parkNum'] / data_train['parkNum'].mean()

    # 删除已用特征
    data_train.drop(['houseType','tradeTime','subwayStationNum', 'busStationNum','interSchoolNum', 'schoolNum', 'privateSchoolNum',
                   'hospitalNum', 'drugStoreNum', 'mallNum', 'superMarketNum', 'gymNum', 'bankNum', 'shopNum', 'parkNum'],
                  axis=1, inplace=True)
    return(data_train)

data_train, data_test = otherca(data_train), otherca(data_test)

In [24]:
#计算统计特征
def featureCount(train,test):
    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    def feature_count(data, features=[]):
        new_feature = 'count'
        for i in features:
            new_feature += '_' + i
        temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
        data = data.merge(temp, 'left', on=features)
        return data

    data = feature_count(data, ['communityName'])
    data = feature_count(data, ['buildYear'])
    data = feature_count(data, ['totalFloor'])
    data = feature_count(data, ['communityName', 'totalFloor'])
    data = feature_count(data, ['communityName', 'newWorkers'])
    data = feature_count(data, ['communityName', 'totalTradeMoney'])
    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    return new_train, new_test
    
data_train, data_test = featureCount(data_train, data_test)

In [25]:
#groupby生成统计特征：mean,std等

def groupby(train,test):
    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate']
    for feature in columns:
        data[feature] = LabelEncoder().fit_transform(data[feature])

    temp = data.groupby('communityName')['area'].agg({'com_area_mean': 'mean', 'com_area_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='communityName', how='left')
    
    data['price_per_area'] = data.tradeMeanPrice / data.area * 100
    temp = data.groupby('communityName')['price_per_area'].agg(
        {'comm_price_mean': 'mean', 'comm_price_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='communityName', how='left')
   
    temp = data.groupby('plate')['price_per_area'].agg(
        {'plate_price_mean': 'mean', 'plate_price_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='plate', how='left')
    data.drop('price_per_area', axis=1, inplace=True)

    temp = data.groupby('plate')['area'].agg({'plate_area_mean': 'mean', 'plate_area_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='plate', how='left')
    
    temp = data.groupby(['plate'])['buildYear'].agg({'plate_year_mean': 'mean', 'plate_year_std': 'std'})
    data = data.merge(temp, on='plate', how='left')
    data.plate_year_mean = data.plate_year_mean.astype('int')
    data['comm_plate_year_diff'] = data.buildYear - data.plate_year_mean
    data.drop('plate_year_mean', axis=1, inplace=True)

    temp = data.groupby('plate')['transportNum'].agg('sum').reset_index(name='plate_transportNum')
    data = data.merge(temp, on='plate', how='left')
    temp = data.groupby(['communityName', 'plate'])['transportNum'].agg('sum').reset_index(name='com_transportNum')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')
    data['transportNum_ratio'] = list(map(lambda x, y: round(x / y, 3) if y != 0 else -1,
                                           data['com_transportNum'], data['plate_transportNum']))
    data = data.drop(['com_transportNum', 'plate_transportNum'], axis=1)

#     temp = data.groupby('plate')['all_SchoolNum'].agg('sum').reset_index(name='plate_all_SchoolNum')
#     data = data.merge(temp, on='plate', how='left')
#     temp = data.groupby(['communityName', 'plate'])['all_SchoolNum'].agg('sum').reset_index(name='com_all_SchoolNum')
#     data = data.merge(temp, on=['communityName', 'plate'], how='left')
#     data = data.drop(['com_all_SchoolNum', 'plate_all_SchoolNum'], axis=1)  # 没有融合

    temp = data.groupby(['communityName', 'plate'])['all_mall'].agg('sum').reset_index(name='com_all_mall')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')

    temp = data.groupby('plate')['otherNum'].agg('sum').reset_index(name='plate_otherNum')
    data = data.merge(temp, on='plate', how='left')
    temp = data.groupby(['communityName', 'plate'])['otherNum'].agg('sum').reset_index(name='com_otherNum')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')
    data['other_ratio'] = list(map(lambda x, y: round(x / y, 3) if y != 0 else -1,
                                   data['com_otherNum'], data['plate_otherNum']))
    data = data.drop(['com_otherNum', 'plate_otherNum'], axis=1)

    temp = data.groupby(['month', 'communityName']).size().reset_index(name='communityName_saleNum')
    data = data.merge(temp, on=['month', 'communityName'], how='left')
    temp = data.groupby(['month', 'plate']).size().reset_index(name='plate_saleNum')
    data = data.merge(temp, on=['month', 'plate'], how='left')

    data['sale_ratio'] = round((data.communityName_saleNum + 1) / (data.plate_saleNum + 1), 3)
    data['sale_newworker_differ'] = 3 * data.plate_saleNum - data.newWorkers
    data.drop(['communityName_saleNum', 'plate_saleNum'], axis=1, inplace=True)

    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    return new_train, new_test

data_train, data_test = groupby(data_train, data_test)

In [26]:
#聚类
def cluster(train,test):
    from sklearn.mixture import GaussianMixture

    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    col = ['totalFloor',
           'houseDecoration', 'communityName', 'region', 'plate', 'buildYear',

           'tradeMeanPrice', 'tradeSecNum', 'totalNewTradeMoney',
           'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum',

           'landTotalPrice', 'landMeanPrice', 'totalWorkers',
           'newWorkers', 'residentPopulation', 'lookNum',
           'transportNum',
           'all_SchoolNum', 'all_hospitalNum', 'all_mall', 'otherNum']

    # EM
    gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
    data['cluster']= pd.DataFrame(gmm.fit_predict(data[col]))


    col1 = ['totalFloor','houseDecoration', 'communityName', 'region', 'plate', 'buildYear']
    col2 = ['tradeMeanPrice', 'tradeSecNum', 'totalNewTradeMoney',
            'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum',
            'landTotalPrice', 'landMeanPrice', 'totalWorkers',
            'newWorkers', 'residentPopulation', 'lookNum',
            'transportNum',
            'all_SchoolNum', 'all_hospitalNum', 'all_mall', 'otherNum']
    for feature1 in col1:
        for feature2 in col2:
        
            temp = data.groupby(['cluster',feature1])[feature2].agg('mean').reset_index(name=feature2+'_'+feature1+'_cluster_mean')
            temp.fillna(0, inplace=True)
       
            data = data.merge(temp, on=['cluster', feature1], how='left')
    
   
    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    
    return new_train, new_test

data_train, data_test = cluster(data_train, data_test)   

In [27]:
# 过大量级值取log平滑（针对线性模型有效）
big_num_cols = ['totalTradeMoney','totalTradeArea','tradeMeanPrice','totalNewTradeMoney', 'totalNewTradeArea',
                'tradeNewMeanPrice','remainNewNum', 'supplyNewNum', 'supplyLandArea',
                'tradeLandArea','landTotalPrice','landMeanPrice','totalWorkers','newWorkers',
                'residentPopulation','pv','uv']

for col in big_num_cols:
        data_train[col] = data_train[col].map(lambda x: np.log1p(x))
        data_test[col] = data_test[col].map(lambda x: np.log1p(x))
        

In [31]:
#对比特征工程前后线性模型结果情况
data_test = data_test.fillna(0)
data_train = data_train.fillna(0)
# Lasso回归
from sklearn.linear_model import Lasso
lasso=Lasso(alpha=0.1,fit_intercept = True)
lasso.fit(data_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(data_train)
y_pred_test=lasso.predict(data_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

训练集结果： 0.7299971022440981
测试集结果： 0.813833276187229


In [32]:
#相关系数法特征选择
from sklearn.feature_selection import SelectKBest

print(data_train.shape)

sk = SelectKBest(k=150)
new_train = sk.fit_transform(data_train,target_train)
print(new_train.shape)

# 获取对应列索引
select_columns=sk.get_support(indices = True)
print(select_columns)

# 获取对应列名
print(data_train.columns[select_columns])
select_columns_name=data_test.columns[select_columns]
new_test=data_test[select_columns_name]
print(new_test.shape)
# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

(40134, 177)
(40134, 150)
[  0   1   3   4   5   6   7   8   9  10  12  13  14  15  16  17  18  19
  20  21  22  24  27  28  29  30  33  34  35  37  38  39  40  41  43  44
  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62
  64  66  68  69  70  71  72  73  74  75  76  77  78  79  80  81  83  84
  85  86  87  88  89  90  92  93  94  95  96  97  98  99 100 101 102 103
 104 105 106 108 109 110 111 114 116 117 119 120 121 122 123 124 125 126
 127 128 129 132 133 134 135 136 137 138 139 140 141 142 144 145 146 147
 150 151 152 153 155 156 157 158 159 160 161 162 163 164 165 168 169 170
 171 172 173 174 175 176]
Index(['area', 'rentType', 'totalFloor', 'houseToward', 'houseDecoration',
       'communityName', 'region', 'plate', 'buildYear', 'saleSecHouseNum',
       ...
       'remainNewNum_buildYear_cluster_mean',
       'totalWorkers_buildYear_cluster_mean',
       'newWorkers_buildYear_cluster_mean',
       'residentPopulation_buildYear_cluster_mean',
       'lookNum

In [34]:
# Wrapper

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=100)
rfe.fit(data_train,target_train)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=40, step=1, verbose=0)




select_columns = [f for f, s in zip(data_train.columns, rfe.support_) if s]
print(select_columns)
new_train = data_train[select_columns]
new_test = data_test[select_columns]

# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

['area', 'rentType', 'houseFloor', 'totalFloor', 'houseToward', 'houseDecoration', 'region', 'plate', 'buildYear', 'saleSecHouseNum', 'totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice', 'totalNewTradeMoney', 'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum', 'supplyNewNum', 'supplyLandNum', 'supplyLandArea', 'tradeLandNum', 'tradeLandArea', 'landTotalPrice', 'landMeanPrice', 'totalWorkers', 'newWorkers', 'residentPopulation', 'pv', 'uv', 'lookNum', 'room', 'hall', 'bath', 'month', 'pv/uv', '房间总数', 'transportNum', 'all_SchoolNum', 'all_mall', 'all_hospitalNum', 'otherNum', 'count_communityName_totalFloor', 'com_area_mean', 'plate_area_mean', 'plate_area_std', 'plate_year_std', 'comm_plate_year_diff', 'transportNum_ratio', 'other_ratio', 'sale_ratio', 'cluster', 'lookNum_totalFloor_cluster_mean', 'transportNum_totalFloor_cluster_mean', 'all_SchoolNum_totalFloor_cluster_mean', 'all_hospitalNum_totalFloor_cluster_mean', 'all_mall_totalFloor_cluster_mean', 'otherNum

In [35]:
# Embedded
# 基于惩罚项的特征选择法
# Lasso(l1)和Ridge(l2)

from sklearn.linear_model import Ridge

ridge = Ridge(alpha=5)
ridge.fit(data_train,target_train)

Ridge(alpha=5, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

# 特征系数排序
coefSort = ridge.coef_.argsort()
print(coefSort)


# 特征系数
featureCoefSore=ridge.coef_[coefSort]
print(featureCoefSore)


select_columns = [f for f, s in zip(data_train.columns, featureCoefSore) if abs(s)> 0.0000005 ] 
# 选择绝对值大于0.0000005的特征
# print(select_columns)

new_train = data_train[select_columns]
new_test = data_test[select_columns]
# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

[ 66  30 154 139  13 121 120  43 122 176  84  81 118 136 135 153  65 140
  63 119  23 172  16 137  17  28   9  22 173  19 164   2   4  92 128  59
  11  74  27 124   5  10  24 129 100  97  93  47  14 110 147 102  90   6
 104 144  87 103  70  52  77  57  38  39 167 113  69 149 127  75  98  49
  80 169 116 152  55  67 114 150 151  56 109 125  94 166 130 161  71 107
 148  36 112  76 143  89 168  78 170 115  48  73 163 132  79 134 145 159
 123 105  58 108 165  96 133 141  72  91 131 162  51 126  95  18  99  68
 160  64 111  32 106 101 142  26  50  53 117  20  25  61  54  82  33  35
  34 146  37   3  41  88  62 171   0  60  15 156  40  86   1   8  45  83
  12  21  46 158 174  29  44 138 175 157 155   7  85  31  42]
[-7.64380383e+02 -7.01000123e+02 -6.18610506e+02 -4.01673108e+02
 -3.76898538e+02 -3.74611706e+02 -3.57133169e+02 -3.16328382e+02
 -3.06687287e+02 -2.89510336e+02 -2.82427273e+02 -2.69696038e+02
 -2.67222424e+02 -2.34322618e+02 -1.81762294e+02 -1.41975945e+02
 -1.36156981e+02 -1.1

In [37]:
# Embedded
# 基于树模型的特征选择法
# 随机森林 平均不纯度减少（mean decrease impurity


from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
# 训练随机森林模型，并通过feature_importances_属性获取每个特征的重要性分数。rf = RandomForestRegressor()
rf.fit(data_train,target_train)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), data_train.columns),
             reverse=True))

select_columns = [f for f, s in zip(data_train.columns, rf.feature_importances_) if abs(s)> 0.00005 ] 
# 选择绝对值大于0.00005的特征

new_train = data_train[select_columns]
new_test = data_test[select_columns]

# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

Features sorted by their score:
[(0.447, 'area'), (0.1362, 'tradeMeanPrice_plate_cluster_mean'), (0.0797, 'tradeMeanPrice_communityName_cluster_mean'), (0.03, 'plate_area_mean'), (0.0255, 'com_area_mean'), (0.0203, 'plate_year_std'), (0.0193, 'plate_area_std'), (0.0155, 'tradeNewMeanPrice_plate_cluster_mean'), (0.0094, 'totalFloor'), (0.0085, 'comm_plate_year_diff'), (0.0076, 'buildYear'), (0.0073, 'tradeNewMeanPrice_communityName_cluster_mean'), (0.007, 'plate_price_mean'), (0.007, 'comm_price_mean'), (0.0065, 'transportNum'), (0.0051, 'communityName'), (0.0045, 'sale_ratio'), (0.0041, 'tradeSecNum_communityName_cluster_mean'), (0.0037, 'com_area_std'), (0.0037, 'com_all_mall'), (0.0036, 'remainNewNum_communityName_cluster_mean'), (0.0034, 'transportNum_ratio'), (0.0033, 'transportNum_plate_cluster_mean'), (0.0033, 'count_communityName'), (0.0031, 'all_SchoolNum_communityName_cluster_mean'), (0.003, 'transportNum_communityName_cluster_mean'), (0.003, 'all_hospitalNum_communityName_clu

## 模型选择

In [38]:
#coding:utf-8
#导入warnings包，利用过滤器来实现忽略警告语句。
import warnings
warnings.filterwarnings('ignore')

# GBDT
from sklearn.ensemble import GradientBoostingRegressor
# XGBoost
import xgboost as xgb
# LightGBM
import lightgbm as lgb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
import pickle
import multiprocessing
from sklearn.preprocessing import StandardScaler
ss = StandardScaler() 
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC,LinearRegression,LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest

In [41]:
# LinearRegression回归
lr= LinearRegression(normalize=False,fit_intercept = True)
lr.fit(data_train,target_train)
#预测测试集和训练集结果
y_pred_train=lr.predict(data_train)
y_pred_test=lr.predict(data_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

训练集结果： 0.73448780527957
测试集结果： -0.1914830664273488


In [42]:
# Randomforest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
# 训练随机森林模型，并通过feature_importances_属性获取每个特征的重要性分数。rf = RandomForestRegressor()
rf.fit(data_train,target_train)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), data_train.columns),
             reverse=True))

select_columns = [f for f, s in zip(data_train.columns, rf.feature_importances_) if abs(s)> 0.00005 ] 
# 选择绝对值大于0.00005的特征

new_train = data_train[select_columns]
new_test = data_test[select_columns]

# LinearRegression回归
lr= LinearRegression(normalize=False,fit_intercept = True)
lr.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lr.predict(new_train)
y_pred_test=lr.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

Features sorted by their score:
[(0.4436, 'area'), (0.1616, 'tradeMeanPrice_plate_cluster_mean'), (0.0643, 'tradeMeanPrice_communityName_cluster_mean'), (0.0236, 'com_area_mean'), (0.0232, 'plate_area_std'), (0.0226, 'plate_year_std'), (0.0224, 'plate_area_mean'), (0.0117, 'comm_plate_year_diff'), (0.0095, 'tradeNewMeanPrice_plate_cluster_mean'), (0.0086, 'comm_price_mean'), (0.0084, 'plate_price_mean'), (0.0078, 'tradeNewMeanPrice_communityName_cluster_mean'), (0.0078, 'totalFloor'), (0.0057, 'buildYear'), (0.0048, 'sale_ratio'), (0.0047, 'transportNum'), (0.0045, 'transportNum_communityName_cluster_mean'), (0.0041, 'tradeSecNum_communityName_cluster_mean'), (0.0041, 'tradeMeanPrice'), (0.0041, 'com_all_mall'), (0.004, 'remainNewNum_communityName_cluster_mean'), (0.0039, 'other_ratio'), (0.0039, 'com_area_std'), (0.0034, 'count_communityName'), (0.0034, 'communityName'), (0.0032, 'tradeNewMeanPrice'), (0.0029, 'year'), (0.0027, 'tradeMeanPrice_buildYear_cluster_mean'), (0.0027, 'day')

In [None]:
# lightgbm
from __future__ import print_function
import lightgbm as lgb
import sklearn
import numpy
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import colorama
import numpy as np

N_HYPEROPT_PROBES = 500
HYPEROPT_ALGO = tpe.suggest  #  tpe.suggest OR hyperopt.rand.suggest

# ----------------------------------------------------------

colorama.init()

# ---------------------------------------------------------------------

def get_lgb_params(space):
    lgb_params = dict()
    lgb_params['boosting_type'] = space['boosting_type'] if 'boosting_type' in space else 'gbdt'
    lgb_params['objective'] = 'regression'
    lgb_params['metric'] = 'rmse'
    lgb_params['learning_rate'] = space['learning_rate']
    lgb_params['num_leaves'] = int(space['num_leaves'])
    lgb_params['min_data_in_leaf'] = int(space['min_data_in_leaf'])
    lgb_params['min_sum_hessian_in_leaf'] = space['min_sum_hessian_in_leaf']
    lgb_params['max_depth'] = -1
    lgb_params['lambda_l1'] = space['lambda_l1'] if 'lambda_l1' in space else 0.0
    lgb_params['lambda_l2'] = space['lambda_l2'] if 'lambda_l2' in space else 0.0
    lgb_params['max_bin'] = int(space['max_bin']) if 'max_bin' in space else 256
    lgb_params['feature_fraction'] = space['feature_fraction']
    lgb_params['bagging_fraction'] = space['bagging_fraction']
    lgb_params['bagging_freq'] = int(space['bagging_freq']) if 'bagging_freq' in space else 1
    lgb_params['nthread'] = 4
    return lgb_params

# ---------------------------------------------------------------------

obj_call_count = 0
cur_best_score = 0 # 0 or np.inf
log_writer = open( './lgb-hyperopt-log.txt', 'w' )


def objective(space):
    global obj_call_count, cur_best_score

    obj_call_count += 1

    print('\nLightGBM objective call #{} cur_best_score={:7.5f}'.format(obj_call_count,cur_best_score) )

    lgb_params = get_lgb_params(space)

    sorted_params = sorted(space.items(), key=lambda z: z[0])
    params_str = str.join(' ', ['{}={}'.format(k, v) for k, v in sorted_params])
    print('Params: {}'.format(params_str) )
    
    kf = KFold(n_splits=3, shuffle=True, random_state=0)
    out_of_fold = np.zeros(len(X_train))
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        D_train = lgb.Dataset(X_train.iloc[train_idx], label=Y_train[train_idx])
        D_val = lgb.Dataset(X_train.iloc[val_idx], label=Y_train[val_idx])
        # Train
        num_round = 10000
        clf = lgb.train(lgb_params,
                           D_train,
                           num_boost_round=num_round,
                           # metrics='mlogloss',
                           valid_sets=D_val,
                           # valid_names='val',
                           # fobj=None,
                           # feval=None,
                           # init_model=None,
                           # feature_name='auto',
                           # categorical_feature='auto',
                           early_stopping_rounds=200,
                           # evals_result=None,
                           verbose_eval=False,
                           # learning_rates=None,
                           # keep_training_booster=False,
                           # callbacks=None
                           )
        # predict
        nb_trees = clf.best_iteration
        val_loss = clf.best_score['valid_0']
        print('nb_trees={} val_loss={}'.format(nb_trees, val_loss))
        out_of_fold[val_idx] = clf.predict(X_train.iloc[val_idx], num_iteration=nb_trees)
        score = r2_score(out_of_fold, Y_train)

    print('val_r2_score={}'.format(score))

    log_writer.write('score={} Params:{} nb_trees={}\n'.format(score, params_str, nb_trees ))
    log_writer.flush()

    if score>cur_best_score:
        cur_best_score = score
        print(colorama.Fore.GREEN + 'NEW BEST SCORE={}'.format(cur_best_score) + colorama.Fore.RESET)
    return {'loss': -score, 'status': STATUS_OK}

# --------------------------------------------------------------------------------

space ={
        'num_leaves': hp.quniform ('num_leaves', 10, 100, 1),
        'min_data_in_leaf':  hp.quniform ('min_data_in_leaf', 10, 100, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.75, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.75, 1.0),
        'learning_rate': hp.uniform('learning_rate', 0, 0.01),
#         'learning_rate': hp.loguniform('learning_rate', -5.0, -2.3),
        'min_sum_hessian_in_leaf': hp.loguniform('min_sum_hessian_in_leaf', 0, 2.3),
        'max_bin': hp.quniform ('max_bin', 88, 200, 1),
        'bagging_freq': hp.quniform ('bagging_freq', 1, 15, 1),
        'lambda_l1': hp.uniform('lambda_l1', 0, 10 ),
        'lambda_l2': hp.uniform('lambda_l2', 0, 10 ),
       }

trials = Trials()
best = hyperopt.fmin(fn=objective,
                     space=space,
                     algo=HYPEROPT_ALGO,
                     max_evals=N_HYPEROPT_PROBES,
                     trials=trials,
                     verbose=1)

print('-'*50)
print('The best params:')
print( best )
print('\n\n')