# 任务概述
本阶段主要对数据的特征作处理，分两步进行。
* 特征工程
* 特征选择

In [147]:
# 加载包
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest


In [148]:
# 导入数据
data_train = pd.read_csv('train_data.csv')
data_test = pd.read_csv('test_a.csv')
# 提取target
train_target = data_train.pop('tradeMoney')
test_target = pd.read_csv('sub_a_913.csv')
test_target = test_target['pre']

In [149]:
# 处理缺失和填补
def missing(data):
    # 删除无用特征
    data.drop(['city','ID'], axis = 1, inplace = True)
    # 填补缺失或不正常值
    buildYear_mode= data['buildYear'][data['buildYear'] != '暂无信息'].mode()
    data.loc[data['buildYear'] == '暂无信息','buildYear'] = buildYear_mode[0]
    data['buildYear'] = data['buildYear'].astype(int)
    data['pv'].fillna(data['pv'].mean(), inplace = True)
    data['uv'].fillna(data['uv'].mean(), inplace = True)
    return(data)
train,test = missing(data_train),missing(data_test)

In [150]:
# # clean target
# def IF_drop(train):
#     IForest = IsolationForest(contamination=0.01)
#     IForest.fit(train.values.reshape(-1,1))
#     y_pred = IForest.predict(train.values.reshape(-1,1))
#     drop_index = train.loc[y_pred==-1].index
#     print(len(drop_index))
#     train.drop(drop_index,inplace=True)
#     return train

# train_target = IF_drop(train_target)
# 异常值清洗
# 部分数据清洗

## 特征拆分和融合

In [151]:
# 分类变量
# 将houseType转为'Room'，'Hall'，'Bath'
def housetype(data_train):
    def Room(x):
        Room = int(x.split('室')[0])
        return Room
    def Hall(x):
        Hall = int(x.split('室')[1].split('厅')[0])
        return(Hall)
    def Bash(x):
        Bash = int(x.split('室')[1].split('厅')[1].split('卫')[0])
    data_train['room'] = data_train['houseType'].apply(lambda x :Room(x))
    data_train['hall'] = data_train['houseType'].apply(lambda x :Room(x))
    data_train['bath'] = data_train['houseType'].apply(lambda x :Room(x))
    data_train['bath_room'] = (data_train['bath'] + 1) / (data_train['room'] + 1)
    return(data_train)

data_train, data_test = housetype(data_train), housetype(data_test)


In [152]:
# 填充租房类型
def renttype(data_train):
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['room'] <= 1), 'rentType'] = '整租'
    # print(data.loc[(data['rentType']=='未知方式')&(data['Room_Bath']>1),'rentType'])
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['bath_room'] > 1), 'rentType'] = '合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['room'] > 1) & (data_train['area'] < 50), 'rentType'] = '合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] / data_train['room'] < 20), 'rentType'] = '合租'
    # data.loc[(data['rentType']=='未知方式')&(data['area']>60),'rentType']='合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] <= 50) & (data_train['room'] == 2), 'rentType'] = '合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] > 60) & (data_train['room'] == 2), 'rentType'] = '整租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] <= 60) & (data_train['room'] == 3), 'rentType'] = '合租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] > 60) & (data_train['room'] == 3), 'rentType'] = '整租'
    data_train.loc[(data_train['rentType'] == '未知方式') & (data_train['area'] >= 100) & (data_train['room'] > 3), 'rentType'] = '整租'
    return(data_train)

data_train, data_test = renttype(data_train), renttype(data_test)

In [153]:
data_train.loc[data_train['rentType'] == '--',['rentType', 'area', 'room', 'bath', 'bath_room']]

Unnamed: 0,rentType,area,room,bath,bath_room
4370,--,63.26,1,1,1.0
4396,--,49.0,2,2,1.0
4428,--,37.8,1,1,1.0
38859,--,136.0,3,3,1.0
39407,--,30.0,1,1,1.0


In [154]:
# 对rentType中“--”填补
data_train.loc[(data_train['rentType'] == '--') & (data_train['area'] >60) & (data_train['room'] <= 1), 'rentType'] = '整租'
data_train.loc[(data_train['rentType'] == '--') & (data_train['area'] >50) & (data_train['room'] == 2), 'rentType'] = '整租'
data_train.loc[(data_train['rentType'] == '--')  & (data_train['room'] <= 1), 'rentType'] = '整租'
data_train.loc[(data_train['rentType'] == '--') & (data_train['area'] >100) & (data_train['room'] == 3), 'rentType'] = '整租'

In [155]:
# 是否有必要将‘未知方式’细分全部填补
data_train.loc[data_train['rentType'] == '未知方式',['rentType', 'area', 'room', 'bath', 'bath_room']].head(10)

Unnamed: 0,rentType,area,room,bath,bath_room
19,未知方式,58.0,2,2,1.0
30,未知方式,50.65,2,2,1.0
41,未知方式,55.97,2,2,1.0
49,未知方式,50.55,2,2,1.0
66,未知方式,58.0,2,2,1.0
71,未知方式,53.12,2,2,1.0
72,未知方式,59.0,2,2,1.0
76,未知方式,54.0,2,2,1.0
81,未知方式,58.0,2,2,1.0
85,未知方式,55.0,2,2,1.0


In [156]:
# 处理tradeTime
def time(data_train):
    def month(x):
        month = int(x.split('/')[1])
        return(month)
    def day(x):
        day = int(x.split('/')[2])
        return(day)
    def year(x):
        year = int(x.split('/')[2])
        return(year)
    data_train['month'] = data_train['tradeTime'].apply(lambda x: month(x))
    data_train['day'] = data_train['tradeTime'].apply(lambda x: day(x))
    data_train['year'] = data_train['tradeTime'].apply(lambda x: year(x))
    return(data_train)

data_train, data_test = time(data_train), time(data_test)

In [157]:
# 其余特征融合（脑洞大开）
def otherca(data_train):
    data_train['pv/uv'] = data_train['pv'] / data_train['uv']
    data_train['房间总数'] = data_train['room'] + data_train['hall'] + data_train['bath']

    # 公共设施
    # 各交通工具权重自定（参照交通工具对租金影响程度而定）
    data_train['transportNum'] = 5*data_train['subwayStationNum'] / data_train['subwayStationNum'].mean() + data_train['busStationNum'] / data_train['busStationNum'].mean()
    # 教育设施权重（可类比，国际学校，私立学校，公立学校影响）
    data_train['all_SchoolNum'] = 2 * data_train['interSchoolNum'] / data_train['interSchoolNum'].mean() + data_train['schoolNum'] / data_train['schoolNum'].mean() + data_train['privateSchoolNum'] / data_train['privateSchoolNum'].mean()
    # 商场，同上
    data_train['all_mall'] = data_train['mallNum'] / data_train['mallNum'].mean() + data_train['superMarketNum'] / data_train['superMarketNum'].mean()
    # 医疗
    data_train['all_hospitalNum'] = 2 * data_train['hospitalNum'] / data_train['hospitalNum'].mean() + data_train['drugStoreNum'] / data_train['drugStoreNum'].mean()
    # 其他公共设施；体育馆，银行等
    data_train['otherNum'] = data_train['gymNum'] / data_train['gymNum'].mean() + data_train['bankNum'] / data_train['bankNum'].mean() + data_train['shopNum'] / data_train['shopNum'].mean() + 2 * data_train['parkNum'] / data_train['parkNum'].mean()

    # 删除已用特征
    data_train.drop(['houseType','tradeTime','subwayStationNum', 'busStationNum','interSchoolNum', 'schoolNum', 'privateSchoolNum',
                   'hospitalNum', 'drugStoreNum', 'mallNum', 'superMarketNum', 'gymNum', 'bankNum', 'shopNum', 'parkNum'],
                  axis=1, inplace=True)
    return(data_train)

data_train, data_test = otherca(data_train), otherca(data_test)

In [158]:
#计算统计特征
def featureCount(train,test):
    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    def feature_count(data, features=[]):
        new_feature = 'count'
        for i in features:
            new_feature += '_' + i
        temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
        data = data.merge(temp, 'left', on=features)
        return data

    data = feature_count(data, ['communityName'])
    data = feature_count(data, ['buildYear'])
    data = feature_count(data, ['totalFloor'])
    data = feature_count(data, ['communityName', 'totalFloor'])
    data = feature_count(data, ['communityName', 'newWorkers'])
    data = feature_count(data, ['communityName', 'totalTradeMoney'])
    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    return new_train, new_test
    
data_train, data_test = featureCount(data_train, data_test)

In [159]:
#groupby生成统计特征：mean,std等

def groupby(train,test):
    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate']
    for feature in columns:
        data[feature] = LabelEncoder().fit_transform(data[feature])

    temp = data.groupby('communityName')['area'].agg({'com_area_mean': 'mean', 'com_area_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='communityName', how='left')
    
    data['price_per_area'] = data.tradeMeanPrice / data.area * 100
    temp = data.groupby('communityName')['price_per_area'].agg(
        {'comm_price_mean': 'mean', 'comm_price_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='communityName', how='left')
   
    temp = data.groupby('plate')['price_per_area'].agg(
        {'plate_price_mean': 'mean', 'plate_price_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='plate', how='left')
    data.drop('price_per_area', axis=1, inplace=True)

    temp = data.groupby('plate')['area'].agg({'plate_area_mean': 'mean', 'plate_area_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='plate', how='left')
    
    temp = data.groupby(['plate'])['buildYear'].agg({'plate_year_mean': 'mean', 'plate_year_std': 'std'})
    data = data.merge(temp, on='plate', how='left')
    data.plate_year_mean = data.plate_year_mean.astype('int')
    data['comm_plate_year_diff'] = data.buildYear - data.plate_year_mean
    data.drop('plate_year_mean', axis=1, inplace=True)

    temp = data.groupby('plate')['transportNum'].agg('sum').reset_index(name='plate_transportNum')
    data = data.merge(temp, on='plate', how='left')
    temp = data.groupby(['communityName', 'plate'])['transportNum'].agg('sum').reset_index(name='com_transportNum')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')
    data['transportNum_ratio'] = list(map(lambda x, y: round(x / y, 3) if y != 0 else -1,
                                           data['com_transportNum'], data['plate_transportNum']))
    data = data.drop(['com_transportNum', 'plate_transportNum'], axis=1)

#     temp = data.groupby('plate')['all_SchoolNum'].agg('sum').reset_index(name='plate_all_SchoolNum')
#     data = data.merge(temp, on='plate', how='left')
#     temp = data.groupby(['communityName', 'plate'])['all_SchoolNum'].agg('sum').reset_index(name='com_all_SchoolNum')
#     data = data.merge(temp, on=['communityName', 'plate'], how='left')
#     data = data.drop(['com_all_SchoolNum', 'plate_all_SchoolNum'], axis=1)  # 没有融合

    temp = data.groupby(['communityName', 'plate'])['all_mall'].agg('sum').reset_index(name='com_all_mall')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')

    temp = data.groupby('plate')['otherNum'].agg('sum').reset_index(name='plate_otherNum')
    data = data.merge(temp, on='plate', how='left')
    temp = data.groupby(['communityName', 'plate'])['otherNum'].agg('sum').reset_index(name='com_otherNum')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')
    data['other_ratio'] = list(map(lambda x, y: round(x / y, 3) if y != 0 else -1,
                                   data['com_otherNum'], data['plate_otherNum']))
    data = data.drop(['com_otherNum', 'plate_otherNum'], axis=1)

    temp = data.groupby(['month', 'communityName']).size().reset_index(name='communityName_saleNum')
    data = data.merge(temp, on=['month', 'communityName'], how='left')
    temp = data.groupby(['month', 'plate']).size().reset_index(name='plate_saleNum')
    data = data.merge(temp, on=['month', 'plate'], how='left')

    data['sale_ratio'] = round((data.communityName_saleNum + 1) / (data.plate_saleNum + 1), 3)
    data['sale_newworker_differ'] = 3 * data.plate_saleNum - data.newWorkers
    data.drop(['communityName_saleNum', 'plate_saleNum'], axis=1, inplace=True)

    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    return new_train, new_test

data_train, data_test = groupby(data_train, data_test)

In [185]:
#聚类
def cluster(train,test):
    from sklearn.mixture import GaussianMixture

    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    col = ['totalFloor',
           'houseDecoration', 'communityName', 'region', 'plate', 'buildYear',

           'tradeMeanPrice', 'tradeSecNum', 'totalNewTradeMoney',
           'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum',

           'landTotalPrice', 'landMeanPrice', 'totalWorkers',
           'newWorkers', 'residentPopulation', 'lookNum',
           'transportNum',
           'all_SchoolNum', 'all_hospitalNum', 'all_mall', 'otherNum']

    # EM
    gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
    data['cluster']= pd.DataFrame(gmm.fit_predict(data[col]))


    col1 = ['totalFloor','houseDecoration', 'communityName', 'region', 'plate', 'buildYear']
    col2 = ['tradeMeanPrice', 'tradeSecNum', 'totalNewTradeMoney',
            'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum',
            'landTotalPrice', 'landMeanPrice', 'totalWorkers',
            'newWorkers', 'residentPopulation', 'lookNum',
            'transportNum',
            'all_SchoolNum', 'all_hospitalNum', 'all_mall', 'otherNum']
    for feature1 in col1:
        for feature2 in col2:
        
            temp = data.groupby(['cluster',feature1])[feature2].agg('mean').reset_index(name=feature2+'_'+feature1+'_cluster_mean')
            temp.fillna(0, inplace=True)
       
            data = data.merge(temp, on=['cluster', feature1], how='left')
    
   
    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    
    return new_train, new_test

data_train, data_test = cluster(data_train, data_test)   

# 特征选择

In [161]:
# 过大量级值取log平滑（针对线性模型有效）
big_num_cols = ['totalTradeMoney','totalTradeArea','tradeMeanPrice','totalNewTradeMoney', 'totalNewTradeArea',
                'tradeNewMeanPrice','remainNewNum', 'supplyNewNum', 'supplyLandArea',
                'tradeLandArea','landTotalPrice','landMeanPrice','totalWorkers','newWorkers',
                'residentPopulation','pv','uv']

for col in big_num_cols:
        data_train[col] = data_train[col].map(lambda x: np.log1p(x))
        data_test[col] = data_test[col].map(lambda x: np.log1p(x))
        

In [163]:
#对比特征工程前后线性模型结果情况
data_test = data_test.fillna(0)
data_train = data_train.fillna(0)
# Lasso回归
from sklearn.linear_model import Lasso
lasso=Lasso(alpha=0.1,fit_intercept = True)
lasso.fit(data_train,train_target)
#预测测试集和训练集结果
y_pred_train=lasso.predict(data_train)
y_pred_test=lasso.predict(data_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,train_target)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, test_target)
print("测试集结果：",score_test)

训练集结果： -46.32801917128227
测试集结果： -0.0007724994345201797


In [177]:
#相关系数法特征选择
from sklearn.feature_selection import SelectKBest

print(data_train.shape)

sk = SelectKBest(k=150)
new_train = sk.fit_transform(data_train,train_target)
print(new_train.shape)

# 获取对应列索引
select_columns=sk.get_support(indices = True)
print(select_columns)

# 获取对应列名
print(data_train.columns[select_columns])
select_columns_name=data_test.columns[select_columns]
new_test=data_test[select_columns_name]
print(new_test.shape)
# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,train_target)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,train_target)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, test_target)
print("测试集结果：",score_test)

(41440, 177)
(41440, 150)
[  0   1   3   4   5   6   7   8   9  10  12  13  14  15  16  17  18  19
  20  21  22  27  28  29  30  31  33  34  35  37  38  39  40  41  43  44
  45  46  47  48  49  50  51  52  53  55  56  57  58  59  60  61  62  64
  66  67  68  69  70  71  72  73  74  75  78  79  80  81  82  83  84  85
  86  87  88  89  90  91  92  93  96  97  98  99 100 101 102 103 104 105
 106 107 108 109 110 111 114 115 116 117 119 120 121 122 123 124 125 126
 127 128 129 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
 147 150 151 152 153 155 156 157 158 159 161 162 163 164 165 168 169 170
 171 172 173 174 175 176]
Index(['area', 'rentType', 'totalFloor', 'houseToward', 'houseDecoration',
       'communityName', 'region', 'plate', 'buildYear', 'saleSecHouseNum',
       ...
       'remainNewNum_buildYear_cluster_mean',
       'totalWorkers_buildYear_cluster_mean',
       'newWorkers_buildYear_cluster_mean',
       'residentPopulation_buildYear_cluster_mean',
       'lookNum

In [178]:
# Wrapper

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=100)
rfe.fit(data_train,train_target)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=40, step=1, verbose=0)




select_columns = [f for f, s in zip(data_train.columns, rfe.support_) if s]
print(select_columns)
new_train = data_train[select_columns]
new_test = data_test[select_columns]

# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,train_target)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,train_target)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, test_target)
print("测试集结果：",score_test)

['rentType', 'houseFloor', 'totalFloor', 'houseToward', 'houseDecoration', 'region', 'plate', 'buildYear', 'saleSecHouseNum', 'totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice', 'totalNewTradeMoney', 'totalNewTradeArea', 'tradeNewMeanPrice', 'remainNewNum', 'supplyNewNum', 'supplyLandNum', 'tradeLandNum', 'tradeLandArea', 'landTotalPrice', 'landMeanPrice', 'totalWorkers', 'newWorkers', 'residentPopulation', 'pv', 'uv', 'lookNum', 'room', 'hall', 'bath', 'month', 'day', 'year', 'pv/uv', '房间总数', 'transportNum', 'all_SchoolNum', 'all_mall', 'all_hospitalNum', 'otherNum', 'count_communityName_newWorkers', 'count_communityName_totalTradeMoney', 'com_area_mean', 'com_area_std', 'plate_area_mean', 'plate_year_std', 'comm_plate_year_diff', 'transportNum_ratio', 'com_all_mall', 'other_ratio', 'sale_ratio', 'cluster', 'tradeNewNum_totalFloor_cluster_mean', 'lookNum_totalFloor_cluster_mean', 'transportNum_totalFloor_cluster_mean', 'all_SchoolNum_totalFloor_cluster_mean', 'all_hospitalNum_total

In [183]:
# Embedded
# 基于惩罚项的特征选择法
# Lasso(l1)和Ridge(l2)

from sklearn.linear_model import Ridge

ridge = Ridge(alpha=5)
ridge.fit(data_train,train_target)

Ridge(alpha=5, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

# 特征系数排序
coefSort = ridge.coef_.argsort()
print(coefSort)


# 特征系数
featureCoefSore=ridge.coef_[coefSort]
print(featureCoefSore)


select_columns = [f for f, s in zip(data_train.columns, featureCoefSore) if abs(s)> 0.0000005 ] 
# 选择绝对值大于0.0000005的特征
print(select_columns)

new_train = data_train[select_columns]
new_test = data_test[select_columns]
# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,train_target)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,train_target)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, test_target)
print("测试集结果：",score_test)

[122 119 121  42  30  65  63 154 176  23 139 156  28  26 117  86  83 120
 138 175  15   7   2 140 110  13  74 173  11  62  22  20   4   3  88  59
  10 137  93   8  51 147 106  18  25  75   1 124  54  97 162 144 149 101
  90  14 151  95  64 169  79 126  50  87 141 133  91  67  99   0  80  49
  78 163  58  73 152 116 127 159  55 150 114  56 107 148  71  76 125  36
 166 130 112  94 143 161  89 123 109 132 145  57 170 168 134  77  48  98
  96 103 167 105  69  68 131 102 104   6  81  60 100 113 115  72 108 129
   5  39  38 160  34  35  33  92 165  53 111  47  52 142  41  70 128  12
  61 172  19   9  27 164  37 136 146  40  32  17 157  21  16  82 171 174
  29 135  24 153  84 158  43  31  45  66  85 155 118  44  46]
[-1.07043917e+05 -8.85050306e+04 -5.42423228e+04 -3.81877509e+04
 -3.73220157e+04 -3.26398380e+04 -3.10856881e+04 -2.79184696e+04
 -2.14706867e+04 -1.97127343e+04 -1.95677912e+04 -1.94806348e+04
 -1.81321739e+04 -1.63465145e+04 -1.60198762e+04 -1.36437337e+04
 -9.53828535e+03 -7.9

训练集结果： -46.32793332103571
测试集结果： -0.0007724776821702584


In [184]:
# Embedded
# 基于树模型的特征选择法
# 随机森林 平均不纯度减少（mean decrease impurity


from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
# 训练随机森林模型，并通过feature_importances_属性获取每个特征的重要性分数。rf = RandomForestRegressor()
rf.fit(data_train,train_target)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), data_train.columns),
             reverse=True))

select_columns = [f for f, s in zip(data_train.columns, rf.feature_importances_) if abs(s)> 0.00005 ] 
# 选择绝对值大于0.00005的特征

new_train = data_train[select_columns]
new_test = data_test[select_columns]

# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,train_target)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,train_target)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, test_target)
print("测试集结果：",score_test)

Features sorted by their score:
[(0.2735, 'year'), (0.0907, 'lookNum'), (0.089, 'tradeNewNum_houseDecoration_cluster_mean'), (0.088, 'all_mall_houseDecoration_cluster_mean'), (0.0762, 'tradeSecNum_houseDecoration_cluster_mean'), (0.0733, 'totalNewTradeArea_houseDecoration_cluster_mean'), (0.0696, 'totalNewTradeArea_totalFloor_cluster_mean'), (0.063, 'otherNum_houseDecoration_cluster_mean'), (0.0559, 'newWorkers_communityName_cluster_mean'), (0.0499, 'area'), (0.0174, 'all_SchoolNum_communityName_cluster_mean'), (0.0171, 'otherNum_communityName_cluster_mean'), (0.0125, 'day'), (0.0111, 'totalTradeMoney'), (0.008, 'sale_newworker_differ'), (0.0017, 'room'), (0.0004, 'pv'), (0.0004, 'com_area_mean'), (0.0003, 'tradeNewMeanPrice_buildYear_cluster_mean'), (0.0003, 'tradeMeanPrice_buildYear_cluster_mean'), (0.0003, 'count_communityName'), (0.0003, 'comm_plate_year_diff'), (0.0001, 'tradeSecNum'), (0.0001, 'tradeNewMeanPrice_region_cluster_mean'), (0.0001, 'tradeMeanPrice_totalFloor_cluster_m