In [1]:
#coding:utf-8
#导入warnings包，利用过滤器来实现忽略警告语句。
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from category_encoders import TargetEncoder
from xgboost.sklearn import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## 加载数据

In [2]:
df_train = pd.read_csv('./data/used_car_train_20200313.csv', sep=' ')
df_train['is_train'] = 1
df_test = pd.read_csv('./data/used_car_testB_20200421.csv', sep=' ')
df_test['is_train'] = 0

## 清洗异常数据

In [3]:
date_feature = ['regDate', 'creatDate']


def clean_error_month(x):
    '''
    清洗日期中月份出现错误的数据 将00->01
    '''
    x = str(x)
    if len(x) != 8:
        return pd.NaT
    if x[4:6] == '00':
        x = x[:4]+'01'+x[6:]
    return x


for feature in date_feature:
    df_train[feature] = df_train[feature].apply(clean_error_month)
    df_test[feature] = df_test[feature].apply(clean_error_month)

## 缺失值填充

In [4]:
df_train['model'] = df_train['model'].fillna(167)
df_train['bodyType'] = df_train['bodyType'].fillna(1)
df_train['fuelType'] = df_train['fuelType'].fillna(2)
df_train['gearbox'] = df_train['gearbox'].fillna(0)
df_train['power'] = df_train['power'].where(df_train['power'] <= 600, 600)
df_train['notRepairedDamage'] = df_train['notRepairedDamage'].replace('-', '1').astype('float')

df_test['bodyType'] = df_test['bodyType'].fillna(1)
df_test['fuelType'] = df_test['fuelType'].fillna(2)
df_test['gearbox'] = df_test['gearbox'].fillna(0)
df_test['power'] = df_test['power'].where(df_test['power'] <= 600, 600)
df_test['notRepairedDamage'] = df_test['notRepairedDamage'].replace('-', '1').astype('float')

In [5]:
# 合并数据集
df = pd.concat([df_train, df_test],axis=0)
df.head().append(df.tail())

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,is_train
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762,1
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522,1
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963,1
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699,1
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482,1
49995,249995,111443,20041005,4.0,4,0.0,2.0,1.0,150,15.0,...,0.000292,0.141804,0.076393,0.039272,2.072901,-2.531869,1.716978,-1.063437,0.326587,0
49996,249996,152834,20130409,65.0,1,0.0,0.0,0.0,179,4.0,...,0.000991,0.155868,0.108425,0.067841,1.358504,-3.290295,4.269809,0.140524,0.556221,0
49997,249997,132531,20041211,4.0,4,0.0,0.0,1.0,147,12.5,...,0.000318,0.141872,0.071968,0.042966,2.165658,-2.417885,1.370612,-1.073133,0.270602,0
49998,249998,143405,20020702,40.0,1,4.0,0.0,1.0,176,15.0,...,2.3e-05,0.067483,0.067526,0.009006,2.030114,-2.939244,0.569078,-1.718245,0.316379,0
49999,249999,78202,20090708,32.0,8,1.0,0.0,0.0,0,3.0,...,0.103947,0.096027,0.062328,0.11018,-3.68909,2.032376,0.109157,2.202828,0.847469,0


## 删除异常数据，注意只能删除训练集的数据，测试集数据是不能删除的

### 异常值是去除了 但是泛化能力如何未知(先不删除数据)

In [6]:
# # 使用【Q1-1.5IQR,Q3+1.5IQR】
# # 数值型变量
# numeric_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4',
#                     'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']

# # 类别型变量
# categorical_features = ['name', 'model', 'brand', 'bodyType',
#                         'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode']
# # 日期型变量
# date_features = ['regDate', 'creatDate']

# # 异常数据处理
# rules = []
# for feature in numeric_features:
#     Q1 = df_train[feature].quantile(0.25)
#     Q3 = df_train[feature].quantile(0.75)
#     IQR = Q3-Q1
#     min_border = Q1-3*IQR
#     max_border = Q3+3*IQR
#     rules.append((feature, min_border, max_border))

# for rule in rules:
#     feature, min_border, max_border = rule
#     df_train = df_train[(df_train[feature] <= max_border)
#                         & (df_train[feature] >= min_border)]
# print(rules)
# print(df_train.shape)

## 特征构建

### price做对数变换

In [7]:
df['price'] = np.log(df['price'])

In [8]:
def my_agg(data, dim, measure='price'):
    '''
    按照指定维度聚合并重命名,暂时维度只有1
    '''
    index = ['min', 'max', 'mean', 'median','sum', 'std']
    new_columns = [dim]+list(map(lambda x: f'{dim}_{measure}_{x}', index))
    new_df = data.groupby(dim).agg({
        measure: ['min', 'max', 'mean', 'median', 'sum', 'std']
    }).reset_index()
    new_df.fillna(0,inplace=True)
    new_df.columns = new_columns
    return new_df

### name字段处理（直接删除name字段，太过离散，血妈坑）

In [9]:
# df['name_count'] = df.groupby('name')['SaleID'].transform('count')
# name_price_df = my_agg(df[df['is_train'] == 1], 'name')
# df = pd.merge(df, name_price_df, on='name', how='left')
# df.head()

### 处理时间特征

In [10]:
df['used_days'] = (pd.to_datetime(df['creatDate'], format='%Y%m%d') -
                   pd.to_datetime(df['regDate'], format='%Y%m%d')).dt.days
df['used_years'] = round(df['used_days']/365, 1)
df['kilometer_div_years'] = df['kilometer']/df['used_years']
df['kilometer_div_days'] = df['kilometer']/df['used_days']

# 对使用天数进行分箱
df['use_days_bin_20'] = pd.qcut(df['used_days'], 20, labels=False)
use_days_bin_20_price_df = my_agg(df[df['is_train'] == 1], 'use_days_bin_20')
df = pd.merge(df, use_days_bin_20_price_df, on='use_days_bin_20', how='left')
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,used_years,kilometer_div_years,kilometer_div_days,use_days_bin_20,use_days_bin_20_price_min,use_days_bin_20_price_max,use_days_bin_20_price_mean,use_days_bin_20_price_median,use_days_bin_20_price_sum,use_days_bin_20_price_std
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,12.0,1.041667,0.002851,9,3.912023,11.512915,8.105361,8.101678,61049.581133,0.709339
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,13.0,1.153846,0.003153,10,3.555348,11.049301,7.888989,7.893572,59269.971526,0.736892
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,12.0,1.041667,0.002853,9,3.912023,11.512915,8.105361,8.101678,61049.581133,0.709339
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,19.5,0.769231,0.002105,18,2.397895,11.383955,6.792672,6.745236,50843.146651,0.952653
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,4.2,1.190476,0.003266,1,3.688879,11.396358,9.485865,9.510371,70764.551589,0.569662


### 处理区域编码(高基类别特征 使用target encoder进行编码）

In [11]:
df['region_count'] = df.groupby('regionCode')['SaleID'].transform('count')
region_price_df = my_agg(df[df['is_train'] == 1], 'regionCode')
df = pd.merge(df, region_price_df, on='regionCode', how='left')

region_used_days_df = my_agg(df, 'regionCode','used_days')
df = pd.merge(df, region_used_days_df, on='regionCode', how='left')

region_power_df = my_agg(df, 'regionCode','power')
df = pd.merge(df, region_power_df, on='regionCode', how='left')

region_kilometer_df = my_agg(df, 'regionCode','kilometer')
df = pd.merge(df, region_kilometer_df, on='regionCode', how='left')

### 处理城市编码(高基类别特征 使用target encoder进行编码）

In [12]:
# 由于使用的是德国邮编，最后三位是城市
df['city']=df['regionCode'].apply(lambda x:str(x)[-3:])
df['city_count'] = df.groupby('city')['SaleID'].transform('count')
city_price_df = my_agg(df[df['is_train'] == 1],'city')
df = pd.merge(df, city_price_df, on='city', how='left')

city_used_days_df = my_agg(df, 'city','used_days')
df = pd.merge(df, city_used_days_df, on='city', how='left')

city_power_df = my_agg(df, 'city','power')
df = pd.merge(df, city_power_df, on='city', how='left')

city_kilometer_df = my_agg(df, 'city','kilometer')
df = pd.merge(df, city_kilometer_df, on='city', how='left')
# df.head()

### 处理model(高基类别特征-车型编码)

In [13]:
df['model_count'] = df.groupby('model')['SaleID'].transform('count')

model_price_df = my_agg(df[df['is_train'] == 1], 'model')
df = pd.merge(df, model_price_df, on='model', how='left')

model_used_days_df = my_agg(df, 'model','used_days')
df = pd.merge(df, model_price_df, on='model', how='left')

model_power_df = my_agg(df,'model','power')
df = pd.merge(df, model_power_df, on='model', how='left')

model_kilometer_df = my_agg(df,'model','kilometer')
df = pd.merge(df, model_power_df, on='model', how='left')

### 处理brand(高基类别特征40)

In [14]:
# 量
df['brand_count'] = df.groupby('brand')['SaleID'].transform('count')

brand_price_df = my_agg(df[df['is_train'] == 1], 'brand')
df = pd.merge(df, brand_price_df, on='brand', how='left')

brand_power_df = my_agg(df,'brand','power')
df = pd.merge(df, brand_power_df, on='brand', how='left')

brand_used_days_df = my_agg(df,'brand','used_days')
df = pd.merge(df, brand_used_days_df, on='brand', how='left')

brand_kilometer_df = my_agg(df,'brand','kilometer')
df = pd.merge(df, brand_kilometer_df, on='brand', how='left')

### 处理bodyType(可以onehot)

In [15]:
df['bodyType_count'] = df.groupby('bodyType')['SaleID'].transform('count')
bodyType_price_df = my_agg(df[df['is_train'] == 1], 'bodyType')
df = pd.merge(df, bodyType_price_df, on='bodyType', how='left')
tmp_df = pd.get_dummies(df['bodyType'], prefix='bodyType')
df = pd.concat([df, tmp_df], axis=1)

In [16]:
body_type_df = my_agg(df,'bodyType','power')
df = pd.merge(df, body_type_df, on='bodyType', how='left')
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,bodyType_4.0,bodyType_5.0,bodyType_6.0,bodyType_7.0,bodyType_power_min,bodyType_power_max,bodyType_power_mean,bodyType_power_median,bodyType_power_sum,bodyType_power_std
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0,0,0,0,0,600,64.922974,63,3456629,41.803383
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0,0,0,0,0,600,133.410395,136,5364432,59.687952
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0,0,0,0,0,600,64.922974,63,3456629,41.803383
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0,0,0,0,0,600,131.649488,122,7265077,68.81654
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0,0,0,0,0,600,64.922974,63,3456629,41.803383


### 处理fuelType

In [17]:
df['fuelType_count'] = df.groupby('fuelType')['SaleID'].transform('count')
fuelType_price_df = my_agg(df[df['is_train'] == 1], 'fuelType')
df = pd.merge(df, fuelType_price_df, on='fuelType', how='left')
tmp_df = pd.get_dummies(df['fuelType'], prefix='fuelType')
df = pd.concat([df, tmp_df], axis=1)
# df.head()

In [18]:
fuelType_power_df = my_agg(df,'fuelType','power')
df = pd.merge(df, fuelType_power_df, on='fuelType', how='left')
# df.head()

### 处理gearbox

In [19]:
df['gearbox_count'] = df.groupby('gearbox')['SaleID'].transform('count')
gearbox_price_df = my_agg(df[df['is_train'] == 1], 'gearbox')
df = pd.merge(df, gearbox_price_df, on='gearbox', how='left')
tmp_df = pd.get_dummies(df['gearbox'], prefix='gearbox')
df = pd.concat([df, tmp_df], axis=1)
# df.head()

In [20]:
gearbox_power_df = my_agg(df,'gearbox','power')
df = pd.merge(df, gearbox_power_df, on='gearbox', how='left')
# df.head()

### 处理kilometer

In [21]:
df['kilometer_count'] = df.groupby('kilometer')['SaleID'].transform('count')
kilometer_price_df = my_agg(df[df['is_train'] == 1], 'kilometer')
df = pd.merge(df, kilometer_price_df, on='kilometer', how='left')
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,gearbox_power_median,gearbox_power_sum,gearbox_power_std,kilometer_count,kilometer_price_min,kilometer_price_max,kilometer_price_mean,kilometer_price_median,kilometer_price_sum,kilometer_price_std
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,101,15937679,56.924362,21035,2.564949,11.383955,8.234097,8.318742,129456.475465,1.117207
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,101,15937679,56.924362,129120,2.397895,11.512915,7.700833,7.718685,746033.630873,1.097219
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,101,15937679,56.924362,21035,2.564949,11.383955,8.234097,8.318742,129456.475465,1.117207
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,170,7417361,85.000005,129120,2.397895,11.512915,7.700833,7.718685,746033.630873,1.097219
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,101,15937679,56.924362,4254,3.912023,11.512825,9.197718,9.287301,28917.625984,0.884359


In [22]:
# 这个特征还是挺重要的，继续挖掘下与重要匿名特征进行组合 0 3 6 12
for feat in ['v_0','v_3','v_6','v_12']:
    tmp_df = my_agg(df, 'kilometer',feat)
    df = pd.merge(df, tmp_df, on='kilometer', how='left')
    df[f'kilometer_add_{feat}'] = df['kilometer'] + df[feat]
    df[f'kilometer_minus_{feat}'] = df['kilometer'] - df[feat]
    df[f'kilometer_multiply_{feat}'] = df['kilometer'] * df[feat]
    df[f'kilometer_div_{feat}'] = df['kilometer'] / df[feat]

In [23]:
df.shape

(200000, 247)

### 处理notRepairedDamage

In [24]:
df['notRepairedDamage_count'] = df.groupby('notRepairedDamage')['SaleID'].transform('count')
notRepairedDamage_price_df = my_agg(df[df['is_train'] == 1], 'notRepairedDamage')
df = pd.merge(df, notRepairedDamage_price_df, on='notRepairedDamage', how='left')
tmp_df = pd.get_dummies(df['notRepairedDamage'], prefix='notRepairedDamage')
df = pd.concat([df, tmp_df], axis=1)
# df.head()

In [25]:
notRepairedDamage_power_df = my_agg(df,'notRepairedDamage','power')
df = pd.merge(df, notRepairedDamage_power_df, on='notRepairedDamage', how='left')
# df.head()

In [26]:
notRepairedDamage_kilometer_df = my_agg(df,'notRepairedDamage','kilometer')
df = pd.merge(df, notRepairedDamage_kilometer_df, on='notRepairedDamage', how='left')

In [27]:
for feat in ['v_0','v_3','v_6','v_12']:
    tmp_df = my_agg(df, 'notRepairedDamage',feat)
    df = pd.merge(df, tmp_df, on='notRepairedDamage', how='left')

### 匿名特征处理

In [28]:
# 根据分箱对租金进行进行特征组合
features_list = ['v_0', 'v_8', 'v_12', 'v_3']
for i in features_list:
    feature = f'box_{i}'
    df[f'box_{i}'] = pd.qcut(df[i], 20, duplicates='drop', labels=False)
    tmp_df = my_agg(df[df['is_train'] == 1], feature)
    df = pd.merge(df, tmp_df, on=feature, how='left')
    del df[f'box_{i}']
# df.head()

In [29]:
# 匿名特征之间的组合
for i in range(15):
    for j in range(i+1, 15):
        df[f'v_{i}_add_v_{j}'] = df[f'v_{i}']+df[f'v_{j}']
        df[f'v_{i}_minus_v_{j}'] = df[f'v_{i}']-df[f'v_{j}']
        df[f'v_{i}_multiply_v_{j}'] = df[f'v_{i}']*df[f'v_{j}']
        df[f'v_{i}_div_v_{j}'] = df[f'v_{i}']/df[f'v_{j}']
# df.head()

In [30]:
df_train = df[df['is_train']==1]
df_test = df[df['is_train']==0]
print(df_train.shape)
print(df_test.shape)

(150000, 736)
(50000, 736)


In [31]:
y_train = df_train['price']

enc = TargetEncoder(cols=['city','regionCode','model','brand'])

df_train = enc.fit_transform(df_train,y_train)
df_test = enc.transform(df_test)

In [32]:
# 删除无效特征
delete_features = ['SaleID','name','regDate','offerType','seller','bodyType','fuelType','gearbox','notRepairedDamage','creatDate','is_train']
for feature in delete_features:
    del df_train[feature]
    del df_test[feature]
del df_test['price']

### TODO 可以尝试非金额统计度量做组合特征

## 内存优化

In [33]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem/1024/1024))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem/1024/1024))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [34]:
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 811.82 MB
Memory usage after optimization is: 263.50 MB
Decreased by 67.5%
Memory usage of dataframe is 270.22 MB
Memory usage after optimization is: 85.16 MB
Decreased by 68.5%


## 特征选择

In [35]:
# y = df_train.pop('price').values
# X = df_train.values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2020)

In [36]:
# for index,column in enumerate(df_train.columns):
#     print(index,column)

In [37]:
# def my_score(preds, dtrain):
#     labels = dtrain.get_label()
#     return 'exp_score', mean_absolute_error(np.exp(labels),np.nan_to_num(np.exp(preds)))
# param_dist = {
#         "n_estimators": 200,
#         "objective": "reg:squarederror",
#         # "eval_metric": "mae",
#         "learning_rate": 0.3,  # [0.03~0.3]
#         "max_depth":6,
#         "early_stopping_rounds": 10,
#         "tree_method": "gpu_hist",
#         "gpu_id": 1
#     }
# clf = XGBRegressor(**param_dist)
# print('开始训练。。。')
# clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=my_score)
# # print(clf.best_score)
# # print(clf.best_iteration)
# # eval_result = clf.evals_result()
# # print(eval_result)
# clf.feature_importances_
# xgb.plot_importance(clf,importance_type='gain',max_num_features=20)
# clf.importance_type='gain'
# im = pd.DataFrame({'feature':df_train.columns,'importance':clf.feature_importances_})
# im=im.sort_values('importance',ascending=False)
# im.head(20)

In [38]:
# sns.barplot(y='feature',x='importance',data=im.head(30))

In [39]:
# y_pred = clf.predict(X_test)
# score = mean_absolute_error(np.nan_to_num(np.exp(y_test)),np.nan_to_num(np.exp(y_pred)))
# score

In [40]:
# im['bin_importance'] = pd.qcut(im['importance'],20,labels=False,duplicates='drop')
# im[im['bin_importance']==0]
# for bins in range(16):
#     features = list(im.loc[im['bin_importance']>=bins,'feature'].values)
#     print('======')
#     print(f'bins>={bins},特征数量:{len(features)}')
#     X = df_train[features].values
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2020)
#     clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric=my_score,verbose=0)
#     y_pred = clf.predict(X_test)
#     score = mean_absolute_error(np.nan_to_num(np.exp(y_test)),np.nan_to_num(np.exp(y_pred)))
#     print(score)
# features=list(im.loc[im['bin_importance']>=15,'feature'].values)
# features

## 输出

In [41]:
# df_train[features+['price']].to_csv('user_data/train_tree.csv',index=False)
# df_test[features].to_csv('user_data/test_tree.csv',index=False)

df_train.to_csv('user_data/train_tree.csv',index=False)
df_test.to_csv('user_data/test_tree.csv',index=False)
print('树模型特征工程已完成')

树模型特征工程已完成
