In [34]:
#coding:utf-8
#导入warnings包，利用过滤器来实现忽略警告语句。
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from category_encoders import TargetEncoder

## 加载数据

In [2]:
df_train = pd.read_csv('./data/used_car_train_20200313.csv', sep=' ')
df_train['is_train'] = 1
df_test = pd.read_csv('./data/used_car_testA_20200313.csv', sep=' ')
df_test['is_train'] = 0

## 删除无用特征

In [3]:
# 删除无效特征
delete_features = ['offerType','seller']
for feature in delete_features:
    del df_train[feature]
    del df_test[feature]

In [4]:
date_feature = ['regDate', 'creatDate']

def clean_error_month(x):
    '''
    清洗日期中月份出现错误的数据 将00->01
    '''
    x = str(x)
    if len(x) != 8:
        return pd.NaT
    if x[4:6] == '00':
        x = x[:4]+'01'+x[6:]
    return x


for feature in date_feature:
    df_train[feature] = df_train[feature].apply(clean_error_month)
    df_test[feature] = df_test[feature].apply(clean_error_month)

## 缺失值按众数填充(暂时先不填充，由算法自己来填充)

In [5]:
df_train['model'] = df_train['model'].fillna(0)
df_train['bodyType'] = df_train['bodyType'].fillna(0)
df_train['fuelType'] = df_train['fuelType'].fillna(0)
df_train['gearbox'] = df_train['gearbox'].fillna(0)
df_train['power'] = df_train['power'].where(df_train['power'] <= 600, 600)
df_train['notRepairedDamage'] = df_train['notRepairedDamage'].astype('str').apply(
    lambda x: x if x != '-' else None).astype('float')


df_test['bodyType'] = df_test['bodyType'].fillna(0)
df_test['fuelType'] = df_test['fuelType'].fillna(0)
df_test['gearbox'] = df_test['gearbox'].fillna(0)
df_test['power'] = df_test['power'].where(df_test['power'] <= 600, 600)
df_test['notRepairedDamage'] = df_test['notRepairedDamage'].astype('str').apply(
    lambda x: x if x != '-' else None).astype('float')

## 删除异常数据，注意只能删除训练集的数据，测试集数据是不能删除的

In [6]:
# 使用【Q1-1.5IQR,Q3+1.5IQR】
# 数值型变量
numeric_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4',
                    'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']

# 类别型变量
categorical_features = ['name', 'model', 'brand', 'bodyType',
                        'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode']
# 日期型变量
date_features = ['regDate', 'creatDate']

### 异常值是去除了 但是泛化能力如何未知

In [7]:
# 异常数据处理
rules = []
for feature in numeric_features:
    Q1 = df_train[feature].quantile(0.25)
    Q3 = df_train[feature].quantile(0.75)
    IQR = Q3-Q1
    min_border = Q1-3*IQR
    max_border = Q3+3*IQR
    rules.append((feature,min_border,max_border))

for rule in rules:
    feature,min_border,max_border = rule
    df_train = df_train[(df_train[feature]<=max_border)&(df_train[feature]>=min_border)]
print(rule)
print(df_train.shape)

('v_14', -3.78926889659761, 4.0326133029067375)
(134288, 30)


In [8]:
# 合并数据集
df = pd.concat([df_train, df_test],axis=0)
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,is_train
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762,1
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522,1
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963,1
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699,1
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482,1


## 特征构建

### price做对数变换

In [9]:
df['price'] = np.log(df['price'])
y_train = df.loc[df['is_train']==1,'price']

### name字段处理（高基类别特征 使用target encoder进行编码）

In [10]:
df['name_count'] = df.groupby('name')['SaleID'].transform('count')

In [11]:
def my_agg(df, dim, measure='price'):
    '''
    按照指定维度聚合并重命名,暂时维度只有1
    '''
    index = ['min', 'max', 'mean', 'median','sum', 'std', 'kurt', 'skew', 'mad']
    new_columns = [dim]+list(map(lambda x: f'{dim}_{measure}_{x}', index))
    new_df = df.groupby('model').agg({
        'price': ['min', 'max', 'mean', 'median', 'sum', 'std', pd.DataFrame.kurt, pd.DataFrame.skew, pd.DataFrame.mad]
    }).reset_index()
    new_df.columns = new_columns
    return new_df

### 处理城市编码(高基类别特征 使用target encoder进行编码）

In [12]:
df['city_code'] = df['regionCode'].apply(lambda x: str(x)[:2])
df['city_count'] = df.groupby('city_code')['SaleID'].transform('count')
df['region_count'] = df.groupby('regionCode')['SaleID'].transform('count')

### 处理model(高基类别特征-车型编码)

In [13]:
# 量
df['model_count'] = df.groupby('model')['SaleID'].transform('count')

In [14]:
model_price_df = my_agg(df[df['is_train'] == 1], 'model')
model_price_df.head()

Unnamed: 0,model,model_price_min,model_price_max,model_price_mean,model_price_median,model_price_sum,model_price_std,model_price_kurt,model_price_skew,model_price_mad
0,0.0,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622
1,1.0,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.4153,0.036641,0.982058
2,2.0,6.214608,10.657259,8.676237,8.699515,2316.555241,0.845278,0.037475,-0.145091,0.668872
3,3.0,5.703782,9.375855,8.043157,8.058327,6635.604518,0.606403,0.30129,-0.406535,0.487724
4,4.0,3.850148,10.75257,8.247872,8.268732,66527.334309,1.019115,-0.194181,-0.338488,0.823811


In [15]:
df = pd.merge(df, model_price_df, on='model', how='left')
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,model_count,model_price_min,model_price_max,model_price_mean,model_price_median,model_price_sum,model_price_std,model_price_kurt,model_price_skew,model_price_mad
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,2872,3.401197,9.564512,7.238523,7.31322,14976.504969,1.099224,-0.462727,-0.33923,0.913094
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,5718,4.60517,10.755773,8.347556,8.486734,35969.619196,1.013267,-0.464128,-0.467438,0.842701
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,1095,6.50129,10.462818,9.035968,9.053687,6984.803082,0.518705,0.005399,-0.341823,0.425525
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,443,6.906755,11.220298,8.948898,8.853665,2908.391696,0.823441,-0.665132,0.152431,0.69441
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,656,3.912023,8.681011,6.858144,6.791221,3333.058196,0.973839,-0.405394,-0.141099,0.78547


### 处理brand(高基类别特征40)

In [16]:
# 量
df['brand_count'] = df.groupby('brand')['SaleID'].transform('count')

In [17]:
def my_agg(df, dim, measure='price'):
    '''
    按照指定维度聚合并重命名,暂时维度只有1
    '''
    index = ['min', 'max', 'mean', 'median','sum', 'std', 'kurt', 'skew', 'mad']
    new_columns = [dim]+list(map(lambda x: f'{dim}_{measure}_{x}', index))
    new_df = df.groupby('model').agg({
        'price': ['min', 'max', 'mean', 'median', 'sum', 'std', pd.DataFrame.kurt, pd.DataFrame.skew, pd.DataFrame.mad]
    }).reset_index()
    new_df.columns = new_columns
    return new_df


brand_price_df = my_agg(df[df['is_train'] == 1], 'brand')
brand_price_df.head()

Unnamed: 0,brand,brand_price_min,brand_price_max,brand_price_mean,brand_price_median,brand_price_sum,brand_price_std,brand_price_kurt,brand_price_skew,brand_price_mad
0,0.0,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622
1,1.0,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.4153,0.036641,0.982058
2,2.0,6.214608,10.657259,8.676237,8.699515,2316.555241,0.845278,0.037475,-0.145091,0.668872
3,3.0,5.703782,9.375855,8.043157,8.058327,6635.604518,0.606403,0.30129,-0.406535,0.487724
4,4.0,3.850148,10.75257,8.247872,8.268732,66527.334309,1.019115,-0.194181,-0.338488,0.823811


In [18]:
df = pd.merge(df, brand_price_df, on='brand', how='left')
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,brand_count,brand_price_min,brand_price_max,brand_price_mean,brand_price_median,brand_price_sum,brand_price_std,brand_price_kurt,brand_price_skew,brand_price_mad
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,12727,6.214608,9.642123,8.469141,8.38936,2278.198804,0.543652,0.630015,-0.090508,0.428179
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,16994,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.4153,0.036641,0.982058
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,1665,5.298317,9.903438,7.535615,7.438384,3556.810392,1.000242,-1.021809,0.103182,0.86317
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,17760,3.912023,9.159047,6.843343,6.897705,12420.668343,0.848594,0.578729,-0.239942,0.640245
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,5861,4.382027,9.581214,7.718601,7.803843,14827.433042,0.785844,0.003021,-0.414304,0.638729


### 处理bodyType(可以onehot)

In [19]:
bodyType_price_df = my_agg(df[df['is_train'] == 1], 'bodyType')
bodyType_price_df.head()

Unnamed: 0,bodyType,bodyType_price_min,bodyType_price_max,bodyType_price_mean,bodyType_price_median,bodyType_price_sum,bodyType_price_std,bodyType_price_kurt,bodyType_price_skew,bodyType_price_mad
0,0.0,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622
1,1.0,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.4153,0.036641,0.982058
2,2.0,6.214608,10.657259,8.676237,8.699515,2316.555241,0.845278,0.037475,-0.145091,0.668872
3,3.0,5.703782,9.375855,8.043157,8.058327,6635.604518,0.606403,0.30129,-0.406535,0.487724
4,4.0,3.850148,10.75257,8.247872,8.268732,66527.334309,1.019115,-0.194181,-0.338488,0.823811


In [20]:
df = pd.merge(df, bodyType_price_df, on='bodyType', how='left')
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,brand_price_mad,bodyType_price_min,bodyType_price_max,bodyType_price_mean,bodyType_price_median,bodyType_price_sum,bodyType_price_std,bodyType_price_kurt,bodyType_price_skew,bodyType_price_mad
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.428179,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.4153,0.036641,0.982058
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.982058,6.214608,10.657259,8.676237,8.699515,2316.555241,0.845278,0.037475,-0.145091,0.668872
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.86317,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.4153,0.036641,0.982058
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.640245,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.638729,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.4153,0.036641,0.982058


In [21]:
tmp_df = pd.get_dummies(df['bodyType'], prefix='bodyType')
df = pd.concat([df, tmp_df], axis=1)
# 记得删除bodyType字段
# del df['bodyType']

In [22]:
# df.bodyType.value_counts()

In [23]:
# df.columns

In [24]:
# cate_features = ['model','brand','bodyType','fuelType','gearbox','kilometer','notRepairedDamage','regionCode','city_code']
# for feature in cate_features:
#     df[feature] = df[feature].astype('category')
#     tmp_df = my_agg(df[df['is_train']==1],feature,'price')
#     df = pd.merge(df, tmp_df, on=feature, how='left')
#     print(tmp_df.head())
#     break

### 处理fuelType

In [25]:
fuelType_price_df = my_agg(df[df['is_train'] == 1], 'fuelType')
fuelType_price_df.head()
# df.fuelType.value_counts()
df = pd.merge(df, fuelType_price_df, on='fuelType', how='left')
tmp_df = pd.get_dummies(df['fuelType'], prefix='fuelType')
df = pd.concat([df, tmp_df], axis=1)

# del df['fuelType']
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,fuelType_price_kurt,fuelType_price_skew,fuelType_price_mad,fuelType_0.0,fuelType_1.0,fuelType_2.0,fuelType_3.0,fuelType_4.0,fuelType_5.0,fuelType_6.0
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,-0.368386,-0.25478,0.882622,1,0,0,0,0,0,0
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,-0.368386,-0.25478,0.882622,1,0,0,0,0,0,0
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,-0.368386,-0.25478,0.882622,1,0,0,0,0,0,0
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,-0.368386,-0.25478,0.882622,1,0,0,0,0,0,0
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,-0.368386,-0.25478,0.882622,1,0,0,0,0,0,0


### 处理gearbox

In [26]:
gearbox_price_df = my_agg(df[df['is_train'] == 1], 'gearbox')
gearbox_price_df.head()
# df.fuelType.value_counts()
df = pd.merge(df, gearbox_price_df, on='gearbox', how='left')
tmp_df = pd.get_dummies(df['gearbox'], prefix='gearbox')
df = pd.concat([df, tmp_df], axis=1)

# del df['gearbox']
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,gearbox_price_max,gearbox_price_mean,gearbox_price_median,gearbox_price_sum,gearbox_price_std,gearbox_price_kurt,gearbox_price_skew,gearbox_price_mad,gearbox_0.0,gearbox_1.0
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622,1,0
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622,1,0
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622,1,0
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.4153,0.036641,0.982058,0,1
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622,1,0


### 处理kilometer

In [27]:
kilometer_price_df = my_agg(df[df['is_train'] == 1], 'kilometer')
df = pd.merge(df, kilometer_price_df, on='kilometer', how='left')

# del df['kilometer']
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,gearbox_1.0,kilometer_price_min,kilometer_price_max,kilometer_price_mean,kilometer_price_median,kilometer_price_sum,kilometer_price_std,kilometer_price_kurt,kilometer_price_skew,kilometer_price_mad
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0,,,,,,,,,
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0,5.298317,9.903438,7.535615,7.438384,3556.810392,1.000242,-1.021809,0.103182,0.86317
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0,,,,,,,,,
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,1,5.298317,9.903438,7.535615,7.438384,3556.810392,1.000242,-1.021809,0.103182,0.86317
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0,4.382027,9.581214,7.718601,7.803843,14827.433042,0.785844,0.003021,-0.414304,0.638729


### 处理notRepairedDamage

In [28]:
# df.kilometer.value_counts()

notRepairedDamage_price_df = my_agg(df[df['is_train'] == 1], 'notRepairedDamage')
df = pd.merge(df, notRepairedDamage_price_df, on='notRepairedDamage', how='left')

# del df['kilometer']
df.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,kilometer_price_mad,notRepairedDamage_price_min,notRepairedDamage_price_max,notRepairedDamage_price_mean,notRepairedDamage_price_median,notRepairedDamage_price_sum,notRepairedDamage_price_std,notRepairedDamage_price_kurt,notRepairedDamage_price_skew,notRepairedDamage_price_mad
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.86317,,,,,,,,,
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.86317,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.638729,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622


### 处理时间特征

In [29]:
df['regDate_year'] = pd.to_datetime(df['regDate'], format='%Y%m%d').dt.year
df['regDate_month'] = pd.to_datetime(df['regDate'], format='%Y%m%d').dt.month
df['regDate_date'] = pd.to_datetime(df['regDate'], format='%Y%m%d').dt.day

df['creatDate_year'] = pd.to_datetime(df['creatDate'], format='%Y%m%d').dt.year
df['creatDate_month'] = pd.to_datetime(
    df['creatDate'], format='%Y%m%d').dt.month
df['creatDate_date'] = pd.to_datetime(df['creatDate'], format='%Y%m%d').dt.day


df['used_days'] = (pd.to_datetime(df['creatDate'], format='%Y%m%d') -
                   pd.to_datetime(df['regDate'], format='%Y%m%d')).dt.days
df['used_years'] = round(df['used_days']/365, 1)

df['kilometer_div_years'] = df['kilometer']/df['used_years']
df['kilometer_div_days'] = df['kilometer']/df['used_days']

# 删除注册日期和上线日期
del df['regDate']
del df['creatDate']

In [31]:
# 对使用天数进行分箱
df['use_days_bin_20'] = pd.qcut(df['used_days'], 20, labels=False)
use_days_bin_20_price_df = my_agg(df[df['is_train'] == 1], 'use_days_bin_20')
df = pd.merge(df, use_days_bin_20_price_df, on='use_days_bin_20', how='left')
df.head()

Unnamed: 0,SaleID,name,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,...,use_days_bin_20,use_days_bin_20_price_min,use_days_bin_20_price_max,use_days_bin_20_price_mean,use_days_bin_20_price_median,use_days_bin_20_price_sum,use_days_bin_20_price_std,use_days_bin_20_price_kurt,use_days_bin_20_price_skew,use_days_bin_20_price_mad
0,0,736,30.0,6,1.0,0.0,0.0,60,12.5,0.0,...,9,7.31322,10.56875,9.233681,9.384294,406.281967,0.603896,1.472683,-0.710294,0.456442
1,1,2262,40.0,1,2.0,0.0,0.0,0,15.0,,...,10,3.912023,9.159047,6.843343,6.897705,12420.668343,0.848594,0.578729,-0.239942,0.640245
2,2,14874,115.0,15,1.0,0.0,0.0,163,12.5,0.0,...,9,7.31322,10.56875,9.233681,9.384294,406.281967,0.603896,1.472683,-0.710294,0.456442
3,3,71865,109.0,10,0.0,0.0,1.0,193,15.0,0.0,...,18,5.298317,9.409191,7.010394,6.684612,245.363779,0.956759,-0.210759,0.69198,0.800504
4,4,111080,110.0,5,1.0,0.0,0.0,68,5.0,0.0,...,0,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.25478,0.882622


In [32]:
df[df['is_train']==1]

Unnamed: 0,SaleID,name,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,...,use_days_bin_20,use_days_bin_20_price_min,use_days_bin_20_price_max,use_days_bin_20_price_mean,use_days_bin_20_price_median,use_days_bin_20_price_sum,use_days_bin_20_price_std,use_days_bin_20_price_kurt,use_days_bin_20_price_skew,use_days_bin_20_price_mad
0,0,736,30.0,6,1.0,0.0,0.0,60,12.5,0.0,...,9,7.313220,10.568750,9.233681,9.384294,406.281967,0.603896,1.472683,-0.710294,0.456442
1,1,2262,40.0,1,2.0,0.0,0.0,0,15.0,,...,10,3.912023,9.159047,6.843343,6.897705,12420.668343,0.848594,0.578729,-0.239942,0.640245
2,2,14874,115.0,15,1.0,0.0,0.0,163,12.5,0.0,...,9,7.313220,10.568750,9.233681,9.384294,406.281967,0.603896,1.472683,-0.710294,0.456442
3,3,71865,109.0,10,0.0,0.0,1.0,193,15.0,0.0,...,18,5.298317,9.409191,7.010394,6.684612,245.363779,0.956759,-0.210759,0.691980,0.800504
4,4,111080,110.0,5,1.0,0.0,0.0,68,5.0,0.0,...,0,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.254780,0.882622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134283,149995,163978,121.0,10,4.0,0.0,1.0,163,15.0,0.0,...,13,3.401197,10.799576,8.585904,8.732305,31141.072897,0.979082,-0.411823,-0.368708,0.817268
134284,149996,184535,116.0,11,0.0,0.0,0.0,125,10.0,0.0,...,2,6.214608,10.657259,8.676237,8.699515,2316.555241,0.845278,0.037475,-0.145091,0.668872
134285,149997,147587,60.0,11,1.0,1.0,0.0,90,6.0,0.0,...,1,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.415300,0.036641,0.982058
134286,149998,45907,34.0,10,3.0,1.0,0.0,156,15.0,0.0,...,6,6.214608,9.642123,8.469141,8.389360,2278.198804,0.543652,0.630015,-0.090508,0.428179


### 高基编码(最后去做)

In [37]:
enc = TargetEncoder(cols=['name','city_code','model','brand','bodyType'])
X_train = enc.fit_transform(df[df['is_train']==1],y_train)
X_test = enc.transform(df[df['is_train']==0])

Unnamed: 0,SaleID,name,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,...,use_days_bin_20,use_days_bin_20_price_min,use_days_bin_20_price_max,use_days_bin_20_price_mean,use_days_bin_20_price_median,use_days_bin_20_price_sum,use_days_bin_20_price_std,use_days_bin_20_price_kurt,use_days_bin_20_price_skew,use_days_bin_20_price_mad
0,0,7.957690,7.985997,7.977203,7.976056,0.0,0.0,60,12.5,0.0,...,9,7.313220,10.568750,9.233681,9.384294,406.281967,0.603896,1.472683,-0.710294,0.456442
1,1,8.088887,7.983751,7.996634,7.983349,0.0,0.0,0,15.0,,...,10,3.912023,9.159047,6.843343,6.897705,12420.668343,0.848594,0.578729,-0.239942,0.640245
2,2,7.979443,7.968381,7.953839,7.976056,0.0,0.0,163,12.5,0.0,...,9,7.313220,10.568750,9.233681,9.384294,406.281967,0.603896,1.472683,-0.710294,0.456442
3,3,8.161672,8.057116,7.982382,7.977155,0.0,1.0,193,15.0,0.0,...,18,5.298317,9.409191,7.010394,6.684612,245.363779,0.956759,-0.210759,0.691980,0.800504
4,4,7.979443,8.001770,8.018759,7.976056,0.0,0.0,68,5.0,0.0,...,0,3.401197,10.434086,7.891917,7.901007,82707.285816,1.072218,-0.368386,-0.254780,0.882622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134283,149995,7.979443,7.941730,7.982382,7.990113,0.0,1.0,163,15.0,0.0,...,13,3.401197,10.799576,8.585904,8.732305,31141.072897,0.979082,-0.411823,-0.368708,0.817268
134284,149996,7.979443,7.937312,7.963616,7.977155,0.0,0.0,125,10.0,0.0,...,2,6.214608,10.657259,8.676237,8.699515,2316.555241,0.845278,0.037475,-0.145091,0.668872
134285,149997,,7.929357,7.963616,7.976056,1.0,0.0,90,6.0,0.0,...,1,3.401197,11.512915,7.518231,7.467371,34456.052465,1.203831,-0.415300,0.036641,0.982058
134286,149998,7.979443,8.077360,7.982382,7.982597,1.0,0.0,156,15.0,0.0,...,6,6.214608,9.642123,8.469141,8.389360,2278.198804,0.543652,0.630015,-0.090508,0.428179
