In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import sklearn
import xgboost as xgb
import featuretools as ft
import warnings
import os

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [3]:
os.listdir('./data')

['used_car_testA_20200313.csv',
 'used_car_train_20200313.csv',
 'used_car_sample_submit.csv']

In [68]:
train_data = pd.read_csv('./data/used_car_train_20200313.csv',sep=' ')
test_data = pd.read_csv('./data/used_car_testA_20200313.csv',sep=' ')
print(train_data.shape)
print(test_data.shape)

train_data['notRepairedDamage'] = train_data['notRepairedDamage'].replace({'-':np.nan}).astype(float)

(150000, 31)
(50000, 30)


In [14]:
train_data

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.251410,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.565330,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.274293,0.110300,0.121964,0.033395,0.000000,-4.509599,1.285940,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.228036,0.073205,0.091880,0.078819,0.121534,-1.896240,0.910783,0.931110,2.834518,1.923482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149995,163978,20000607,121.0,10,4.0,0.0,1.0,163,15.0,...,0.280264,0.000310,0.048441,0.071158,0.019174,1.988114,-2.983973,0.589167,-1.304370,-0.302592
149996,149996,184535,20091102,116.0,11,0.0,0.0,0.0,125,10.0,...,0.253217,0.000777,0.084079,0.099681,0.079371,1.839166,-2.774615,2.553994,0.924196,-0.272160
149997,149997,147587,20101003,60.0,11,1.0,1.0,0.0,90,6.0,...,0.233353,0.000705,0.118872,0.100118,0.097914,2.439812,-1.630677,2.290197,1.891922,0.414931
149998,149998,45907,20060312,34.0,10,3.0,1.0,0.0,156,15.0,...,0.256369,0.000252,0.081479,0.083558,0.081498,2.075380,-2.633719,1.414937,0.431981,-1.659014


In [22]:
basis_features_name = list(train_data.columns[1:])
basis_features_name.remove('price')

basis_features_name

['name',
 'regDate',
 'model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'power',
 'kilometer',
 'notRepairedDamage',
 'regionCode',
 'seller',
 'offerType',
 'creatDate',
 'v_0',
 'v_1',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14']

In [72]:
y_series = train_data['price']
x_series = train_data.drop('price', axis=1).iloc[:,1:]

X_train, X_test, y_train, y_test = train_test_split(x_series, y_series, test_size=0.3, shuffle=True)

params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'gamma': 0.3,
    'min_child_weight': 0.5,
    'max_depth': 6,
    'lamba': 10,
    'subsample': 0.7,
    'eta': 0.05,
    'seed': 0,
    'colsample_bytree': 1,
    'colsample_bylevel': 0.7,
}
train_dataset = xgb.DMatrix(X_train, label=y_train)
test_dataset = xgb.DMatrix(X_test)

watchlist = [(train_dataset, 'train')]

model = xgb.train(params, train_dataset, num_boost_round=30, evals=watchlist)

metrics.mean_absolute_error(y_test, model.predict(test_dataset))

[0]	train-mae:5616.27
[1]	train-mae:5336.46
[2]	train-mae:5070.98
[3]	train-mae:4818.4
[4]	train-mae:4579.17
[5]	train-mae:4351.75
[6]	train-mae:4135.87
[7]	train-mae:3931.93
[8]	train-mae:3738.68
[9]	train-mae:3555.42
[10]	train-mae:3381.88
[11]	train-mae:3217.97
[12]	train-mae:3062.81
[13]	train-mae:2916.45
[14]	train-mae:2777.76
[15]	train-mae:2646.61
[16]	train-mae:2523.11
[17]	train-mae:2407.13
[18]	train-mae:2297.04
[19]	train-mae:2192.75
[20]	train-mae:2094.75
[21]	train-mae:2002.06
[22]	train-mae:1914.19
[23]	train-mae:1832.72
[24]	train-mae:1756.19
[25]	train-mae:1683.98
[26]	train-mae:1616.58
[27]	train-mae:1553.72
[28]	train-mae:1494.63
[29]	train-mae:1438.43


1460.6669425842285

## 数据EDA，特征工程

In [73]:
train_data.head()

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,...,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,...,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,...,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,...,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,...,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482


In [75]:
train_data[['regDate', 'creatDate']]

Unnamed: 0,regDate,creatDate
0,20040402,20160404
1,20030301,20160309
2,20040403,20160402
3,19960908,20160312
4,20120103,20160313
...,...,...
149995,20000607,20160327
149996,20091102,20160312
149997,20101003,20160328
149998,20060312,20160401


In [77]:
train_data['creatDate'].sort_values()

52953     20150618
80759     20150807
46025     20150810
132382    20150904
98006     20150904
            ...   
78008     20160407
64637     20160407
679       20160407
39805     20160407
27586     20160407
Name: creatDate, Length: 150000, dtype: int64