## 数据导入

In [1]:
import pandas as pd

# 数据加载
train_data = pd.read_csv('./used_car_train_20200313.csv', sep=' ')
test = pd.read_csv("./used_car_testB_20200421.csv", sep = ' ')

In [2]:
# 输出结果格式设置
result = pd.DataFrame()
result['SaleID'] = test['SaleID']

## 异常值，缺失值处理

In [3]:
train_data['notRepairedDamage'].value_counts()

0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64

In [4]:
# 处理notRepairedDamage列异常值,数据取值0,1
train_data['notRepairedDamage'].replace('-', '0.0', inplace=True)
test['notRepairedDamage'].replace('-', '0.0', inplace=True)

In [5]:
# 处理power列异常值,发动机功率：范围 [ 0, 600 ]
train_data['power'][train_data['power']>600] = 600
test['power'][test['power']>600] = 600

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
print(train_data.isnull().sum())
print(test.isnull().sum())

SaleID                  0
name                    0
regDate                 0
model                   1
brand                   0
bodyType             4506
fuelType             8680
gearbox              5981
power                   0
kilometer               0
notRepairedDamage       0
regionCode              0
seller                  0
offerType               0
creatDate               0
price                   0
v_0                     0
v_1                     0
v_2                     0
v_3                     0
v_4                     0
v_5                     0
v_6                     0
v_7                     0
v_8                     0
v_9                     0
v_10                    0
v_11                    0
v_12                    0
v_13                    0
v_14                    0
dtype: int64
SaleID                  0
name                    0
regDate                 0
model                   0
brand                   0
bodyType             1504
fuelType             2924

In [7]:
# 缺失值处理，众数填充
train_data['model'].fillna(train_data['model'].mode()[0],inplace=True)
train_data['bodyType'].fillna(train_data['bodyType'].mode()[0],inplace=True)
test['bodyType'].fillna(test['bodyType'].mode()[0],inplace=True)
train_data['fuelType'].fillna(train_data['fuelType'].mode()[0],inplace=True)
test['fuelType'].fillna(test['fuelType'].mode()[0],inplace=True)
train_data['gearbox'].fillna(train_data['gearbox'].mode()[0],inplace=True)
test['gearbox'].fillna(test['gearbox'].mode()[0],inplace=True)

## 对日期格式进行处理

In [8]:
train_data_regDate_New = []
for i in train_data['regDate']:
    j = str(i)
    if j[4:6] == '00': i +=100
    train_data_regDate_New.append(i)
test_regDate_New = []
for i in test['regDate']:
    j = str(i)
    if j[4:6] == '00': i +=100
    test_regDate_New.append(i)
train_data['regDate'] = train_data_regDate_New
test['regDate'] = test_regDate_New

In [9]:
train_data['regDate'] = pd.to_datetime(train_data['regDate'],format='%Y%m%d',errors='coerce')
test['regDate'] = pd.to_datetime(test['regDate'],format='%Y%m%d',errors='coerce')
train_data['creatDate'] = pd.to_datetime(train_data['creatDate'],format='%Y%m%d',errors='coerce')
test['creatDate'] = pd.to_datetime(test['creatDate'],format='%Y%m%d',errors='coerce')

In [10]:
# 时间多尺度
train_data['regDate_year'] = train_data['regDate'].apply(lambda x: str(x)[0:4])
train_data['regDate_month'] = train_data['regDate'].apply(lambda x: str(x)[5:7])
train_data['regDate_day'] = train_data['regDate'].apply(lambda x: str(x)[8:10])
print(train_data[['regDate', 'regDate_year', 'regDate_month', 'regDate_day']])

test['regDate_year'] = test['regDate'].apply(lambda x: str(x)[0:4])
test['regDate_month'] = test['regDate'].apply(lambda x: str(x)[5:7])
test['regDate_day'] = test['regDate'].apply(lambda x: str(x)[8:10])
print(test[['regDate', 'regDate_year', 'regDate_month', 'regDate_day']])

train_data['creatDate_year'] = train_data['creatDate'].apply(lambda x: str(x)[0:4])
train_data['creatDate_month'] = train_data['creatDate'].apply(lambda x: str(x)[5:7])
train_data['creatDate_day'] = train_data['creatDate'].apply(lambda x: str(x)[8:10])
print(train_data[['creatDate', 'creatDate_year', 'creatDate_month', 'creatDate_day']])

test['creatDate_year'] = test['creatDate'].apply(lambda x: str(x)[0:4])
test['creatDate_month'] = test['creatDate'].apply(lambda x: str(x)[5:7])
test['creatDate_day'] = test['creatDate'].apply(lambda x: str(x)[8:10])
print(test[['creatDate', 'creatDate_year', 'creatDate_month', 'creatDate_day']])

          regDate regDate_year regDate_month regDate_day
0      2004-04-02         2004            04          02
1      2003-03-01         2003            03          01
2      2004-04-03         2004            04          03
3      1996-09-08         1996            09          08
4      2012-01-03         2012            01          03
...           ...          ...           ...         ...
149995 2000-06-07         2000            06          07
149996 2009-11-02         2009            11          02
149997 2010-10-03         2010            10          03
149998 2006-03-12         2006            03          12
149999 1999-02-04         1999            02          04

[150000 rows x 4 columns]
         regDate regDate_year regDate_month regDate_day
0     2000-05-01         2000            05          01
1     1995-02-11         1995            02          11
2     2009-06-06         2009            06          06
3     2002-06-01         2002            06          01
4     200

In [11]:
# 时间diff
train_data['regDate_diff'] = (train_data['regDate'] - train_data['regDate'].min()).dt.days
test['regDate_diff'] = (test['regDate'] - train_data['regDate'].min()).dt.days
train_data['creatDate_diff'] = (train_data['creatDate'] - train_data['creatDate'].min()).dt.days
test['creatDate_diff'] = (test['creatDate'] - train_data['creatDate'].min()).dt.days
print(train_data[['regDate', 'regDate_diff']])
print(test[['regDate', 'regDate_diff']])
print(train_data[['creatDate', 'creatDate_diff']])
print(test[['creatDate', 'creatDate_diff']])

          regDate  regDate_diff
0      2004-04-02          4840
1      2003-03-01          4442
2      2004-04-03          4841
3      1996-09-08          2077
4      2012-01-03          7672
...           ...           ...
149995 2000-06-07          3445
149996 2009-11-02          6880
149997 2010-10-03          7215
149998 2006-03-12          5549
149999 1999-02-04          2956

[150000 rows x 2 columns]
         regDate  regDate_diff
0     2000-05-01          3408
1     1995-02-11          1502
2     2009-06-06          6731
3     2002-06-01          4169
4     2003-03-01          4442
...          ...           ...
49995 2004-10-05          5026
49996 2013-04-09          8134
49997 2004-12-11          5093
49998 2002-07-02          4200
49999 2009-07-08          6763

[50000 rows x 2 columns]
        creatDate  creatDate_diff
0      2016-04-04             291
1      2016-03-09             265
2      2016-04-02             289
3      2016-03-12             268
4      2016-03-13    

In [12]:
print(train_data.isnull().sum())
print(test.isnull().sum())

SaleID               0
name                 0
regDate              0
model                0
brand                0
bodyType             0
fuelType             0
gearbox              0
power                0
kilometer            0
notRepairedDamage    0
regionCode           0
seller               0
offerType            0
creatDate            0
price                0
v_0                  0
v_1                  0
v_2                  0
v_3                  0
v_4                  0
v_5                  0
v_6                  0
v_7                  0
v_8                  0
v_9                  0
v_10                 0
v_11                 0
v_12                 0
v_13                 0
v_14                 0
regDate_year         0
regDate_month        0
regDate_day          0
creatDate_year       0
creatDate_month      0
creatDate_day        0
regDate_diff         0
creatDate_diff       0
dtype: int64
SaleID               0
name                 0
regDate              0
model                

## 增加新的特征，对brand进行统计

In [13]:
# 增加新的特征，对brand进行统计
# brand_amount, brand_price_max, brand_price_min, brand_price_median, brand_price_sum, brand_price_std, brand_price_mean
brand_data = train_data.groupby('brand')
all_info = {}
for brand_index, brand_temp in brand_data:
    info = {}
    brand_temp = brand_temp[brand_temp['price'] > 0]
    info['brand_amount'] = len(brand_temp)
    info['brand_price_max'] = brand_temp.price.max()
    info['brand_price_min'] = brand_temp.price.min()
    info['brand_price_median'] = brand_temp.price.median()
    info['brand_price_mean'] = brand_temp.price.mean()
    info['brand_price_sum'] = brand_temp.price.sum()
    info['brand_price_std'] = brand_temp.price.std()
    info['brand_price_ptp'] = info['brand_price_max'] - info['brand_price_min']
    all_info[brand_index] = info
all_info

{0: {'brand_amount': 31480,
  'brand_price_max': 68500,
  'brand_price_min': 13,
  'brand_price_median': 3199.0,
  'brand_price_mean': 5535.36499364676,
  'brand_price_sum': 174253290,
  'brand_price_std': 6272.522118575291,
  'brand_price_ptp': 68487},
 1: {'brand_amount': 13794,
  'brand_price_max': 99900,
  'brand_price_min': 15,
  'brand_price_median': 6499.0,
  'brand_price_mean': 9273.31194722343,
  'brand_price_sum': 127916065,
  'brand_price_std': 9369.631497175233,
  'brand_price_ptp': 99885},
 2: {'brand_amount': 321,
  'brand_price_max': 59800,
  'brand_price_min': 35,
  'brand_price_median': 7700.0,
  'brand_price_mean': 12037.822429906542,
  'brand_price_sum': 3864141,
  'brand_price_std': 10875.892890769717,
  'brand_price_ptp': 59765},
 3: {'brand_amount': 2461,
  'brand_price_max': 37500,
  'brand_price_min': 65,
  'brand_price_median': 4990.0,
  'brand_price_mean': 6482.822429906542,
  'brand_price_sum': 15954226,
  'brand_price_std': 5396.32750326748,
  'brand_price_p

In [14]:
brand_stats = pd.DataFrame(all_info).T.reset_index().rename(columns={'index':'brand'})
train_data = train_data.merge(brand_stats, how='left', on='brand')
test = test.merge(brand_stats, how='left', on='brand')

## 特征选择，删除['SaleID', 'name', 'seller', 'regDate', 'creatDate', 'offerType', 'price']列

In [15]:
# 特征选择
drop_cols = ['SaleID', 'name', 'seller', 'regDate', 'creatDate', 'offerType', 'price']
feature_cols = [col for col in train_data.columns if col not in drop_cols]
feature_cols

['model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'power',
 'kilometer',
 'notRepairedDamage',
 'regionCode',
 'v_0',
 'v_1',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14',
 'regDate_year',
 'regDate_month',
 'regDate_day',
 'creatDate_year',
 'creatDate_month',
 'creatDate_day',
 'regDate_diff',
 'creatDate_diff',
 'brand_amount',
 'brand_price_max',
 'brand_price_min',
 'brand_price_median',
 'brand_price_mean',
 'brand_price_sum',
 'brand_price_std',
 'brand_price_ptp']

## 数据格式转换

In [16]:
# 查看数值类型
numerical_cols = train_data.select_dtypes(exclude='object').columns
# 查看分类类型
categorical_cols = train_data.select_dtypes(include='object').columns
print(numerical_cols,categorical_cols)

Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer', 'regionCode', 'seller', 'offerType',
       'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6',
       'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14',
       'regDate_diff', 'creatDate_diff', 'brand_amount', 'brand_price_max',
       'brand_price_min', 'brand_price_median', 'brand_price_mean',
       'brand_price_sum', 'brand_price_std', 'brand_price_ptp'],
      dtype='object') Index(['notRepairedDamage', 'regDate_year', 'regDate_month', 'regDate_day',
       'creatDate_year', 'creatDate_month', 'creatDate_day'],
      dtype='object')


In [17]:
# 转换数据类型
import warnings
warnings.filterwarnings('ignore')
#X_data.info()
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].astype('float64')
test['notRepairedDamage'] = test['notRepairedDamage'].astype('float64')

In [18]:
train_data['regDate_year'] = train_data['regDate_year'].astype('int64')
test['regDate_year'] = test['regDate_year'].astype('int64')

train_data['regDate_month'] = train_data['regDate_month'].astype('int64')
test['regDate_month'] = test['regDate_month'].astype('int64')

train_data['regDate_day'] = train_data['regDate_day'].astype('int64')
test['regDate_day'] = test['regDate_day'].astype('int64')

train_data['creatDate_year'] = train_data['creatDate_year'].astype('int64')
test['creatDate_year'] = test['creatDate_year'].astype('int64')

train_data['creatDate_month'] = train_data['creatDate_month'].astype('int64')
test['creatDate_month'] = test['creatDate_month'].astype('int64')

train_data['creatDate_day'] = train_data['creatDate_day'].astype('int64')
test['creatDate_day'] = test['creatDate_day'].astype('int64')

In [19]:
# 查看数值类型
numerical_cols = train_data.select_dtypes(exclude='object').columns
# 查看分类类型
categorical_cols = train_data.select_dtypes(include='object').columns
print(numerical_cols,categorical_cols)

Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
       'seller', 'offerType', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
       'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'regDate_year', 'regDate_month', 'regDate_day',
       'creatDate_year', 'creatDate_month', 'creatDate_day', 'regDate_diff',
       'creatDate_diff', 'brand_amount', 'brand_price_max', 'brand_price_min',
       'brand_price_median', 'brand_price_mean', 'brand_price_sum',
       'brand_price_std', 'brand_price_ptp'],
      dtype='object') Index([], dtype='object')


## 创建模型

In [20]:
# 提取特征列
X_data = train_data[feature_cols]
Y_data = train_data['price']
X_test = test[feature_cols]

In [21]:
import numpy as np
# 定一个统计函数，用于统计某字段的特征
def show_stats(data):
    print('min: ', np.min(data))
    print('max: ', np.max(data))
    # ptp = max - min
    print('ptp: ', np.ptp(data))
    print('mean: ', np.mean(data))
    print('std: ', np.std(data))
    print('var: ', np.var(data))
# 查看price
show_stats(Y_data)

min:  11
max:  99999
ptp:  99988
mean:  5923.327333333334
std:  7501.973469876438
var:  56279605.94272992


#### 5折交叉验证+XGBoost

In [25]:
# 具体几个子模型，可以测试，可以试一下10折
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
def ensemble_model(clf, train_x, train_y, test, n):
    # 采用五折交叉验证
    sk = StratifiedKFold(n_splits=n, shuffle=True, random_state=2021)
    mean_mae = 0
    result = []
    for k, (train_index, val_index) in enumerate(sk.split(train_x, train_y)):
        # 使用sk，得到训练集，验证集
        train_x_real = train_x.iloc[train_index]
        train_y_real = train_y.iloc[train_index]
        val_x = train_x.iloc[val_index]
        val_y = train_y.iloc[val_index]
        # 子模型训练
        clf = clf.fit(train_x_real, train_y_real)
        val_y_pred = clf.predict(val_x)
        # 子模型评估
        mae_val = mean_absolute_error(val_y,val_y_pred)
        print(f'第{k+1}个子模型MAE{mae_val}')
        mean_mae += mae_val/n
        # 使用子模型对测试集进行预测
        test_y_pred = clf.predict(X_test)
        result.append(test_y_pred)
    print(mean_mae)
    # 最终结果 = 5个子模型的平均值
    mean_result = sum(result) / n
    return mean_result

In [70]:
import xgboost as xgb
# 祖传参数
model_xgb = xgb.XGBRegressor(
        max_depth=6, learning_rate=0.1, n_estimators=15000, 
        early_stopping_rounds=200,
        objective='reg:linear', 
        tree_method='gpu_hist',
        subsample=0.8, colsample_bytree=0.8, 
        min_child_samples=3, eval_metric='mae', reg_lambda=0.5
    )

# model_xgb.fit(X_data, Y_data)
y_pred_xgb = ensemble_model(model_xgb, X_data, Y_data, X_test, 10)

Parameters: { "early_stopping_rounds", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


第1个子模型MAE498.88858030522266
Parameters: { "early_stopping_rounds", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


第2个子模型MAE513.8890718258699
Parameters: { "early_stopping_rounds", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find 

In [71]:
show_stats(y_pred_xgb)

min:  -710.5499
max:  90757.6
ptp:  91468.15
mean:  5912.983
std:  7388.3213
var:  54587290.0


In [72]:
y_pred_xgb[y_pred_xgb<0] = 11

#### nn

In [73]:
# 数据归一化
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
x = mm.fit_transform(X_data)
x_test = mm.transform(X_test)
y = Y_data.values

In [74]:
from tensorflow import keras

model_nn = keras.Sequential([
    # 250维度，定义向量的长度
    keras.layers.Dense(320,activation='relu',input_shape=[len(feature_cols)]),
    keras.layers.Dense(160,activation='relu'),
    keras.layers.Dense(80,activation='relu'),
    # 最终输出结果是price
    keras.layers.Dense(1)
])
# 设置优化器
model_nn.compile(loss='mean_absolute_error', optimizer='Adam')
# 模型训练
model_nn.fit(x, y, batch_size=2048, epochs=300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 16

Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


<tensorflow.python.keras.callbacks.History at 0x1798d349c88>

In [84]:
model_nn.fit(x, y, batch_size=2048, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x17984dfab08>

In [94]:
model_nn.fit(x, y, batch_size=2048, epochs=600)

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

Epoch 195/600
Epoch 196/600
Epoch 197/600
Epoch 198/600
Epoch 199/600
Epoch 200/600
Epoch 201/600
Epoch 202/600
Epoch 203/600
Epoch 204/600
Epoch 205/600
Epoch 206/600
Epoch 207/600
Epoch 208/600
Epoch 209/600
Epoch 210/600
Epoch 211/600
Epoch 212/600
Epoch 213/600
Epoch 214/600
Epoch 215/600
Epoch 216/600
Epoch 217/600
Epoch 218/600
Epoch 219/600
Epoch 220/600
Epoch 221/600
Epoch 222/600
Epoch 223/600
Epoch 224/600
Epoch 225/600
Epoch 226/600
Epoch 227/600
Epoch 228/600
Epoch 229/600
Epoch 230/600
Epoch 231/600
Epoch 232/600
Epoch 233/600
Epoch 234/600
Epoch 235/600
Epoch 236/600
Epoch 237/600
Epoch 238/600
Epoch 239/600
Epoch 240/600
Epoch 241/600
Epoch 242/600
Epoch 243/600
Epoch 244/600
Epoch 245/600
Epoch 246/600
Epoch 247/600
Epoch 248/600
Epoch 249/600
Epoch 250/600
Epoch 251/600
Epoch 252/600
Epoch 253/600
Epoch 254/600
Epoch 255/600
Epoch 256/600
Epoch 257/600
Epoch 258/600
Epoch 259/600
Epoch 260/600
Epoch 261/600
Epoch 262/600
Epoch 263/600
Epoch 264/600
Epoch 265/600
Epoch 

Epoch 386/600
Epoch 387/600
Epoch 388/600
Epoch 389/600
Epoch 390/600
Epoch 391/600
Epoch 392/600
Epoch 393/600
Epoch 394/600
Epoch 395/600
Epoch 396/600
Epoch 397/600
Epoch 398/600
Epoch 399/600
Epoch 400/600
Epoch 401/600
Epoch 402/600
Epoch 403/600
Epoch 404/600
Epoch 405/600
Epoch 406/600
Epoch 407/600
Epoch 408/600
Epoch 409/600
Epoch 410/600
Epoch 411/600
Epoch 412/600
Epoch 413/600
Epoch 414/600
Epoch 415/600
Epoch 416/600
Epoch 417/600
Epoch 418/600
Epoch 419/600
Epoch 420/600
Epoch 421/600
Epoch 422/600
Epoch 423/600
Epoch 424/600
Epoch 425/600
Epoch 426/600
Epoch 427/600
Epoch 428/600
Epoch 429/600
Epoch 430/600
Epoch 431/600
Epoch 432/600
Epoch 433/600
Epoch 434/600
Epoch 435/600
Epoch 436/600
Epoch 437/600
Epoch 438/600
Epoch 439/600
Epoch 440/600
Epoch 441/600
Epoch 442/600
Epoch 443/600
Epoch 444/600
Epoch 445/600
Epoch 446/600
Epoch 447/600
Epoch 448/600
Epoch 449/600
Epoch 450/600
Epoch 451/600
Epoch 452/600
Epoch 453/600
Epoch 454/600
Epoch 455/600
Epoch 456/600
Epoch 

Epoch 575/600
Epoch 576/600
Epoch 577/600
Epoch 578/600
Epoch 579/600
Epoch 580/600
Epoch 581/600
Epoch 582/600
Epoch 583/600
Epoch 584/600
Epoch 585/600
Epoch 586/600
Epoch 587/600
Epoch 588/600
Epoch 589/600
Epoch 590/600
Epoch 591/600
Epoch 592/600
Epoch 593/600
Epoch 594/600
Epoch 595/600
Epoch 596/600
Epoch 597/600
Epoch 598/600
Epoch 599/600
Epoch 600/600


<tensorflow.python.keras.callbacks.History at 0x17984fb7888>

In [95]:
y_pred_nn = model_nn.predict(x_test)
y_pred_nn

array([[1246.5139],
       [1942.1127],
       [8915.489 ],
       ...,
       [5544.947 ],
       [4789.205 ],
       [5694.6562]], dtype=float32)

In [96]:
y_pred_nn = y_pred_nn.reshape(1,-1)
y_pred_nn = y_pred_nn[0]

In [97]:
y_pred_nn[y_pred_nn<0] = 11

## 模型融合

y_pred_xgb, y_pred_lgb, y_pred_cbt, y_pred_nn

In [98]:
y_pred = []
for i in range(50000):
    j = (y_pred_xgb[i]+y_pred_nn[i])/2
    y_pred.append(j)
print(y_pred) # 451

[1257.730712890625, 1960.0986328125, 8812.064453125, 1168.158447265625, 1989.9970703125, 1078.420166015625, 458.47540283203125, 3621.63818359375, 10304.912109375, 609.2015380859375, 662.297119140625, 2708.697021484375, 5875.0419921875, 7795.9951171875, 1409.6591796875, 250.721435546875, 1725.021484375, 8381.9599609375, 6839.669921875, 1045.694091796875, 24936.171875, 7808.96240234375, 1851.333740234375, 1527.9423828125, 812.142578125, 7451.1904296875, 478.95892333984375, 12285.9365234375, 4011.57373046875, 7861.046875, 18298.263671875, 13071.830078125, 622.5595703125, 14864.16796875, 941.2418212890625, 1981.82373046875, 14009.716796875, 5789.3857421875, 1566.963623046875, 342.4539489746094, 584.7427978515625, 27564.681640625, 453.5389099121094, 7407.17919921875, 18267.873046875, 494.88201904296875, 16153.0380859375, 17781.28515625, 2301.32861328125, 3049.9912109375, 564.8944091796875, 4409.72509765625, 2751.3779296875, 518.2325439453125, 12896.6875, 10716.09375, 5577.74609375, 3373.923

## 输出结果

In [99]:
y_pred = np.array(y_pred)

In [100]:
y_pred = y_pred.astype('int64')
y_pred

array([1257, 1960, 8812, ..., 5610, 4756, 5644], dtype=int64)

In [101]:
y_pred[y_pred<0] = 11

In [102]:
result['price'] = y_pred
result

Unnamed: 0,SaleID,price
0,200000,1257
1,200001,1960
2,200002,8812
3,200003,1168
4,200004,1989
...,...,...
49995,249995,6899
49996,249996,18541
49997,249997,5610
49998,249998,4756


In [103]:
result.to_csv('./optimize10.csv', index=False)

## 对比上一版本，加入了10折交叉验证，且模型融合仅融合了XGBoost和nn，分数：434分