In [1]:
import pandas as pd

##### 加载数据

In [2]:
train_data = pd.read_csv('used_car_train_20200313.csv', sep = ' ')
test_data = pd.read_csv('used_car_testB_20200421.csv', sep = ' ')

##### 数据清洗

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             150000 non-null  int64  
 1   name               150000 non-null  int64  
 2   regDate            150000 non-null  int64  
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64  
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64  
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object 
 11  regionCode         150000 non-null  int64  
 12  seller             150000 non-null  int64  
 13  offerType          150000 non-null  int64  
 14  creatDate          150000 non-null  int64  
 15  price              150000 non-null  int64  
 16  v_

In [4]:
train_data['notRepairedDamage'].value_counts()

0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64

##### 因为有-值，所以对它进行替换

In [5]:
train_data['notRepairedDamage'].replace('-','0.0',inplace=True)

In [6]:
train_data['notRepairedDamage'].value_counts()

0.0    135685
1.0     14315
Name: notRepairedDamage, dtype: int64

In [7]:
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].astype('float64')

##### 同样清洗测试集

In [8]:
test_data['notRepairedDamage'].replace('-','0.0',inplace=True)
test_data['notRepairedDamage'].value_counts()
test_data['notRepairedDamage'] = train_data['notRepairedDamage'].astype('float64')

##### 继续清洗训练集，查看其他数值

In [9]:
train_data['power'].describe()

count    150000.000000
mean        119.316547
std         177.168419
min           0.000000
25%          75.000000
50%         110.000000
75%         150.000000
max       19312.000000
Name: power, dtype: float64

##### 因为这里power的最大值超过了600，不符合题目中对于power最大值的范围，所以把超过600的值全部设置为600

In [10]:
train_data['power'][train_data['power'] > 600] = 600
train_data['power'].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


count    150000.000000
mean        116.860973
std          70.075256
min           0.000000
25%          75.000000
50%         110.000000
75%         150.000000
max         600.000000
Name: power, dtype: float64

##### 对test数据同样进行修改

In [11]:
test_data['power'][test_data['power'] > 600] = 600
test_data['power'].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


count    50000.00000
mean       116.51788
std         70.48107
min          0.00000
25%         75.00000
50%        110.00000
75%        150.00000
max        600.00000
Name: power, dtype: float64

##### 添加缺失值

In [12]:
train_data.fillna(train_data.median(),inplace=True)

In [13]:
train_data.isnull().sum()

SaleID               0
name                 0
regDate              0
model                0
brand                0
bodyType             0
fuelType             0
gearbox              0
power                0
kilometer            0
notRepairedDamage    0
regionCode           0
seller               0
offerType            0
creatDate            0
price                0
v_0                  0
v_1                  0
v_2                  0
v_3                  0
v_4                  0
v_5                  0
v_6                  0
v_7                  0
v_8                  0
v_9                  0
v_10                 0
v_11                 0
v_12                 0
v_13                 0
v_14                 0
dtype: int64

In [14]:
test_data.fillna(test_data.median(), inplace=True)
test_data.isnull().sum()

SaleID               0
name                 0
regDate              0
model                0
brand                0
bodyType             0
fuelType             0
gearbox              0
power                0
kilometer            0
notRepairedDamage    0
regionCode           0
seller               0
offerType            0
creatDate            0
v_0                  0
v_1                  0
v_2                  0
v_3                  0
v_4                  0
v_5                  0
v_6                  0
v_7                  0
v_8                  0
v_9                  0
v_10                 0
v_11                 0
v_12                 0
v_13                 0
v_14                 0
dtype: int64

##### 提取特征值

In [15]:
features = train_data.columns.to_list()

In [16]:
features.remove('SaleID')

In [17]:
features.remove('price')

In [18]:
features

['name',
 'regDate',
 'model',
 'brand',
 'bodyType',
 'fuelType',
 'gearbox',
 'power',
 'kilometer',
 'notRepairedDamage',
 'regionCode',
 'seller',
 'offerType',
 'creatDate',
 'v_0',
 'v_1',
 'v_2',
 'v_3',
 'v_4',
 'v_5',
 'v_6',
 'v_7',
 'v_8',
 'v_9',
 'v_10',
 'v_11',
 'v_12',
 'v_13',
 'v_14']

##### 对特征做归一化处理

In [19]:
from sklearn.preprocessing import MinMaxScaler

In [20]:
min_max_scaler = MinMaxScaler()

In [21]:
x = min_max_scaler.fit_transform(train_data[features].values)

In [22]:
y = train_data['price'].values

##### 下面的代码在之前的测试中因为错误的直接复制了上面的代码导致使用了fit.transform而不是transform，
##### 所以一开始的预测结果很不好，超过了2000，修正之后结果来到了600以内

In [23]:
x_test = min_max_scaler.transform(test_data[features].values)

##### 数据集切分

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
train_x, test_x, train_y, test_y = train_test_split(x,y,test_size = 0.2)

##### 搭建模型

In [26]:
from tensorflow import keras

In [27]:
model = keras.Sequential([
    keras.layers.Dense(250, activation='relu', input_shape=[len(features)]),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(1)
])

##### 这里epochs设置成40次是因为担心过拟合，因为课上200次的迭代使得最后的线上预测达到了1000

In [28]:
model.compile(loss = 'mean_absolute_error', optimizer = 'Adam')
model.fit(train_x, train_y, batch_size = 1024, epochs = 40)

Train on 120000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x1a2967685f8>

##### 使用自己拆分的测试集看看模型的效果

In [29]:
from sklearn.metrics import mean_absolute_error

In [30]:
print('训练集评估')
mean_absolute_error(train_y,model(train_x))
print('测试集评估')
mean_absolute_error(test_y,model(test_x))

训练集评估


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

测试集评估


568.1938425393502

##### MAE是568，证明模型效果还不错，使用所有数据进行训练

In [31]:
model = keras.Sequential([
    keras.layers.Dense(250, activation='relu', input_shape=[len(features)]),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(1)
])

model.compile(loss = 'mean_absolute_error', optimizer = 'Adam')
model.fit(x, y, batch_size = 1024, epochs = 40)

Train on 150000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x1a2f5d48be0>

##### 模型预测结果

In [32]:
Y_predict = model.predict(x_test)

##### 估计一下预测的结果

In [33]:
import numpy as np

def show_stats(data):
    print('min: ',np.min(data))
    print('max: ',np.max(data))
    print('ptp: ',np.ptp(data))
    print('mean: ',np.mean(data))
    print('std: ',np.std(data))
    print('var: ',np.var(data))

In [34]:
print('训练集price的数据分布:')
show_stats(y)

训练集price的数据分布:
min:  11
max:  99999
ptp:  99988
mean:  5923.327333333334
std:  7501.973469876635
var:  56279605.942732885


##### 通过下面的结果可以看出除了最小值之外，统计结果已经很接近测试数据了

In [35]:
print('神经网络预测统计情况')
show_stats(Y_predict)

神经网络预测统计情况
min:  -26.663393
max:  96526.93
ptp:  96553.59
mean:  5802.2456
std:  7296.553
var:  53239692.0


##### 如果预测的统计情况和训练集的统计情况相近，则输出结果

In [36]:
result = pd.DataFrame()

##### 将最大最小值限制在11-99999之间

In [37]:
result['SaleID'] = test_data['SaleID']
result['price'] = Y_predict
result.loc[result['price'] < 11, 'price'] = 12
result[result['price'] < 11]

Unnamed: 0,SaleID,price


In [38]:
result.loc[result['price'] > 99999, 'price'] = 90000
result[result['price'] > 99999]

Unnamed: 0,SaleID,price


In [39]:
show_stats(result['price'])

min:  12.0
max:  96526.93
ptp:  96514.93
mean:  5802.247
std:  7296.5522
var:  53239670.0


##### 输出最终的预测数据

In [40]:
result.to_csv('./nn_ans.csv', index='False')