https://tianchi.aliyun.com/notebook-ai/detail?postId=103212

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time
from tqdm import tqdm
import itertools
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import scipy.signal as signal

In [2]:
def reduce_mem_usage(df):
    """ 
    iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
# 处理异常值
def smooth_cols(group,out_value,kind):
    cols = ['power']
    if kind == 'g':
        for col in cols:
            yes_no = (group[col]<out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.995))
        return group
    if kind == 'l':
        for col in cols:
            yes_no = (group[col]>out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.07))
        return group        

def date_proc(x):
    # '20200426' '20200026'
    m = int(x[4:6])
    if m == 0:
        m = 1
    return x[:4] + '-' + str(m) + '-' + x[6:]

# 定义日期提取函数
def date_tran(df,fea_col):
    for f in tqdm(fea_col):
        df[f] = pd.to_datetime(df[f].astype('str').apply(date_proc))
        df[f + '_year'] = df[f].dt.year # 年份
        df[f + '_month'] = df[f].dt.month # 月份
        df[f + '_day'] = df[f].dt.day # 多少号
        df[f + '_dayofweek'] = df[f].dt.dayofweek # 周几
    return df

# 分桶操作
def cut_group(df, cols, num_bins=50):
    for col in cols:
        all_range = int(df[col].max() - df[col].min())
        bin = [i * all_range / num_bins for i in range(all_range)]
        df[col + '_bin'] = pd.cut(df[col], bin, labels=False)
    return df

# count编码
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return df

# 定义交叉特征统计
def cross_cat_num(df, num_col, cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max', # 最大值
                '{}_{}_min'.format(f1, f2): 'min', # 最小值
                '{}_{}_median'.format(f1, f2): 'median', # 中位数
            })
            df = df.merge(feat, on=f1, how='left')
    return df

# 类别特征的二阶交叉
from scipy.stats import entropy
def cross_qua_cat_num(df):
    for f_pair in tqdm([
        ['model', 'brand'], ['model', 'regionCode'], ['brand', 'regionCode']
    ]):
        # 共现次数
        df['_'.join(f_pair) + '_count'] = df.groupby(f_pair)['SaleID'].transform('count')
        # n unique、熵
        df = df.merge(df.groupby(f_pair[0], as_index=False)[f_pair[1]].agg({
            '{}_{}_nunique'.format(f_pair[0], f_pair[1]): 'nunique',
            '{}_{}_ent'.format(f_pair[0], f_pair[1]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[0], how='left')
        df = df.merge(df.groupby(f_pair[1], as_index=False)[f_pair[0]].agg({
            '{}_{}_nunique'.format(f_pair[1], f_pair[0]): 'nunique',
            '{}_{}_ent'.format(f_pair[1], f_pair[0]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[1], how='left')
        # 比例偏好
        df['{}_in_{}_prop'.format(f_pair[0], f_pair[1])] = df['_'.join(f_pair) + '_count'] / df[f_pair[1] + '_count']
        df['{}_in_{}_prop'.format(f_pair[1], f_pair[0])] = df['_'.join(f_pair) + '_count'] / df[f_pair[0] + '_count']
    return df

In [4]:
Train_data = reduce_mem_usage(pd.read_csv('input/used_car_train_20200313.csv',
                                          sep=' '))
TestA_data = reduce_mem_usage(pd.read_csv('input/used_car_testB_20200421.csv',
                                          sep=' '))

print('Train data shape: {}'.format(Train_data.shape))
print('TestA_data shape: {}'.format(TestA_data.shape))

Memory usage of dataframe is 37200080.00 MB
Memory usage after optimization is: 10200184.00 MB
Decreased by 72.6%
Memory usage of dataframe is 12000080.00 MB
Memory usage after optimization is: 3200184.00 MB
Decreased by 73.3%
Train data shape: (150000, 31)
TestA_data shape: (50000, 30)


In [5]:
Train_data.head(10)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,1850,43.34375,3.966797,0.050262,2.160156,1.143555,0.235718,0.10199,0.129517,0.022812,0.097473,-2.880859,2.804688,-2.419922,0.79541,0.914551
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,-,4366,0,0,20160309,3600,45.3125,5.234375,0.137939,1.380859,-1.421875,0.264893,0.121033,0.135742,0.026596,0.020584,-4.902344,2.095703,-1.030273,-1.722656,0.245483
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,6222,45.96875,4.824219,1.319336,-0.998535,-0.99707,0.251465,0.114929,0.165161,0.062164,0.027069,-4.847656,1.803711,1.56543,-0.83252,-0.22998
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,2400,45.6875,4.492188,-0.050629,0.883789,-2.228516,0.274414,0.110291,0.121948,0.033386,0.0,-4.507812,1.286133,-0.501953,-2.4375,-0.47876
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,5200,44.375,2.03125,0.572266,-1.571289,2.246094,0.228027,0.073181,0.091858,0.078796,0.121521,-1.896484,0.910645,0.931152,2.833984,1.923828
5,5,137642,20090602,24.0,10,0.0,1.0,0.0,109,10.0,0.0,3690,0,0,20160319,8000,46.3125,-3.228516,0.156616,-1.727539,-0.345703,0.260254,0.000518,0.119812,0.090942,0.048767,1.885742,-2.722656,2.457031,-0.286865,0.206543
6,6,2402,19990411,13.0,4,0.0,0.0,1.0,150,15.0,0.0,3073,0,0,20160317,3500,46.09375,4.925781,0.113281,1.644531,-1.270508,0.268066,0.117676,0.142334,0.025452,0.028168,-4.902344,1.610352,-0.834473,-1.996094,-0.10321
7,7,165346,19990706,26.0,14,1.0,0.0,0.0,101,15.0,0.0,4000,0,0,20160326,1000,42.25,-3.167969,-0.676758,1.942383,0.524414,0.239502,0.0,0.122925,0.039825,0.082397,3.693359,-0.244995,-2.193359,0.236694,0.195557
8,8,2974,20030205,19.0,1,2.0,1.0,1.0,179,15.0,0.0,4679,0,0,20160326,2850,46.09375,4.894531,0.475342,0.556641,-1.262695,0.263916,0.116577,0.144287,0.039856,0.024384,-4.925781,1.587891,0.075317,-1.550781,0.069458
9,9,82021,19980101,7.0,7,5.0,0.0,0.0,88,15.0,0.0,302,0,0,20160402,650,43.0625,1.666016,-2.201172,3.097656,0.84375,0.262451,0.068237,0.012177,0.010292,0.098755,-1.089844,0.600586,-4.1875,0.198242,-1.025391


In [6]:
TestA_data.head(10)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,200000,133777,20000501,67.0,0,1.0,0.0,0.0,101,15.0,0.0,5019,0,0,20160308,42.15625,-3.095703,-0.721191,1.466797,1.009766,0.236572,0.000241,0.105347,0.046234,0.094543,3.619141,-0.280518,-2.019531,0.979004,0.803223
1,200001,61206,19950211,19.0,6,2.0,0.0,0.0,73,6.0,0.0,1505,0,0,20160310,43.90625,-3.244141,-0.766602,1.276367,-1.06543,0.261475,0.0,0.1203,0.046783,0.0354,2.998047,-1.40625,-1.020508,-1.349609,-0.200562
2,200002,67829,20090606,5.0,5,4.0,0.0,0.0,120,5.0,-,1776,0,0,20160309,45.375,3.373047,-0.965332,-2.447266,0.624512,0.261719,0.09082,0.0,0.079651,0.073608,-3.951172,-0.43335,0.918945,1.634766,1.027344
3,200003,8892,20020601,22.0,9,1.0,0.0,0.0,58,15.0,0.0,26,0,0,20160314,42.78125,4.035156,-0.217407,1.708984,1.119141,0.236084,0.101807,0.098938,0.026825,0.096619,-2.847656,2.800781,-2.525391,1.077148,0.46167
4,200004,76998,20030301,46.0,6,0.0,,0.0,116,15.0,0.0,738,0,0,20160306,43.65625,-3.134766,-1.133789,0.470215,0.134033,0.25708,0.0,0.066711,0.05777,0.068848,2.839844,-1.660156,-0.924316,0.199463,0.450928
5,200005,142813,19990006,37.0,18,6.0,0.0,0.0,125,15.0,0.0,3393,0,0,20160404,43.65625,-3.130859,-2.519531,1.180664,1.295898,0.261475,0.0,0.0,0.046875,0.110718,2.646484,-2.441406,-2.255859,0.712402,-3.314453
6,200006,135370,19980503,36.0,6,4.0,0.0,0.0,75,15.0,0.0,2244,0,0,20160305,41.90625,-3.117188,-2.884766,3.189453,0.860352,0.261963,0.0,0.0,0.018356,0.098389,3.501953,-1.248047,-4.574219,0.569824,1.083984
7,200007,7138,20040201,88.0,14,3.0,0.0,1.0,125,15.0,0.0,155,0,0,20160325,44.875,4.542969,-0.659668,-0.196411,1.53125,0.251465,0.10907,0.027527,0.051117,0.108276,-4.433594,1.144531,-0.750977,1.530273,-0.90332
8,200008,7977,20110209,77.0,0,3.0,1.0,1.0,140,7.0,0.0,1184,0,0,20160328,47.09375,4.433594,0.462402,-1.986328,0.339111,0.259277,0.10907,0.081909,0.076965,0.066772,-5.183594,0.286865,2.080078,0.625977,0.68457
9,200009,104001,19991012,30.0,6,1.0,0.0,0.0,74,15.0,0.0,4874,0,0,20160317,41.5,-3.134766,-1.010742,2.517578,0.414551,0.240112,0.0,0.11084,0.030624,0.080078,3.958984,0.066406,-3.095703,0.205444,0.367188


In [7]:
# 合并数据集
concat_data = pd.concat([Train_data, TestA_data],
                        ignore_index=True) # 重新生成索引

# 'notRepairedDamage'中的'-'用0替换
concat_data['notRepairedDamage'] = concat_data['notRepairedDamage'].replace('-', 0).astype('float16')

In [8]:
# 每列中的缺失值用每列的众数填充
concat_data = concat_data.fillna(concat_data.mode().iloc[0, :])

In [9]:
concat_data.shape

(200000, 31)

In [10]:
concat_data.head(10)

Unnamed: 0,SaleID,bodyType,brand,creatDate,fuelType,gearbox,kilometer,model,name,notRepairedDamage,offerType,power,price,regDate,regionCode,seller,v_0,v_1,v_10,v_11,v_12,v_13,v_14,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9
0,0,1.0,6,20160404,0.0,0.0,12.5,30.0,736,0.0,0,60,1850.0,20040402,1046,0,43.34375,3.966797,-2.880859,2.804688,-2.419922,0.79541,0.914551,0.050262,2.160156,1.143555,0.235718,0.10199,0.129517,0.022812,0.097473
1,1,2.0,1,20160309,0.0,0.0,15.0,40.0,2262,0.0,0,0,3600.0,20030301,4366,0,45.3125,5.234375,-4.902344,2.095703,-1.030273,-1.722656,0.245483,0.137939,1.380859,-1.421875,0.264893,0.121033,0.135742,0.026596,0.020584
2,2,1.0,15,20160402,0.0,0.0,12.5,115.0,14874,0.0,0,163,6222.0,20040403,2806,0,45.96875,4.824219,-4.847656,1.803711,1.56543,-0.83252,-0.22998,1.319336,-0.998535,-0.99707,0.251465,0.114929,0.165161,0.062164,0.027069
3,3,0.0,10,20160312,0.0,1.0,15.0,109.0,71865,0.0,0,193,2400.0,19960908,434,0,45.6875,4.492188,-4.507812,1.286133,-0.501953,-2.4375,-0.47876,-0.050629,0.883789,-2.228516,0.274414,0.110291,0.121948,0.033386,0.0
4,4,1.0,5,20160313,0.0,0.0,5.0,110.0,111080,0.0,0,68,5200.0,20120103,6977,0,44.375,2.03125,-1.896484,0.910645,0.931152,2.833984,1.923828,0.572266,-1.571289,2.246094,0.228027,0.073181,0.091858,0.078796,0.121521
5,5,0.0,10,20160319,1.0,0.0,10.0,24.0,137642,0.0,0,109,8000.0,20090602,3690,0,46.3125,-3.228516,1.885742,-2.722656,2.457031,-0.286865,0.206543,0.156616,-1.727539,-0.345703,0.260254,0.000518,0.119812,0.090942,0.048767
6,6,0.0,4,20160317,0.0,1.0,15.0,13.0,2402,0.0,0,150,3500.0,19990411,3073,0,46.09375,4.925781,-4.902344,1.610352,-0.834473,-1.996094,-0.10321,0.113281,1.644531,-1.270508,0.268066,0.117676,0.142334,0.025452,0.028168
7,7,1.0,14,20160326,0.0,0.0,15.0,26.0,165346,0.0,0,101,1000.0,19990706,4000,0,42.25,-3.167969,3.693359,-0.244995,-2.193359,0.236694,0.195557,-0.676758,1.942383,0.524414,0.239502,0.0,0.122925,0.039825,0.082397
8,8,2.0,1,20160326,1.0,1.0,15.0,19.0,2974,0.0,0,179,2850.0,20030205,4679,0,46.09375,4.894531,-4.925781,1.587891,0.075317,-1.550781,0.069458,0.475342,0.556641,-1.262695,0.263916,0.116577,0.144287,0.039856,0.024384
9,9,5.0,7,20160402,0.0,0.0,15.0,7.0,82021,0.0,0,88,650.0,19980101,302,0,43.0625,1.666016,-1.089844,0.600586,-4.1875,0.198242,-1.025391,-2.201172,3.097656,0.84375,0.262451,0.068237,0.012177,0.010292,0.098755


In [11]:
concat_data.tail(10)

Unnamed: 0,SaleID,bodyType,brand,creatDate,fuelType,gearbox,kilometer,model,name,notRepairedDamage,offerType,power,price,regDate,regionCode,seller,v_0,v_1,v_10,v_11,v_12,v_13,v_14,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9
199990,249990,3.0,5,20160331,0.0,0.0,15.0,19.0,61395,0.0,0,132,500.0,19990906,3160,0,43.09375,-3.195312,3.181641,-1.394531,-2.490234,-1.06543,-0.442627,-1.672852,2.144531,-0.656738,0.265137,0.0,0.070984,0.033417,0.051117
199991,249991,2.0,10,20160403,0.0,1.0,15.0,17.0,72277,0.0,0,224,500.0,20031106,163,0,45.96875,-3.212891,1.99707,-2.519531,2.419922,-1.568359,0.269043,0.245117,-1.666992,-1.884766,0.268066,0.000387,0.137573,0.086853,0.002478
199992,249992,1.0,6,20160331,0.0,0.0,8.0,41.0,29738,0.0,0,60,500.0,20071009,3929,0,43.375,3.027344,-2.238281,2.027344,-1.245117,2.113281,1.605469,0.086121,0.496582,1.914062,0.230103,0.087463,0.096252,0.047455,0.116638
199993,249993,2.0,4,20160328,0.0,1.0,15.0,13.0,35,0.0,0,193,500.0,19970712,3258,0,44.71875,3.8125,-3.392578,1.723633,-1.783203,-2.158203,-0.41748,-0.223511,2.148438,-1.631836,0.266113,0.100708,0.136597,0.018753,0.018051
199994,249994,0.0,4,20160330,0.0,0.0,15.0,4.0,41919,0.0,0,150,500.0,20050807,5640,0,46.40625,2.798828,-3.435547,-0.013077,1.506836,-0.947266,0.359863,0.323975,-1.174805,-1.170898,0.26709,0.085876,0.109436,0.0672,0.024033
199995,249995,0.0,4,20160309,0.0,1.0,15.0,4.0,111443,0.0,0,150,500.0,20041005,5564,0,46.3125,-3.304688,2.072266,-2.53125,1.716797,-1.063477,0.32666,0.073364,-0.622559,-0.77832,0.263672,0.000292,0.141846,0.076416,0.039276
199996,249996,0.0,1,20160323,0.0,0.0,4.0,65.0,152834,0.0,0,179,500.0,20130409,5220,0,48.09375,-3.318359,1.358398,-3.291016,4.269531,0.140503,0.556152,0.96582,-2.671875,0.357422,0.255371,0.000991,0.155884,0.108398,0.067871
199997,249997,0.0,4,20160316,0.0,1.0,12.5,4.0,132531,0.0,0,147,500.0,20041211,3795,0,46.15625,-3.304688,2.166016,-2.417969,1.371094,-1.073242,0.270508,-0.015282,-0.28833,-0.687012,0.262939,0.000318,0.141846,0.07196,0.042969
199998,249998,4.0,1,20160327,0.0,1.0,15.0,40.0,143405,0.0,0,176,500.0,20020702,61,0,45.5,-3.197266,2.029297,-2.939453,0.568848,-1.717773,0.316406,-1.141602,-0.434814,-1.844727,0.282227,2.3e-05,0.067505,0.067505,0.009003
199999,249999,1.0,8,20160401,0.0,0.0,3.0,32.0,78202,0.0,0,0,500.0,20090708,4158,0,44.28125,4.179688,-3.689453,2.033203,0.109131,2.203125,0.847656,0.546875,-0.775879,1.790039,0.231445,0.103943,0.096008,0.062317,0.110168


In [12]:
# 处理异常值
concat_data['power'][concat_data['power'] > 600] = 600
concat_data['power'][concat_data['power'] < 1] = 1

concat_data['v_13'][concat_data['v_13'] > 6] = 6
concat_data['v_14'][concat_data['v_14'] > 4] = 4

In [13]:
# v系列特征之间相加
for j in ['v_' + str(i) for i in range(14)]:
    for k in ['v_' + str(m) for m in range(14)]:
        concat_data[j + '+' + k] = concat_data[j] + concat_data[k]

# 原始特征与v系列特征之间相乘
for i in ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode']:
    for j in ['v_' + str(k) for k in range(14)]:
        concat_data[i + '*' + j] = concat_data[i] * concat_data[j]
    
concat_data.shape

(200000, 353)

In [14]:
# 提取日期信息
date_cols = ['regDate', 'creatDate']
concat_data = date_tran(concat_data, date_cols)

concat_data.shape

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.35s/it]


(200000, 361)

In [15]:
data = concat_data.copy()

data.columns

Index(['SaleID', 'bodyType', 'brand', 'creatDate', 'fuelType', 'gearbox',
       'kilometer', 'model', 'name', 'notRepairedDamage',
       ...
       'regionCode*v_12', 'regionCode*v_13', 'regDate_year', 'regDate_month',
       'regDate_day', 'regDate_dayofweek', 'creatDate_year', 'creatDate_month',
       'creatDate_day', 'creatDate_dayofweek'],
      dtype='object', length=361)

In [16]:
# count编码
count_list = ['regDate', 'creatDate', 'model', 'brand', 'regionCode', 'bodyType', 'fuelType', 'name',
              'regDate_year', 'regDate_month', 'regDate_day', 'regDate_dayofweek',
              'creatDate_month', 'creatDate_day', 'creatDate_dayofweek', 'kilometer']
data = count_coding(data, count_list)

In [17]:
# 特征构造
# 使用时间：data['creatDate'] - data['regDate']，反应汽车使用时间，一般来说价格与使用时间成反比
# 不过要注意，数据里有时间出错的格式，所以我们需要 errors='coerce'
data['used_time1'] = (pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') - 
                      pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
data['used_time2'] = (pd.datetime.now() - pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days                        
data['used_time3'] = (pd.datetime.now() - pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') ).dt.days

In [18]:
# 分桶，注意：kilometer应该是已经离散化了的
cut_cols = ['power'] + ['used_time1', 'used_time2', 'used_time3']
data = cut_group(data, cut_cols, 50)

In [19]:
# 用数值特征对类别特征做统计刻画，随便挑了几个跟price相关性最高的匿名特征
cross_cat = ['model', 'brand','regDate_year']
cross_num = ['v_0','v_3', 'v_4', 'v_8', 'v_12','power', 'used_time1']
data = cross_cat_num(data, cross_num, cross_cat) # 一阶交叉
# data = cross_qua_cat_num(data) # 二阶交叉

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/7 [00:00<?, ?it/s]
 14%|████████████                                                                        | 1/7 [00:02<00:14,  2.34s/it]
 29%|████████████████████████                                                            | 2/7 [00:02<00:09,  1.81s/it]
 43%|████████████████████████████████████                                                | 3/7 [00:03<00:05,  1.46s/it]
 57%|████████████████████████████████████████████████                                    | 4/7 [00:04<00:03,  1.21s/it]
 71%|████████████████████████████████████████████████████████████                        | 5/7 [00:04<00:02,  1.02s/it]
 86%|████████████████████████████████████████████████████████████████████████            | 6/7 [00:05<00:00,  1.13it/s]
 33%|████████████████████████████       

In [20]:
# 选择特征列
numerical_cols = data.columns
print(numerical_cols)

Index(['SaleID', 'bodyType', 'brand', 'creatDate', 'fuelType', 'gearbox',
       'kilometer', 'model', 'name', 'notRepairedDamage',
       ...
       'regDate_year_v_8_median', 'regDate_year_v_12_max',
       'regDate_year_v_12_min', 'regDate_year_v_12_median',
       'regDate_year_power_max', 'regDate_year_power_min',
       'regDate_year_power_median', 'regDate_year_used_time1_max',
       'regDate_year_used_time1_min', 'regDate_year_used_time1_median'],
      dtype='object', length=447)


In [21]:
cat_fea = ['SaleID', 'offerType', 'seller']
feature_cols = [col for col in numerical_cols if col not in cat_fea]
feature_cols = [col for col in feature_cols if col not in ['price']]

# 将训练集和测试集分开
X_data = data.iloc[:len(Train_data), :][feature_cols]
Y_data = Train_data['price']
X_test  = data.iloc[len(Train_data):, :][feature_cols]

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg(['mean', 'size'])
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new

In [23]:
class_list = ['model', 'brand', 'name', 'regionCode'] + date_cols
MeanEncodeFeature = class_list
ME = MeanEncoder(categorical_features=MeanEncodeFeature, n_splits=5, target_type='regression', prior_weight_func=None)
X_data = ME.fit_transform(X_data, Y_data)
X_test = ME.transform(X_test)

In [24]:
X_data['price'] = Train_data['price']

In [25]:
# target encoding目标编码，回归场景相对来说做目标编码的选择更多，不仅可以做均值编码，还可以做标准差编码、中位数编码等
enc_cols = []
stats_default_dict = {
    'max': X_data['price'].max(),
    'min': X_data['price'].min(),
    'median': X_data['price'].median(),
    'mean': X_data['price'].mean(),
    'sum': X_data['price'].sum(),
    'std': X_data['price'].std(),
    'skew': X_data['price'].skew(), # 偏度
    'kurt': X_data['price'].kurt(), # 峰度
    'mad': X_data['price'].mad() # mean absolute deviation 平均绝对偏差
}

# 暂且选择这三种编码
enc_stats = ['max', 'min', 'mean']
skf = KFold(n_splits=10, shuffle=True, random_state=42)
for f in tqdm(['regionCode', 'brand', 'regDate_year' ,'creatDate_year', 'kilometer', 'model']):
    enc_dict = {}
    for stat in enc_stats:
        enc_dict['{}_target_{}'.format(f, stat)] = stat
        X_data['{}_target_{}'.format(f, stat)] = 0
        X_test['{}_target_{}'.format(f, stat)] = 0
        enc_cols.append('{}_target_{}'.format(f, stat))
    for i, (trn_idx, val_idx) in enumerate(skf.split(X_data, Y_data)):
        trn_x, val_x = X_data.iloc[trn_idx].reset_index(drop=True), X_data.iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['price'].agg(enc_dict)
        val_x = val_x[[f]].merge(enc_df, on=f, how='left')
        test_x = X_test[[f]].merge(enc_df, on=f, how='left')
        for stat in enc_stats:
            val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            X_data.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values 
            X_test['{}_target_{}'.format(f, stat)] += test_x['{}_target_{}'.format(f, stat)].values / skf.n_splits

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:37<00:00,  6.37s/it]


In [26]:
drop_list = ['regDate', 'creatDate', 'brand_power_min', 'regDate_year_power_min']
x_train = X_data.drop(drop_list + ['price'], axis=1)
x_test = X_test.drop(drop_list, axis=1)
x_train.shape

(150000, 463)

In [27]:
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

In [28]:
# 特征归一化
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
min_max_scaler.fit(pd.concat([x_train, x_test]).values)
all_data = min_max_scaler.transform(pd.concat([x_train, x_test]).values)

In [29]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=146)
all_pca = pca.fit_transform(all_data)
X_pca = all_pca[:len(x_train)]
test = all_pca[len(x_train):]
y = Train_data['price'].values

In [30]:
from keras.layers import Conv1D, Activation, MaxPool1D, Flatten, Dense
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, merge, Add
def NN_model(input_dim):
    init = keras.initializers.glorot_uniform(seed=1)
    model = keras.models.Sequential()
    model.add(Dense(units=300, input_dim=input_dim, kernel_initializer=init, activation='softplus'))
    # model.add(Dropout(0.2))
    model.add(Dense(units=300, kernel_initializer=init, activation='softplus'))
    # model.add(Dropout(0.2))
    model.add(Dense(units=64, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=32, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=8, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=1))
    return model

Using TensorFlow backend.


In [31]:
from keras.callbacks import Callback, EarlyStopping
class Metric(Callback):
    def __init__(self, model, callbacks, data):
        super().__init__()
        self.model = model
        self.callbacks = callbacks
        self.data = data

    def on_train_begin(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_begin(logs)

    def on_train_end(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_end(logs)

    def on_epoch_end(self, batch, logs=None):
        X_train, y_train = self.data[0][0], self.data[0][1]
        y_pred3 = self.model.predict(X_train)
        y_pred = np.zeros((len(y_pred3), ))
        y_true = np.zeros((len(y_pred3), ))
        for i in range(len(y_pred3)):
            y_pred[i] = y_pred3[i]
        for i in range(len(y_pred3)):
            y_true[i] = y_train[i]
        trn_s = mean_absolute_error(y_true, y_pred)
        logs['trn_score'] = trn_s
        
        X_val, y_val = self.data[1][0], self.data[1][1]
        y_pred3 = self.model.predict(X_val)
        y_pred = np.zeros((len(y_pred3), ))
        y_true = np.zeros((len(y_pred3), ))
        for i in range(len(y_pred3)):
            y_pred[i] = y_pred3[i]
        for i in range(len(y_pred3)):
            y_true[i] = y_val[i]
        val_s = mean_absolute_error(y_true, y_pred)
        logs['val_score'] = val_s
        print('trn_score', trn_s, 'val_score', val_s)

        for callback in self.callbacks:
            callback.on_epoch_end(batch, logs)

In [32]:
import keras.backend as K
from keras.callbacks import LearningRateScheduler
  
def scheduler(epoch):
    # 每隔100个epoch，学习率减小为原来的1/10
    if epoch % 20 == 0 and epoch != 0:
        lr = K.get_value(model.optimizer.lr)
        K.set_value(model.optimizer.lr, lr * 0.6)
        print("lr changed to {}".format(lr * 0.6))
    return K.get_value(model.optimizer.lr)
reduce_lr = LearningRateScheduler(scheduler)
# model.fit(train_x, train_y, batch_size=32, epochs=5, callbacks=[reduce_lr])

In [33]:
n_splits = 6
kf = KFold(n_splits=n_splits, shuffle=True)

import keras 

b_size = 2000
max_epochs = 145
oof_pred = np.zeros((len(X_pca), ))

sub = pd.read_csv('input/used_car_testB_20200421.csv', sep=' ')[['SaleID']].copy()
sub['price'] = 0

avg_mae = 0
for fold, (trn_idx, val_idx) in enumerate(kf.split(X_pca, y)):
    print('fold:', fold)
    X_train, y_train = X_pca[trn_idx], y[trn_idx]
    X_val, y_val = X_pca[val_idx], y[val_idx]
    
    model = NN_model(X_train.shape[1])
    simple_adam = keras.optimizers.Adam(lr = 0.015)
    
    model.compile(loss='mae', optimizer=simple_adam,metrics=['mae'])
    es = EarlyStopping(monitor='val_score', patience=10, verbose=2, mode='min', restore_best_weights=True,)
    es.set_model(model)
    metric = Metric(model, [es], [(X_train, y_train), (X_val, y_val)])
    model.fit(X_train, y_train, batch_size=b_size, epochs=max_epochs, 
              validation_data = [X_val, y_val],
              callbacks=[reduce_lr], shuffle=True, verbose=2)
    y_pred3 = model.predict(X_val)
    y_pred = np.zeros((len(y_pred3), ))
    sub['price'] += model.predict(test).reshape(-1,) / n_splits
    for i in range(len(y_pred3)):
        y_pred[i] = y_pred3[i]
        
    oof_pred[val_idx] = y_pred
    val_mae = mean_absolute_error(y[val_idx], y_pred)
    avg_mae += val_mae / n_splits
    print()
    print('val_mae is:{}'.format(val_mae))
    print()
mean_absolute_error(y, oof_pred)

fold: 0
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 125000 samples, validate on 25000 samples
Epoch 1/145
 - 5s - loss: 2604.6301 - mae: 2604.6296 - val_loss: 956.1041 - val_mae: 956.1042
Epoch 2/145
 - 5s - loss: 837.4435 - mae: 837.4435 - val_loss: 725.3945 - val_mae: 725.3944
Epoch 3/145
 - 4s - loss: 682.3839 - mae: 682.3839 - val_loss: 663.3570 - val_mae: 663.3570
Epoch 4/145
 - 6s - loss: 655.3445 - mae: 655.3445 - val_loss: 684.6003 - val_mae: 684.6003
Epoch 5/145
 - 5s - loss: 649.6578 - mae: 649.6578 - val_loss: 558.4263 - val_mae: 558.4263
Epoch 6/145
 - 5s - loss: 561.6686 - mae: 561.6686 - val_loss: 531.8434 - val_mae: 531.8434
Epoch 7/145
 - 5s - loss: 546.3971 - mae: 546.3972 - val_loss: 523.6252 - val_mae: 523.6253
Epoch 8/145
 - 5s - loss: 524.8794 - mae: 524.8793 - val_loss: 511.1096 - val_mae: 511.1096
Epoch 9/145
 - 7s - loss: 511.2446 - mae: 511.2448 - val_loss: 505.4069 - val_mae: 

lr changed to 0.0019439998548477888
 - 3s - loss: 380.0712 - mae: 380.0713 - val_loss: 420.6666 - val_mae: 420.6666
Epoch 82/145
 - 3s - loss: 380.5213 - mae: 380.5212 - val_loss: 424.0998 - val_mae: 424.0998
Epoch 83/145
 - 3s - loss: 378.9605 - mae: 378.9605 - val_loss: 415.6860 - val_mae: 415.6860
Epoch 84/145
 - 3s - loss: 378.1932 - mae: 378.1932 - val_loss: 418.4516 - val_mae: 418.4517
Epoch 85/145
 - 3s - loss: 378.2709 - mae: 378.2709 - val_loss: 418.0678 - val_mae: 418.0678
Epoch 86/145
 - 3s - loss: 378.8708 - mae: 378.8708 - val_loss: 418.9032 - val_mae: 418.9032
Epoch 87/145
 - 3s - loss: 379.8711 - mae: 379.8711 - val_loss: 415.6284 - val_mae: 415.6284
Epoch 88/145
 - 3s - loss: 377.8446 - mae: 377.8445 - val_loss: 418.3350 - val_mae: 418.3350
Epoch 89/145
 - 3s - loss: 377.5170 - mae: 377.5170 - val_loss: 415.8900 - val_mae: 415.8900
Epoch 90/145
 - 3s - loss: 377.0095 - mae: 377.0094 - val_loss: 414.4491 - val_mae: 414.4492
Epoch 91/145
 - 3s - loss: 376.8849 - mae: 376.

lr changed to 0.008999999798834323
 - 3s - loss: 447.1101 - mae: 447.1101 - val_loss: 477.3222 - val_mae: 477.3222
Epoch 22/145
 - 3s - loss: 446.9907 - mae: 446.9906 - val_loss: 459.5100 - val_mae: 459.5100
Epoch 23/145
 - 3s - loss: 445.7987 - mae: 445.7986 - val_loss: 455.1388 - val_mae: 455.1389
Epoch 24/145
 - 3s - loss: 442.9656 - mae: 442.9655 - val_loss: 448.6916 - val_mae: 448.6916
Epoch 25/145
 - 3s - loss: 444.5608 - mae: 444.5608 - val_loss: 466.8313 - val_mae: 466.8313
Epoch 26/145
 - 3s - loss: 441.1016 - mae: 441.1017 - val_loss: 446.9582 - val_mae: 446.9582
Epoch 27/145
 - 3s - loss: 442.0713 - mae: 442.0713 - val_loss: 462.0026 - val_mae: 462.0026
Epoch 28/145
 - 3s - loss: 448.3898 - mae: 448.3898 - val_loss: 450.2013 - val_mae: 450.2013
Epoch 29/145
 - 3s - loss: 440.8135 - mae: 440.8135 - val_loss: 458.7823 - val_mae: 458.7823
Epoch 30/145
 - 3s - loss: 437.8593 - mae: 437.8593 - val_loss: 465.0065 - val_mae: 465.0065
Epoch 31/145
 - 3s - loss: 437.8696 - mae: 437.8

Epoch 108/145
 - 3s - loss: 368.0774 - mae: 368.0774 - val_loss: 421.1464 - val_mae: 421.1464
Epoch 109/145
 - 3s - loss: 368.1576 - mae: 368.1575 - val_loss: 421.8460 - val_mae: 421.8459
Epoch 110/145
 - 3s - loss: 367.2594 - mae: 367.2594 - val_loss: 422.6092 - val_mae: 422.6092
Epoch 111/145
 - 3s - loss: 367.0980 - mae: 367.0979 - val_loss: 421.8016 - val_mae: 421.8016
Epoch 112/145
 - 3s - loss: 368.2339 - mae: 368.2339 - val_loss: 423.3190 - val_mae: 423.3190
Epoch 113/145
 - 3s - loss: 367.7920 - mae: 367.7920 - val_loss: 425.1294 - val_mae: 425.1294
Epoch 114/145
 - 3s - loss: 366.5236 - mae: 366.5235 - val_loss: 422.5322 - val_mae: 422.5322
Epoch 115/145
 - 3s - loss: 366.3694 - mae: 366.3694 - val_loss: 422.4756 - val_mae: 422.4756
Epoch 116/145
 - 3s - loss: 366.7799 - mae: 366.7799 - val_loss: 422.4432 - val_mae: 422.4431
Epoch 117/145
 - 3s - loss: 366.5445 - mae: 366.5445 - val_loss: 422.6195 - val_mae: 422.6196
Epoch 118/145
 - 3s - loss: 367.0602 - mae: 367.0601 - val_l

Epoch 49/145
 - 3s - loss: 412.9150 - mae: 412.9150 - val_loss: 423.0498 - val_mae: 423.0498
Epoch 50/145
 - 3s - loss: 409.6268 - mae: 409.6269 - val_loss: 432.4265 - val_mae: 432.4266
Epoch 51/145
 - 3s - loss: 414.8080 - mae: 414.8080 - val_loss: 446.7115 - val_mae: 446.7115
Epoch 52/145
 - 3s - loss: 432.0051 - mae: 432.0051 - val_loss: 426.2230 - val_mae: 426.2230
Epoch 53/145
 - 3s - loss: 407.7186 - mae: 407.7187 - val_loss: 427.9637 - val_mae: 427.9638
Epoch 54/145
 - 3s - loss: 405.8854 - mae: 405.8855 - val_loss: 429.9095 - val_mae: 429.9095
Epoch 55/145
 - 3s - loss: 406.1163 - mae: 406.1163 - val_loss: 421.7929 - val_mae: 421.7930
Epoch 56/145
 - 3s - loss: 403.9600 - mae: 403.9600 - val_loss: 426.4569 - val_mae: 426.4569
Epoch 57/145
 - 3s - loss: 404.3821 - mae: 404.3821 - val_loss: 421.2957 - val_mae: 421.2957
Epoch 58/145
 - 3s - loss: 405.5934 - mae: 405.5934 - val_loss: 420.9239 - val_mae: 420.9239
Epoch 59/145
 - 3s - loss: 402.7467 - mae: 402.7467 - val_loss: 422.08

Epoch 136/145
 - 3s - loss: 363.1204 - mae: 363.1204 - val_loss: 413.7688 - val_mae: 413.7688
Epoch 137/145
 - 3s - loss: 363.4909 - mae: 363.4909 - val_loss: 412.4555 - val_mae: 412.4555
Epoch 138/145
 - 3s - loss: 362.5030 - mae: 362.5030 - val_loss: 411.3558 - val_mae: 411.3558
Epoch 139/145
 - 4s - loss: 362.7395 - mae: 362.7396 - val_loss: 410.6789 - val_mae: 410.6789
Epoch 140/145
 - 3s - loss: 362.5036 - mae: 362.5036 - val_loss: 411.2678 - val_mae: 411.2677
Epoch 141/145
lr changed to 0.0004199039773084223
 - 3s - loss: 361.2673 - mae: 361.2673 - val_loss: 411.2467 - val_mae: 411.2466
Epoch 142/145
 - 3s - loss: 360.8421 - mae: 360.8421 - val_loss: 411.3813 - val_mae: 411.3813
Epoch 143/145
 - 3s - loss: 360.5367 - mae: 360.5367 - val_loss: 410.8069 - val_mae: 410.8069
Epoch 144/145
 - 3s - loss: 360.9113 - mae: 360.9113 - val_loss: 410.5651 - val_mae: 410.5651
Epoch 145/145
 - 3s - loss: 360.5329 - mae: 360.5328 - val_loss: 410.7122 - val_mae: 410.7122

val_mae is:410.71222148

Epoch 77/145
 - 3s - loss: 390.4830 - mae: 390.4830 - val_loss: 432.3826 - val_mae: 432.3826
Epoch 78/145
 - 3s - loss: 388.8571 - mae: 388.8571 - val_loss: 431.7162 - val_mae: 431.7162
Epoch 79/145
 - 3s - loss: 391.4551 - mae: 391.4550 - val_loss: 431.3386 - val_mae: 431.3386
Epoch 80/145
 - 3s - loss: 389.1626 - mae: 389.1627 - val_loss: 431.1269 - val_mae: 431.1269
Epoch 81/145
lr changed to 0.0019439998548477888
 - 3s - loss: 384.1318 - mae: 384.1318 - val_loss: 429.1710 - val_mae: 429.1710
Epoch 82/145
 - 3s - loss: 383.2888 - mae: 383.2888 - val_loss: 429.8195 - val_mae: 429.8195
Epoch 83/145
 - 3s - loss: 382.9712 - mae: 382.9712 - val_loss: 428.4957 - val_mae: 428.4956
Epoch 84/145
 - 3s - loss: 383.8090 - mae: 383.8090 - val_loss: 428.2298 - val_mae: 428.2298
Epoch 85/145
 - 3s - loss: 382.8231 - mae: 382.8231 - val_loss: 428.8239 - val_mae: 428.8239
Epoch 86/145
 - 3s - loss: 382.1816 - mae: 382.1816 - val_loss: 427.5903 - val_mae: 427.5904
Epoch 87/145
 - 3s - loss: 381.802

 - 3s - loss: 486.9722 - mae: 486.9723 - val_loss: 554.5625 - val_mae: 554.5625
Epoch 18/145
 - 3s - loss: 558.4006 - mae: 558.4006 - val_loss: 568.8307 - val_mae: 568.8307
Epoch 19/145
 - 3s - loss: 521.3737 - mae: 521.3737 - val_loss: 529.1305 - val_mae: 529.1306
Epoch 20/145
 - 4s - loss: 536.2080 - mae: 536.2080 - val_loss: 531.2009 - val_mae: 531.2009
Epoch 21/145
lr changed to 0.008999999798834323
 - 4s - loss: 468.1419 - mae: 468.1418 - val_loss: 452.3251 - val_mae: 452.3252
Epoch 22/145
 - 3s - loss: 451.8687 - mae: 451.8686 - val_loss: 460.0073 - val_mae: 460.0073
Epoch 23/145
 - 3s - loss: 452.5009 - mae: 452.5009 - val_loss: 451.7464 - val_mae: 451.7464
Epoch 24/145
 - 3s - loss: 448.9445 - mae: 448.9445 - val_loss: 449.7747 - val_mae: 449.7747
Epoch 25/145
 - 3s - loss: 446.0229 - mae: 446.0229 - val_loss: 460.6319 - val_mae: 460.6319
Epoch 26/145
 - 3s - loss: 446.9597 - mae: 446.9596 - val_loss: 446.9360 - val_mae: 446.9361
Epoch 27/145
 - 3s - loss: 446.4387 - mae: 446.4

Epoch 104/145
 - 3s - loss: 375.1990 - mae: 375.1990 - val_loss: 412.8834 - val_mae: 412.8833
Epoch 105/145
 - 3s - loss: 374.7226 - mae: 374.7226 - val_loss: 411.3611 - val_mae: 411.3611
Epoch 106/145
 - 3s - loss: 376.0185 - mae: 376.0185 - val_loss: 412.3231 - val_mae: 412.3231
Epoch 107/145
 - 3s - loss: 374.8501 - mae: 374.8501 - val_loss: 412.3652 - val_mae: 412.3651
Epoch 108/145
 - 3s - loss: 374.3826 - mae: 374.3828 - val_loss: 412.2897 - val_mae: 412.2897
Epoch 109/145
 - 3s - loss: 374.2733 - mae: 374.2734 - val_loss: 412.7684 - val_mae: 412.7684
Epoch 110/145
 - 3s - loss: 375.0055 - mae: 375.0055 - val_loss: 414.3370 - val_mae: 414.3370
Epoch 111/145
 - 3s - loss: 373.2466 - mae: 373.2466 - val_loss: 411.8754 - val_mae: 411.8754
Epoch 112/145
 - 3s - loss: 373.1140 - mae: 373.1141 - val_loss: 411.9558 - val_mae: 411.9558
Epoch 113/145
 - 3s - loss: 373.2105 - mae: 373.2106 - val_loss: 412.4952 - val_mae: 412.4952
Epoch 114/145
 - 3s - loss: 372.6017 - mae: 372.6017 - val_l

Epoch 45/145
 - 3s - loss: 415.5144 - mae: 415.5145 - val_loss: 437.9402 - val_mae: 437.9402
Epoch 46/145
 - 3s - loss: 417.9621 - mae: 417.9621 - val_loss: 456.3569 - val_mae: 456.3569
Epoch 47/145
 - 3s - loss: 436.1015 - mae: 436.1015 - val_loss: 441.5952 - val_mae: 441.5952
Epoch 48/145
 - 3s - loss: 418.3920 - mae: 418.3920 - val_loss: 449.6512 - val_mae: 449.6513
Epoch 49/145
 - 3s - loss: 412.7282 - mae: 412.7281 - val_loss: 433.7125 - val_mae: 433.7125
Epoch 50/145
 - 3s - loss: 412.2466 - mae: 412.2466 - val_loss: 433.5805 - val_mae: 433.5806
Epoch 51/145
 - 3s - loss: 411.4332 - mae: 411.4333 - val_loss: 435.5066 - val_mae: 435.5067
Epoch 52/145
 - 3s - loss: 410.4926 - mae: 410.4926 - val_loss: 434.4234 - val_mae: 434.4234
Epoch 53/145
 - 3s - loss: 429.9017 - mae: 429.9017 - val_loss: 446.5035 - val_mae: 446.5035
Epoch 54/145
 - 3s - loss: 411.7905 - mae: 411.7905 - val_loss: 434.1971 - val_mae: 434.1971
Epoch 55/145
 - 3s - loss: 408.4507 - mae: 408.4506 - val_loss: 434.19

Epoch 132/145
 - 3s - loss: 368.7828 - mae: 368.7828 - val_loss: 418.3669 - val_mae: 418.3669
Epoch 133/145
 - 3s - loss: 368.6341 - mae: 368.6341 - val_loss: 417.8061 - val_mae: 417.8061
Epoch 134/145
 - 3s - loss: 369.2081 - mae: 369.2081 - val_loss: 417.0344 - val_mae: 417.0345
Epoch 135/145
 - 3s - loss: 368.7485 - mae: 368.7486 - val_loss: 416.7450 - val_mae: 416.7450
Epoch 136/145
 - 3s - loss: 368.1534 - mae: 368.1534 - val_loss: 417.5618 - val_mae: 417.5618
Epoch 137/145
 - 3s - loss: 367.9327 - mae: 367.9327 - val_loss: 417.0064 - val_mae: 417.0064
Epoch 138/145
 - 3s - loss: 367.9482 - mae: 367.9482 - val_loss: 417.4171 - val_mae: 417.4171
Epoch 139/145
 - 3s - loss: 367.6218 - mae: 367.6217 - val_loss: 418.5260 - val_mae: 418.5260
Epoch 140/145
 - 3s - loss: 367.4545 - mae: 367.4545 - val_loss: 416.5882 - val_mae: 416.5882
Epoch 141/145
lr changed to 0.0004199039773084223
 - 3s - loss: 366.6023 - mae: 366.6023 - val_loss: 417.3705 - val_mae: 417.3706
Epoch 142/145
 - 3s - lo

416.07299597462975

In [34]:
sub.head(20)

Unnamed: 0,SaleID,price
0,200000,1265.969238
1,200001,2006.374878
2,200002,8774.297852
3,200003,1283.082153
4,200004,1990.913452
5,200005,1125.281616
6,200006,438.641174
7,200007,3613.474121
8,200008,13418.463867
9,200009,602.462341


In [35]:
sub.to_csv('submit/nn_sub_{}_{}.csv'.format('mae', sub['price'].mean()), index=False)