https://tianchi.aliyun.com/notebook-ai/detail?postId=103212

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time
from tqdm import tqdm
import itertools
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.decomposition import PCA, FastICA, FactorAnalysis, SparsePCA

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import scipy.signal as signal

In [2]:
def reduce_mem_usage(df):
    """ 
    iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
# 处理异常值
def smooth_cols(group,out_value,kind):
    cols = ['power']
    if kind == 'g':
        for col in cols:
            yes_no = (group[col]<out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.995))
        return group
    if kind == 'l':
        for col in cols:
            yes_no = (group[col]>out_value).astype('int')
            new = yes_no * group[col]
            group[col] = new.replace(0,group[col].quantile(q=0.07))
        return group        

def date_proc(x):
    # '20200426' '20200026'
    m = int(x[4:6])
    if m == 0:
        m = 1
    return x[:4] + '-' + str(m) + '-' + x[6:]

# 定义日期提取函数
def date_tran(df,fea_col):
    for f in tqdm(fea_col):
        df[f] = pd.to_datetime(df[f].astype('str').apply(date_proc))
        df[f + '_year'] = df[f].dt.year # 年份
        df[f + '_month'] = df[f].dt.month # 月份
        df[f + '_day'] = df[f].dt.day # 多少号
        df[f + '_dayofweek'] = df[f].dt.dayofweek # 周几
    return df

# 分桶操作
def cut_group(df, cols, num_bins=50):
    for col in cols:
        all_range = int((df[col].max() + 1) - (df[col].min() - 1))
        bin = [i * all_range / num_bins for i in range(all_range)]
        df[col + '_bin'] = pd.cut(df[col], bin, labels=False)
    return df

# count编码
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return df

# 定义交叉特征统计
def cross_cat_num(df, num_col, cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max', # 最大值
                '{}_{}_min'.format(f1, f2): 'min', # 最小值
                '{}_{}_median'.format(f1, f2): 'median', # 中位数
            })
            df = df.merge(feat, on=f1, how='left')
    return df

# 类别特征的二阶交叉
from scipy.stats import entropy
def cross_qua_cat_num(df):
    for f_pair in tqdm([
        ['model', 'brand'], ['model', 'regionCode'], ['brand', 'regionCode']
    ]):
        # 共现次数
        df['_'.join(f_pair) + '_count'] = df.groupby(f_pair)['SaleID'].transform('count')
        # n unique、熵
        df = df.merge(df.groupby(f_pair[0], as_index=False)[f_pair[1]].agg({
            '{}_{}_nunique'.format(f_pair[0], f_pair[1]): 'nunique',
            '{}_{}_ent'.format(f_pair[0], f_pair[1]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[0], how='left')
        df = df.merge(df.groupby(f_pair[1], as_index=False)[f_pair[0]].agg({
            '{}_{}_nunique'.format(f_pair[1], f_pair[0]): 'nunique',
            '{}_{}_ent'.format(f_pair[1], f_pair[0]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[1], how='left')
        # 比例偏好
        df['{}_in_{}_prop'.format(f_pair[0], f_pair[1])] = df['_'.join(f_pair) + '_count'] / df[f_pair[1] + '_count']
        df['{}_in_{}_prop'.format(f_pair[1], f_pair[0])] = df['_'.join(f_pair) + '_count'] / df[f_pair[0] + '_count']
    return df

In [4]:
Train_data = reduce_mem_usage(pd.read_csv('input/used_car_train_20200313.csv',
                                          sep=' '))
TestA_data = reduce_mem_usage(pd.read_csv('input/used_car_testB_20200421.csv',
                                          sep=' '))

print('Train data shape: {}'.format(Train_data.shape))
print('TestA_data shape: {}'.format(TestA_data.shape))

Memory usage of dataframe is 37200128.00 MB
Memory usage after optimization is: 10200232.00 MB
Decreased by 72.6%
Memory usage of dataframe is 12000128.00 MB
Memory usage after optimization is: 3200232.00 MB
Decreased by 73.3%
Train data shape: (150000, 31)
TestA_data shape: (50000, 30)


In [5]:
Train_data.head(10)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,1850,43.34375,3.966797,0.050262,2.160156,1.143555,0.235718,0.10199,0.129517,0.022812,0.097473,-2.880859,2.804688,-2.419922,0.79541,0.914551
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,-,4366,0,0,20160309,3600,45.3125,5.234375,0.137939,1.380859,-1.421875,0.264893,0.121033,0.135742,0.026596,0.020584,-4.902344,2.095703,-1.030273,-1.722656,0.245483
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,6222,45.96875,4.824219,1.319336,-0.998535,-0.99707,0.251465,0.114929,0.165161,0.062164,0.027069,-4.847656,1.803711,1.56543,-0.83252,-0.22998
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,2400,45.6875,4.492188,-0.050629,0.883789,-2.228516,0.274414,0.110291,0.121948,0.033386,0.0,-4.507812,1.286133,-0.501953,-2.4375,-0.47876
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,5200,44.375,2.03125,0.572266,-1.571289,2.246094,0.228027,0.073181,0.091858,0.078796,0.121521,-1.896484,0.910645,0.931152,2.833984,1.923828
5,5,137642,20090602,24.0,10,0.0,1.0,0.0,109,10.0,0.0,3690,0,0,20160319,8000,46.3125,-3.228516,0.156616,-1.727539,-0.345703,0.260254,0.000518,0.119812,0.090942,0.048767,1.885742,-2.722656,2.457031,-0.286865,0.206543
6,6,2402,19990411,13.0,4,0.0,0.0,1.0,150,15.0,0.0,3073,0,0,20160317,3500,46.09375,4.925781,0.113281,1.644531,-1.270508,0.268066,0.117676,0.142334,0.025452,0.028168,-4.902344,1.610352,-0.834473,-1.996094,-0.10321
7,7,165346,19990706,26.0,14,1.0,0.0,0.0,101,15.0,0.0,4000,0,0,20160326,1000,42.25,-3.167969,-0.676758,1.942383,0.524414,0.239502,0.0,0.122925,0.039825,0.082397,3.693359,-0.244995,-2.193359,0.236694,0.195557
8,8,2974,20030205,19.0,1,2.0,1.0,1.0,179,15.0,0.0,4679,0,0,20160326,2850,46.09375,4.894531,0.475342,0.556641,-1.262695,0.263916,0.116577,0.144287,0.039856,0.024384,-4.925781,1.587891,0.075317,-1.550781,0.069458
9,9,82021,19980101,7.0,7,5.0,0.0,0.0,88,15.0,0.0,302,0,0,20160402,650,43.0625,1.666016,-2.201172,3.097656,0.84375,0.262451,0.068237,0.012177,0.010292,0.098755,-1.089844,0.600586,-4.1875,0.198242,-1.025391


In [6]:
TestA_data.head(10)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,200000,133777,20000501,67.0,0,1.0,0.0,0.0,101,15.0,0.0,5019,0,0,20160308,42.15625,-3.095703,-0.721191,1.466797,1.009766,0.236572,0.000241,0.105347,0.046234,0.094543,3.619141,-0.280518,-2.019531,0.979004,0.803223
1,200001,61206,19950211,19.0,6,2.0,0.0,0.0,73,6.0,0.0,1505,0,0,20160310,43.90625,-3.244141,-0.766602,1.276367,-1.06543,0.261475,0.0,0.1203,0.046783,0.0354,2.998047,-1.40625,-1.020508,-1.349609,-0.200562
2,200002,67829,20090606,5.0,5,4.0,0.0,0.0,120,5.0,-,1776,0,0,20160309,45.375,3.373047,-0.965332,-2.447266,0.624512,0.261719,0.09082,0.0,0.079651,0.073608,-3.951172,-0.43335,0.918945,1.634766,1.027344
3,200003,8892,20020601,22.0,9,1.0,0.0,0.0,58,15.0,0.0,26,0,0,20160314,42.78125,4.035156,-0.217407,1.708984,1.119141,0.236084,0.101807,0.098938,0.026825,0.096619,-2.847656,2.800781,-2.525391,1.077148,0.46167
4,200004,76998,20030301,46.0,6,0.0,,0.0,116,15.0,0.0,738,0,0,20160306,43.65625,-3.134766,-1.133789,0.470215,0.134033,0.25708,0.0,0.066711,0.05777,0.068848,2.839844,-1.660156,-0.924316,0.199463,0.450928
5,200005,142813,19990006,37.0,18,6.0,0.0,0.0,125,15.0,0.0,3393,0,0,20160404,43.65625,-3.130859,-2.519531,1.180664,1.295898,0.261475,0.0,0.0,0.046875,0.110718,2.646484,-2.441406,-2.255859,0.712402,-3.314453
6,200006,135370,19980503,36.0,6,4.0,0.0,0.0,75,15.0,0.0,2244,0,0,20160305,41.90625,-3.117188,-2.884766,3.189453,0.860352,0.261963,0.0,0.0,0.018356,0.098389,3.501953,-1.248047,-4.574219,0.569824,1.083984
7,200007,7138,20040201,88.0,14,3.0,0.0,1.0,125,15.0,0.0,155,0,0,20160325,44.875,4.542969,-0.659668,-0.196411,1.53125,0.251465,0.10907,0.027527,0.051117,0.108276,-4.433594,1.144531,-0.750977,1.530273,-0.90332
8,200008,7977,20110209,77.0,0,3.0,1.0,1.0,140,7.0,0.0,1184,0,0,20160328,47.09375,4.433594,0.462402,-1.986328,0.339111,0.259277,0.10907,0.081909,0.076965,0.066772,-5.183594,0.286865,2.080078,0.625977,0.68457
9,200009,104001,19991012,30.0,6,1.0,0.0,0.0,74,15.0,0.0,4874,0,0,20160317,41.5,-3.134766,-1.010742,2.517578,0.414551,0.240112,0.0,0.11084,0.030624,0.080078,3.958984,0.066406,-3.095703,0.205444,0.367188


In [7]:
# 合并数据集
concat_data = pd.concat([Train_data, TestA_data],
                        ignore_index=True) # 重新生成索引

# 'notRepairedDamage'中的'-'用0替换
concat_data['notRepairedDamage'] = concat_data['notRepairedDamage'].replace('-', 0).astype('float16')

In [8]:
# 每列中的缺失值用每列的众数填充
concat_data = concat_data.fillna(concat_data.mode().iloc[0, :])

In [9]:
concat_data.shape

(200000, 31)

In [10]:
concat_data.head(10)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
0,0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,0,0,20160404,1850.0,43.34375,3.966797,0.050262,2.160156,1.143555,0.235718,0.10199,0.129517,0.022812,0.097473,-2.880859,2.804688,-2.419922,0.79541,0.914551
1,1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,0.0,4366,0,0,20160309,3600.0,45.3125,5.234375,0.137939,1.380859,-1.421875,0.264893,0.121033,0.135742,0.026596,0.020584,-4.902344,2.095703,-1.030273,-1.722656,0.245483
2,2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,0,0,20160402,6222.0,45.96875,4.824219,1.319336,-0.998535,-0.99707,0.251465,0.114929,0.165161,0.062164,0.027069,-4.847656,1.803711,1.56543,-0.83252,-0.22998
3,3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,0,0,20160312,2400.0,45.6875,4.492188,-0.050629,0.883789,-2.228516,0.274414,0.110291,0.121948,0.033386,0.0,-4.507812,1.286133,-0.501953,-2.4375,-0.47876
4,4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,0,0,20160313,5200.0,44.375,2.03125,0.572266,-1.571289,2.246094,0.228027,0.073181,0.091858,0.078796,0.121521,-1.896484,0.910645,0.931152,2.833984,1.923828
5,5,137642,20090602,24.0,10,0.0,1.0,0.0,109,10.0,0.0,3690,0,0,20160319,8000.0,46.3125,-3.228516,0.156616,-1.727539,-0.345703,0.260254,0.000518,0.119812,0.090942,0.048767,1.885742,-2.722656,2.457031,-0.286865,0.206543
6,6,2402,19990411,13.0,4,0.0,0.0,1.0,150,15.0,0.0,3073,0,0,20160317,3500.0,46.09375,4.925781,0.113281,1.644531,-1.270508,0.268066,0.117676,0.142334,0.025452,0.028168,-4.902344,1.610352,-0.834473,-1.996094,-0.10321
7,7,165346,19990706,26.0,14,1.0,0.0,0.0,101,15.0,0.0,4000,0,0,20160326,1000.0,42.25,-3.167969,-0.676758,1.942383,0.524414,0.239502,0.0,0.122925,0.039825,0.082397,3.693359,-0.244995,-2.193359,0.236694,0.195557
8,8,2974,20030205,19.0,1,2.0,1.0,1.0,179,15.0,0.0,4679,0,0,20160326,2850.0,46.09375,4.894531,0.475342,0.556641,-1.262695,0.263916,0.116577,0.144287,0.039856,0.024384,-4.925781,1.587891,0.075317,-1.550781,0.069458
9,9,82021,19980101,7.0,7,5.0,0.0,0.0,88,15.0,0.0,302,0,0,20160402,650.0,43.0625,1.666016,-2.201172,3.097656,0.84375,0.262451,0.068237,0.012177,0.010292,0.098755,-1.089844,0.600586,-4.1875,0.198242,-1.025391


In [11]:
concat_data.tail(10)

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
199990,249990,61395,19990906,19.0,5,3.0,0.0,0.0,132,15.0,0.0,3160,0,0,20160331,500.0,43.09375,-3.195312,-1.672852,2.144531,-0.656738,0.265137,0.0,0.070984,0.033417,0.051117,3.181641,-1.394531,-2.490234,-1.06543,-0.442627
199991,249991,72277,20031106,17.0,10,2.0,0.0,1.0,224,15.0,0.0,163,0,0,20160403,500.0,45.96875,-3.212891,0.245117,-1.666992,-1.884766,0.268066,0.000387,0.137573,0.086853,0.002478,1.99707,-2.519531,2.419922,-1.568359,0.269043
199992,249992,29738,20071009,41.0,6,1.0,0.0,0.0,60,8.0,0.0,3929,0,0,20160331,500.0,43.375,3.027344,0.086121,0.496582,1.914062,0.230103,0.087463,0.096252,0.047455,0.116638,-2.238281,2.027344,-1.245117,2.113281,1.605469
199993,249993,35,19970712,13.0,4,2.0,0.0,1.0,193,15.0,0.0,3258,0,0,20160328,500.0,44.71875,3.8125,-0.223511,2.148438,-1.631836,0.266113,0.100708,0.136597,0.018753,0.018051,-3.392578,1.723633,-1.783203,-2.158203,-0.41748
199994,249994,41919,20050807,4.0,4,0.0,0.0,0.0,150,15.0,0.0,5640,0,0,20160330,500.0,46.40625,2.798828,0.323975,-1.174805,-1.170898,0.26709,0.085876,0.109436,0.0672,0.024033,-3.435547,-0.013077,1.506836,-0.947266,0.359863
199995,249995,111443,20041005,4.0,4,0.0,0.0,1.0,150,15.0,0.0,5564,0,0,20160309,500.0,46.3125,-3.304688,0.073364,-0.622559,-0.77832,0.263672,0.000292,0.141846,0.076416,0.039276,2.072266,-2.53125,1.716797,-1.063477,0.32666
199996,249996,152834,20130409,65.0,1,0.0,0.0,0.0,179,4.0,0.0,5220,0,0,20160323,500.0,48.09375,-3.318359,0.96582,-2.671875,0.357422,0.255371,0.000991,0.155884,0.108398,0.067871,1.358398,-3.291016,4.269531,0.140503,0.556152
199997,249997,132531,20041211,4.0,4,0.0,0.0,1.0,147,12.5,0.0,3795,0,0,20160316,500.0,46.15625,-3.304688,-0.015282,-0.28833,-0.687012,0.262939,0.000318,0.141846,0.07196,0.042969,2.166016,-2.417969,1.371094,-1.073242,0.270508
199998,249998,143405,20020702,40.0,1,4.0,0.0,1.0,176,15.0,0.0,61,0,0,20160327,500.0,45.5,-3.197266,-1.141602,-0.434814,-1.844727,0.282227,2.3e-05,0.067505,0.067505,0.009003,2.029297,-2.939453,0.568848,-1.717773,0.316406
199999,249999,78202,20090708,32.0,8,1.0,0.0,0.0,0,3.0,0.0,4158,0,0,20160401,500.0,44.28125,4.179688,0.546875,-0.775879,1.790039,0.231445,0.103943,0.096008,0.062317,0.110168,-3.689453,2.033203,0.109131,2.203125,0.847656


In [12]:
# 处理异常值
concat_data['power'][concat_data['power'] > 600] = 600
concat_data['power'][concat_data['power'] < 1] = 1

concat_data['v_13'][concat_data['v_13'] > 6] = 6
concat_data['v_14'][concat_data['v_14'] > 4] = 4

In [13]:
# v系列特征之间相加
for j in ['v_' + str(i) for i in range(14)]:
    for k in ['v_' + str(m) for m in range(14)]:
        concat_data[j + '+' + k] = concat_data[j] + concat_data[k]

# 原始特征与v系列特征之间相乘
for i in ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode']:
    for j in ['v_' + str(k) for k in range(14)]:
        concat_data[i + '*' + j] = concat_data[i] * concat_data[j]
    
concat_data.shape

(200000, 353)

In [14]:
# 提取日期信息
date_cols = ['regDate', 'creatDate']
concat_data = date_tran(concat_data, date_cols)

concat_data.shape

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.38it/s]


(200000, 361)

In [15]:
data = concat_data.copy()

data.columns

Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer',
       ...
       'regionCode*v_12', 'regionCode*v_13', 'regDate_year', 'regDate_month',
       'regDate_day', 'regDate_dayofweek', 'creatDate_year', 'creatDate_month',
       'creatDate_day', 'creatDate_dayofweek'],
      dtype='object', length=361)

In [16]:
# count编码
count_list = ['regDate', 'creatDate', 'model', 'brand', 'regionCode', 'bodyType', 'fuelType', 'name',
              'regDate_year', 'regDate_month', 'regDate_day', 'regDate_dayofweek',
              'creatDate_month', 'creatDate_day', 'creatDate_dayofweek', 'kilometer']
data = count_coding(data, count_list)

In [17]:
# 特征构造
# 使用时间：data['creatDate'] - data['regDate']，反应汽车使用时间，一般来说价格与使用时间成反比
# 不过要注意，数据里有时间出错的格式，所以我们需要 errors='coerce'
data['used_time1'] = (pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') - 
                      pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
data['used_time2'] = (pd.datetime.now() - pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days                        
data['used_time3'] = (pd.datetime.now() - pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') ).dt.days

In [18]:
# 分桶，注意：kilometer应该是已经离散化了的
cut_cols = ['power'] + ['used_time1', 'used_time2', 'used_time3']
data = cut_group(data, cut_cols, 50)

In [19]:
# 用数值特征对类别特征做统计刻画，随便挑了几个跟price相关性最高的匿名特征
cross_cat = ['model', 'brand','regDate_year']
cross_num = ['v_0','v_3', 'v_4', 'v_8', 'v_12','power', 'used_time1']
data = cross_cat_num(data, cross_num, cross_cat) # 一阶交叉
# data = cross_qua_cat_num(data) # 二阶交叉

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/7 [00:00<?, ?it/s][A
 14%|████████████                                                                        | 1/7 [00:01<00:08,  1.34s/it][A
 29%|████████████████████████                                                            | 2/7 [00:01<00:05,  1.03s/it][A
 43%|████████████████████████████████████                                                | 3/7 [00:01<00:03,  1.22it/s][A
 57%|████████████████████████████████████████████████                                    | 4/7 [00:02<00:02,  1.49it/s][A
 71%|████████████████████████████████████████████████████████████                        | 5/7 [00:02<00:01,  1.77it/s][A
 86%|████████████████████████████████████████████████████████████████████████            | 6/7 [00:02<00:00,  2.02it/s][A
100%|██████████████

In [20]:
# 选择特征列
numerical_cols = data.columns
print(numerical_cols)

Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer',
       ...
       'regDate_year_v_8_median', 'regDate_year_v_12_max',
       'regDate_year_v_12_min', 'regDate_year_v_12_median',
       'regDate_year_power_max', 'regDate_year_power_min',
       'regDate_year_power_median', 'regDate_year_used_time1_max',
       'regDate_year_used_time1_min', 'regDate_year_used_time1_median'],
      dtype='object', length=447)


In [21]:
cat_fea = ['SaleID', 'offerType', 'seller']
feature_cols = [col for col in numerical_cols if col not in cat_fea]
feature_cols = [col for col in feature_cols if col not in ['price']]

# 将训练集和测试集分开
X_data = data.iloc[:len(Train_data), :][feature_cols]
Y_data = Train_data['price']
X_test  = data.iloc[len(Train_data):, :][feature_cols]

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg(['mean', 'size'])
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new

In [23]:
class_list = ['model', 'brand', 'name', 'regionCode'] + date_cols
MeanEncodeFeature = class_list
ME = MeanEncoder(categorical_features=MeanEncodeFeature, n_splits=5, target_type='regression', prior_weight_func=None)
X_data = ME.fit_transform(X_data, Y_data)
X_test = ME.transform(X_test)

In [24]:
X_data['price'] = Train_data['price']

In [25]:
# target encoding目标编码，回归场景相对来说做目标编码的选择更多，不仅可以做均值编码，还可以做标准差编码、中位数编码等
enc_cols = []
stats_default_dict = {
    'max': X_data['price'].max(),
    'min': X_data['price'].min(),
    'median': X_data['price'].median(),
    'mean': X_data['price'].mean(),
    'sum': X_data['price'].sum(),
    'std': X_data['price'].std(),
    'skew': X_data['price'].skew(), # 偏度
    'kurt': X_data['price'].kurt(), # 峰度
    'mad': X_data['price'].mad() # mean absolute deviation 平均绝对偏差
}

# 暂且选择这三种编码
enc_stats = ['max', 'min', 'mean']
skf = KFold(n_splits=10, shuffle=True, random_state=42)
for f in tqdm(['regionCode', 'brand', 'regDate_year' ,'creatDate_year', 'kilometer', 'model']):
    enc_dict = {}
    for stat in enc_stats:
        enc_dict['{}_target_{}'.format(f, stat)] = stat
        X_data['{}_target_{}'.format(f, stat)] = 0
        X_test['{}_target_{}'.format(f, stat)] = 0
        enc_cols.append('{}_target_{}'.format(f, stat))
    for i, (trn_idx, val_idx) in enumerate(skf.split(X_data, Y_data)):
        trn_x, val_x = X_data.iloc[trn_idx].reset_index(drop=True), X_data.iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['price'].agg(enc_dict)
        val_x = val_x[[f]].merge(enc_df, on=f, how='left')
        test_x = X_test[[f]].merge(enc_df, on=f, how='left')
        for stat in enc_stats:
            val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(stats_default_dict[stat])
            X_data.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values 
            X_test['{}_target_{}'.format(f, stat)] += test_x['{}_target_{}'.format(f, stat)].values / skf.n_splits

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:23<00:00,  3.98s/it]


In [26]:
drop_list = ['regDate', 'creatDate', 'brand_power_min', 'regDate_year_power_min']
x_train = X_data.drop(drop_list + ['price'], axis=1)
x_test = X_test.drop(drop_list, axis=1)
x_train.shape

(150000, 463)

In [27]:
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

In [28]:
# 特征归一化
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
min_max_scaler.fit(pd.concat([x_train, x_test]).values)
all_data = min_max_scaler.transform(pd.concat([x_train, x_test]).values)

In [29]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=200)
all_pca = pca.fit_transform(all_data)
X_pca = all_pca[:len(x_train)]
test = all_pca[len(x_train):]
y = Train_data['price'].values

In [30]:
from keras.layers import Conv1D, Activation, MaxPool1D, Flatten, Dense
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, merge, Add
def NN_model(input_dim):
    init = keras.initializers.glorot_uniform(seed=1)
    model = keras.models.Sequential()
    model.add(Dense(units=300, input_dim=input_dim, kernel_initializer=init, activation='softplus'))
    # model.add(Dropout(0.2))
    model.add(Dense(units=300, kernel_initializer=init, activation='softplus'))
    # model.add(Dropout(0.2))
    model.add(Dense(units=64, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=32, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=8, kernel_initializer=init, activation='softplus'))
    model.add(Dense(units=1))
    return model

Using TensorFlow backend.


In [31]:
from keras.callbacks import Callback, EarlyStopping
class Metric(Callback):
    def __init__(self, model, callbacks, data):
        super().__init__()
        self.model = model
        self.callbacks = callbacks
        self.data = data

    def on_train_begin(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_begin(logs)

    def on_train_end(self, logs=None):
        for callback in self.callbacks:
            callback.on_train_end(logs)

    def on_epoch_end(self, batch, logs=None):
        X_train, y_train = self.data[0][0], self.data[0][1]
        y_pred3 = self.model.predict(X_train)
        y_pred = np.zeros((len(y_pred3), ))
        y_true = np.zeros((len(y_pred3), ))
        for i in range(len(y_pred3)):
            y_pred[i] = y_pred3[i]
        for i in range(len(y_pred3)):
            y_true[i] = y_train[i]
        trn_s = mean_absolute_error(y_true, y_pred)
        logs['trn_score'] = trn_s
        
        X_val, y_val = self.data[1][0], self.data[1][1]
        y_pred3 = self.model.predict(X_val)
        y_pred = np.zeros((len(y_pred3), ))
        y_true = np.zeros((len(y_pred3), ))
        for i in range(len(y_pred3)):
            y_pred[i] = y_pred3[i]
        for i in range(len(y_pred3)):
            y_true[i] = y_val[i]
        val_s = mean_absolute_error(y_true, y_pred)
        logs['val_score'] = val_s
        print('trn_score', trn_s, 'val_score', val_s)

        for callback in self.callbacks:
            callback.on_epoch_end(batch, logs)

In [32]:
import keras.backend as K
from keras.callbacks import LearningRateScheduler
  
def scheduler(epoch):
    # 每隔100个epoch，学习率减小为原来的1/10
    if epoch % 20 == 0 and epoch != 0:
        lr = K.get_value(model.optimizer.lr)
        K.set_value(model.optimizer.lr, lr * 0.6)
        print("lr changed to {}".format(lr * 0.6))
    return K.get_value(model.optimizer.lr)
reduce_lr = LearningRateScheduler(scheduler)
# model.fit(train_x, train_y, batch_size=32, epochs=5, callbacks=[reduce_lr])

In [33]:
n_splits = 6
kf = KFold(n_splits=n_splits, shuffle=True)

import keras 

b_size = 2000
max_epochs = 145
oof_pred = np.zeros((len(X_pca), ))

sub = pd.read_csv('input/used_car_testB_20200421.csv', sep=' ')[['SaleID']].copy()
sub['price'] = 0

avg_mae = 0
for fold, (trn_idx, val_idx) in enumerate(kf.split(X_pca, y)):
    print('fold:', fold)
    X_train, y_train = X_pca[trn_idx], y[trn_idx]
    X_val, y_val = X_pca[val_idx], y[val_idx]
    
    model = NN_model(X_train.shape[1])
    simple_adam = keras.optimizers.Adam(lr = 0.015)
    
    model.compile(loss='mae', optimizer=simple_adam,metrics=['mae'])
    es = EarlyStopping(monitor='val_score', patience=10, verbose=2, mode='min', restore_best_weights=True,)
    es.set_model(model)
    metric = Metric(model, [es], [(X_train, y_train), (X_val, y_val)])
    model.fit(X_train, y_train, batch_size=b_size, epochs=max_epochs, 
              validation_data = [X_val, y_val],
              callbacks=[reduce_lr], shuffle=True, verbose=2)
    y_pred3 = model.predict(X_val)
    y_pred = np.zeros((len(y_pred3), ))
    sub['price'] += model.predict(test).reshape(-1,) / n_splits
    for i in range(len(y_pred3)):
        y_pred[i] = y_pred3[i]
        
    oof_pred[val_idx] = y_pred
    val_mae = mean_absolute_error(y[val_idx], y_pred)
    avg_mae += val_mae / n_splits
    print()
    print('val_mae is:{}'.format(val_mae))
    print()
mean_absolute_error(y, oof_pred)

fold: 0
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 125000 samples, validate on 25000 samples
Epoch 1/145
 - 2s - loss: 2742.0239 - mae: 2742.0237 - val_loss: 980.4724 - val_mae: 980.4724
Epoch 2/145
 - 2s - loss: 861.4175 - mae: 861.4174 - val_loss: 761.1672 - val_mae: 761.1672
Epoch 3/145
 - 2s - loss: 690.1249 - mae: 690.1250 - val_loss: 652.8037 - val_mae: 652.8036
Epoch 4/145
 - 2s - loss: 624.0540 - mae: 624.0540 - val_loss: 660.0627 - val_mae: 660.0627
Epoch 5/145
 - 2s - loss: 577.0155 - mae: 577.0156 - val_loss: 578.1737 - val_mae: 578.1737
Epoch 6/145
 - 2s - loss: 579.0853 - mae: 579.0853 - val_loss: 539.6563 - val_mae: 539.6563
Epoch 7/145
 - 2s - loss: 541.2288 - mae: 541.2288 - val_loss: 543.8874 - val_mae: 543.8874
Epoch 8/145
 - 2s - loss: 538.8325 - mae: 538.8325 - val_loss: 520.2231 - val_mae: 520.2231
Epoch 9/145
 - 2s - loss: 523.1769 - mae: 523.1769 - val_loss: 511.0036 - val_mae: 

 - 2s - loss: 376.6207 - mae: 376.6208 - val_loss: 415.0958 - val_mae: 415.0958
Epoch 82/145
 - 2s - loss: 374.8157 - mae: 374.8157 - val_loss: 415.2751 - val_mae: 415.2751
Epoch 83/145
 - 2s - loss: 374.0000 - mae: 373.9999 - val_loss: 414.5203 - val_mae: 414.5202
Epoch 84/145
 - 2s - loss: 375.3843 - mae: 375.3843 - val_loss: 414.8520 - val_mae: 414.8520
Epoch 85/145
 - 2s - loss: 374.4219 - mae: 374.4220 - val_loss: 415.9802 - val_mae: 415.9803
Epoch 86/145
 - 2s - loss: 374.2627 - mae: 374.2627 - val_loss: 415.8082 - val_mae: 415.8082
Epoch 87/145
 - 2s - loss: 373.0823 - mae: 373.0823 - val_loss: 416.6590 - val_mae: 416.6590
Epoch 88/145
 - 2s - loss: 373.2995 - mae: 373.2994 - val_loss: 415.5583 - val_mae: 415.5583
Epoch 89/145
 - 2s - loss: 373.0149 - mae: 373.0149 - val_loss: 416.0504 - val_mae: 416.0504
Epoch 90/145
 - 2s - loss: 373.7417 - mae: 373.7417 - val_loss: 415.7240 - val_mae: 415.7240
Epoch 91/145
 - 2s - loss: 372.4730 - mae: 372.4730 - val_loss: 416.6339 - val_mae:

Epoch 22/145
 - 2s - loss: 447.0935 - mae: 447.0934 - val_loss: 460.5277 - val_mae: 460.5278
Epoch 23/145
 - 2s - loss: 443.3994 - mae: 443.3995 - val_loss: 439.3816 - val_mae: 439.3817
Epoch 24/145
 - 2s - loss: 440.5118 - mae: 440.5117 - val_loss: 441.7517 - val_mae: 441.7517
Epoch 25/145
 - 2s - loss: 443.6152 - mae: 443.6152 - val_loss: 443.5074 - val_mae: 443.5073
Epoch 26/145
 - 2s - loss: 435.7277 - mae: 435.7278 - val_loss: 435.6730 - val_mae: 435.6730
Epoch 27/145
 - 2s - loss: 442.3390 - mae: 442.3391 - val_loss: 456.7089 - val_mae: 456.7088
Epoch 28/145
 - 2s - loss: 448.0997 - mae: 448.0996 - val_loss: 447.1020 - val_mae: 447.1020
Epoch 29/145
 - 2s - loss: 438.7324 - mae: 438.7323 - val_loss: 449.1345 - val_mae: 449.1346
Epoch 30/145
 - 2s - loss: 469.5106 - mae: 469.5106 - val_loss: 456.5953 - val_mae: 456.5953
Epoch 31/145
 - 2s - loss: 448.0275 - mae: 448.0275 - val_loss: 435.0923 - val_mae: 435.0923
Epoch 32/145
 - 2s - loss: 432.6475 - mae: 432.6475 - val_loss: 446.06

Epoch 109/145
 - 2s - loss: 371.7241 - mae: 371.7240 - val_loss: 410.8354 - val_mae: 410.8354
Epoch 110/145
 - 2s - loss: 371.7239 - mae: 371.7239 - val_loss: 409.6559 - val_mae: 409.6559
Epoch 111/145
 - 2s - loss: 371.6427 - mae: 371.6428 - val_loss: 409.1714 - val_mae: 409.1714
Epoch 112/145
 - 2s - loss: 371.5793 - mae: 371.5793 - val_loss: 412.6486 - val_mae: 412.6486
Epoch 113/145
 - 2s - loss: 371.4274 - mae: 371.4274 - val_loss: 412.5255 - val_mae: 412.5255
Epoch 114/145
 - 2s - loss: 371.1470 - mae: 371.1470 - val_loss: 411.3296 - val_mae: 411.3297
Epoch 115/145
 - 2s - loss: 370.8207 - mae: 370.8206 - val_loss: 409.6770 - val_mae: 409.6770
Epoch 116/145
 - 2s - loss: 371.2231 - mae: 371.2231 - val_loss: 410.3326 - val_mae: 410.3326
Epoch 117/145
 - 2s - loss: 370.8570 - mae: 370.8568 - val_loss: 410.4394 - val_mae: 410.4394
Epoch 118/145
 - 2s - loss: 369.9770 - mae: 369.9770 - val_loss: 412.4129 - val_mae: 412.4128
Epoch 119/145
 - 2s - loss: 370.3593 - mae: 370.3593 - val_l

Epoch 50/145
 - 2s - loss: 407.7019 - mae: 407.7019 - val_loss: 472.5515 - val_mae: 472.5515
Epoch 51/145
 - 2s - loss: 415.0025 - mae: 415.0025 - val_loss: 430.0688 - val_mae: 430.0688
Epoch 52/145
 - 2s - loss: 404.9827 - mae: 404.9826 - val_loss: 434.9214 - val_mae: 434.9214
Epoch 53/145
 - 2s - loss: 404.8511 - mae: 404.8510 - val_loss: 436.8258 - val_mae: 436.8258
Epoch 54/145
 - 2s - loss: 403.3484 - mae: 403.3484 - val_loss: 431.6231 - val_mae: 431.6231
Epoch 55/145
 - 2s - loss: 402.0684 - mae: 402.0684 - val_loss: 430.7159 - val_mae: 430.7159
Epoch 56/145
 - 2s - loss: 403.2676 - mae: 403.2676 - val_loss: 436.2518 - val_mae: 436.2518
Epoch 57/145
 - 2s - loss: 404.0948 - mae: 404.0948 - val_loss: 438.6568 - val_mae: 438.6568
Epoch 58/145
 - 2s - loss: 401.0609 - mae: 401.0609 - val_loss: 425.1663 - val_mae: 425.1664
Epoch 59/145
 - 2s - loss: 405.2765 - mae: 405.2766 - val_loss: 445.1706 - val_mae: 445.1707
Epoch 60/145
 - 2s - loss: 400.0802 - mae: 400.0802 - val_loss: 427.08

Epoch 137/145
 - 2s - loss: 359.9695 - mae: 359.9695 - val_loss: 417.3195 - val_mae: 417.3194
Epoch 138/145
 - 2s - loss: 360.3234 - mae: 360.3234 - val_loss: 415.0403 - val_mae: 415.0403
Epoch 139/145
 - 2s - loss: 359.1211 - mae: 359.1212 - val_loss: 415.4134 - val_mae: 415.4135
Epoch 140/145
 - 2s - loss: 358.7639 - mae: 358.7639 - val_loss: 414.9116 - val_mae: 414.9116
Epoch 141/145
lr changed to 0.0004199039773084223
 - 2s - loss: 358.1596 - mae: 358.1596 - val_loss: 414.9217 - val_mae: 414.9217
Epoch 142/145
 - 2s - loss: 357.7752 - mae: 357.7753 - val_loss: 415.1181 - val_mae: 415.1180
Epoch 143/145
 - 2s - loss: 357.5750 - mae: 357.5750 - val_loss: 414.3006 - val_mae: 414.3006
Epoch 144/145
 - 2s - loss: 357.5714 - mae: 357.5714 - val_loss: 414.8384 - val_mae: 414.8384
Epoch 145/145
 - 2s - loss: 357.5117 - mae: 357.5117 - val_loss: 416.1987 - val_mae: 416.1987

val_mae is:416.1987109146881

fold: 3
Train on 125000 samples, validate on 25000 samples
Epoch 1/145
 - 2s - loss: 28

Epoch 78/145
 - 2s - loss: 381.5580 - mae: 381.5580 - val_loss: 419.4184 - val_mae: 419.4184
Epoch 79/145
 - 2s - loss: 379.3259 - mae: 379.3260 - val_loss: 417.3739 - val_mae: 417.3739
Epoch 80/145
 - 2s - loss: 380.6265 - mae: 380.6266 - val_loss: 418.5114 - val_mae: 418.5113
Epoch 81/145
lr changed to 0.0019439998548477888
 - 2s - loss: 375.4766 - mae: 375.4766 - val_loss: 417.8200 - val_mae: 417.8199
Epoch 82/145
 - 2s - loss: 374.5808 - mae: 374.5808 - val_loss: 416.8601 - val_mae: 416.8602
Epoch 83/145
 - 2s - loss: 374.3008 - mae: 374.3009 - val_loss: 416.2990 - val_mae: 416.2991
Epoch 84/145
 - 2s - loss: 373.9483 - mae: 373.9482 - val_loss: 419.9259 - val_mae: 419.9259
Epoch 85/145
 - 2s - loss: 374.6487 - mae: 374.6488 - val_loss: 416.9067 - val_mae: 416.9067
Epoch 86/145
 - 2s - loss: 373.9052 - mae: 373.9052 - val_loss: 417.4309 - val_mae: 417.4309
Epoch 87/145
 - 2s - loss: 373.8739 - mae: 373.8740 - val_loss: 419.1764 - val_mae: 419.1765
Epoch 88/145
 - 2s - loss: 373.671

Epoch 19/145
 - 2s - loss: 486.3210 - mae: 486.3210 - val_loss: 671.6370 - val_mae: 671.6370
Epoch 20/145
 - 2s - loss: 508.8490 - mae: 508.8491 - val_loss: 496.9035 - val_mae: 496.9035
Epoch 21/145
lr changed to 0.008999999798834323
 - 2s - loss: 474.2849 - mae: 474.2849 - val_loss: 446.6661 - val_mae: 446.6660
Epoch 22/145
 - 2s - loss: 446.3491 - mae: 446.3492 - val_loss: 448.8685 - val_mae: 448.8685
Epoch 23/145
 - 2s - loss: 441.5642 - mae: 441.5641 - val_loss: 438.9290 - val_mae: 438.9290
Epoch 24/145
 - 2s - loss: 441.5792 - mae: 441.5792 - val_loss: 447.0553 - val_mae: 447.0554
Epoch 25/145
 - 2s - loss: 444.5205 - mae: 444.5205 - val_loss: 442.6793 - val_mae: 442.6793
Epoch 26/145
 - 2s - loss: 439.3547 - mae: 439.3547 - val_loss: 443.9923 - val_mae: 443.9922
Epoch 27/145
 - 2s - loss: 440.9442 - mae: 440.9442 - val_loss: 437.3435 - val_mae: 437.3436
Epoch 28/145
 - 2s - loss: 436.5928 - mae: 436.5928 - val_loss: 436.6674 - val_mae: 436.6674
Epoch 29/145
 - 2s - loss: 450.3144

 - 2s - loss: 370.4476 - mae: 370.4475 - val_loss: 409.0104 - val_mae: 409.0104
Epoch 106/145
 - 2s - loss: 371.5492 - mae: 371.5493 - val_loss: 408.6798 - val_mae: 408.6798
Epoch 107/145
 - 2s - loss: 371.2585 - mae: 371.2585 - val_loss: 409.6740 - val_mae: 409.6740
Epoch 108/145
 - 2s - loss: 370.5381 - mae: 370.5381 - val_loss: 408.9044 - val_mae: 408.9044
Epoch 109/145
 - 2s - loss: 369.7541 - mae: 369.7541 - val_loss: 409.0427 - val_mae: 409.0427
Epoch 110/145
 - 2s - loss: 370.2755 - mae: 370.2755 - val_loss: 408.5195 - val_mae: 408.5195
Epoch 111/145
 - 2s - loss: 369.5392 - mae: 369.5393 - val_loss: 408.2675 - val_mae: 408.2675
Epoch 112/145
 - 2s - loss: 369.4269 - mae: 369.4269 - val_loss: 409.1914 - val_mae: 409.1914
Epoch 113/145
 - 2s - loss: 369.3304 - mae: 369.3304 - val_loss: 407.9892 - val_mae: 407.9892
Epoch 114/145
 - 2s - loss: 368.7506 - mae: 368.7505 - val_loss: 408.5433 - val_mae: 408.5432
Epoch 115/145
 - 2s - loss: 368.7279 - mae: 368.7280 - val_loss: 410.7921 

Epoch 46/145
 - 2s - loss: 407.9708 - mae: 407.9709 - val_loss: 434.8809 - val_mae: 434.8809
Epoch 47/145
 - 2s - loss: 406.4875 - mae: 406.4874 - val_loss: 435.9003 - val_mae: 435.9004
Epoch 48/145
 - 2s - loss: 414.7071 - mae: 414.7072 - val_loss: 437.6923 - val_mae: 437.6923
Epoch 49/145
 - 2s - loss: 405.7546 - mae: 405.7546 - val_loss: 437.2132 - val_mae: 437.2132
Epoch 50/145
 - 2s - loss: 411.1179 - mae: 411.1179 - val_loss: 438.4213 - val_mae: 438.4213
Epoch 51/145
 - 2s - loss: 404.7778 - mae: 404.7778 - val_loss: 435.5018 - val_mae: 435.5019
Epoch 52/145
 - 2s - loss: 402.4938 - mae: 402.4938 - val_loss: 436.0253 - val_mae: 436.0253
Epoch 53/145
 - 2s - loss: 400.5695 - mae: 400.5694 - val_loss: 433.4523 - val_mae: 433.4523
Epoch 54/145
 - 2s - loss: 401.3205 - mae: 401.3206 - val_loss: 433.6615 - val_mae: 433.6615
Epoch 55/145
 - 2s - loss: 400.8299 - mae: 400.8299 - val_loss: 431.1335 - val_mae: 431.1335
Epoch 56/145
 - 2s - loss: 400.1628 - mae: 400.1628 - val_loss: 452.10

Epoch 133/145
 - 2s - loss: 357.5971 - mae: 357.5971 - val_loss: 417.5558 - val_mae: 417.5558
Epoch 134/145
 - 2s - loss: 358.3248 - mae: 358.3248 - val_loss: 419.3984 - val_mae: 419.3983
Epoch 135/145
 - 2s - loss: 357.6371 - mae: 357.6371 - val_loss: 418.6940 - val_mae: 418.6940
Epoch 136/145
 - 2s - loss: 357.4288 - mae: 357.4288 - val_loss: 417.7966 - val_mae: 417.7966
Epoch 137/145
 - 2s - loss: 357.4254 - mae: 357.4254 - val_loss: 419.3640 - val_mae: 419.3640
Epoch 138/145
 - 2s - loss: 357.2710 - mae: 357.2710 - val_loss: 418.5160 - val_mae: 418.5160
Epoch 139/145
 - 2s - loss: 356.9453 - mae: 356.9453 - val_loss: 418.4182 - val_mae: 418.4182
Epoch 140/145
 - 2s - loss: 356.7634 - mae: 356.7634 - val_loss: 417.5063 - val_mae: 417.5063
Epoch 141/145
lr changed to 0.0004199039773084223
 - 2s - loss: 355.7663 - mae: 355.7663 - val_loss: 417.9978 - val_mae: 417.9978
Epoch 142/145
 - 2s - loss: 355.5031 - mae: 355.5032 - val_loss: 418.1028 - val_mae: 418.1028
Epoch 143/145
 - 2s - lo

413.12414704614935

In [34]:
sub.head(20)

Unnamed: 0,SaleID,price
0,200000,1274.86235
1,200001,2006.420776
2,200002,8742.022217
3,200003,1254.729889
4,200004,1982.045349
5,200005,1118.820114
6,200006,442.411957
7,200007,3531.691284
8,200008,13751.573242
9,200009,602.167061


In [35]:
sub.to_csv('submit/nn_sub_{}_{}.csv'.format('mae', sub['price'].mean()), index=False)