In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import gc
import xgboost as xgb
import math
from joblib import Parallel, delayed

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings

warnings.filterwarnings('ignore')

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
def timestamp2string(timeStamp):
    try:
        d = datetime.fromtimestamp(timeStamp)
        str1 = d.strftime("%Y-%m-%d %H:%M:%S")
        # 2015-08-28 16:43:37'
        return datetime.strptime(str1,'%Y-%m-%d %H:%M:%S')
    except Exception as e:
        return ''

In [3]:
def get_psi(c):
    psi_res = pd.DataFrame()
    psi_dict={}
    # for c in tqdm(f_cols):
    try:
        t_train = x_train[c].fillna(-998)
        t_test = x_test[c].fillna(-998)
        #获取切分点
        bins=[]
        for i in np.arange(0,1.1,0.2):
            bins.append(t_train.quantile(i))
        bins=sorted(set(bins))
        bins[0]=-np.inf
        bins[-1]=np.inf
        #计算psi
        t_psi = pd.DataFrame()
        t_psi['train'] = pd.cut(t_train,bins).value_counts().sort_index()
        t_psi['test'] = pd.cut(t_test,bins).value_counts()
        t_psi.index=[str(x) for x in t_psi.index]
        t_psi.loc['总计',:] = t_psi.sum()
        t_psi['train_rate'] = t_psi['train']/t_psi.loc['总计','train']
        t_psi['test_rate'] = t_psi['test']/t_psi.loc['总计','test']
        t_psi['psi'] = (t_psi['test_rate']-t_psi['train_rate'])*(np.log(t_psi['test_rate'])-np.log(t_psi['train_rate']))
        t_psi.loc['总计','psi'] = t_psi['psi'].sum()
        t_psi.index.name=c
        #汇总
        t_res = pd.DataFrame([[c,t_psi.loc['总计','psi']]],
                             columns=['变量名','PSI'])
        psi_res = pd.concat([psi_res,t_res])
        psi_dict[c]=t_psi
        print(c,'done')
    except:
        print(c,'error')
    return psi_res #, psi_dict

In [4]:
def correlation(df, threshold=0.98):
    """
    特征相关性计算
    @param df: 
    @param threshold: 
    @return: 
    """
    col_corr = set()
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colName = corr_matrix.columns[i]
                col_corr.add(colName)
    return col_corr

In [5]:
train_df = pd.read_hdf('../input/train.h5')
test_df = pd.read_hdf('../input/test.h5')
sub = pd.DataFrame(test_df['time'])

In [6]:
train_df = train_df[train_df['temperature'].notnull()]
train_df = train_df.fillna(method='bfill')
test_df = test_df.fillna(method='bfill')
gc.collect()

64

In [7]:
train_df.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec', 'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                    'indoorHum', 'indoorAtmo', 'temperature']
test_df.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec', 'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                   'indoorHum', 'indoorAtmo']

In [8]:
print('train_df.shape: ', train_df.shape)
train_df = train_df.loc[(train_df['outdoorTemp'] >= test_df['outdoorTemp'].min()) & (train_df['outdoorTemp'] <= test_df['outdoorTemp'].max())]
print('处理后 train_df.shape: ', train_df.shape)

train_df.shape:  (24807, 13)
处理后 train_df.shape:  (19338, 13)


In [9]:
train_count = train_df.shape[0]
y_train = train_df['temperature'].values - train_df['outdoorTemp'].values

In [10]:
train_df['indoorHum-outdoorHum'] = train_df['indoorHum'] - train_df['outdoorHum']
train_df['indoorAtmo-outdoorAtmo'] = train_df['indoorAtmo'] - train_df['outdoorAtmo']

test_df['indoorHum-outdoorHum'] = test_df['indoorHum'] - test_df['outdoorHum']
test_df['indoorAtmo-outdoorAtmo'] = test_df['indoorAtmo'] - test_df['outdoorAtmo']

In [11]:
train_df.loc[train_df['indoorAtmo-outdoorAtmo'] > 400, 'indoorAtmo-outdoorAtmo'] = 75
train_df.loc[train_df['indoorAtmo-outdoorAtmo'] < -400, 'indoorAtmo-outdoorAtmo'] = -93

test_df.loc[test_df['indoorAtmo-outdoorAtmo'] > 400, 'indoorAtmo-outdoorAtmo'] = 75
test_df.loc[test_df['indoorAtmo-outdoorAtmo'] < -400, 'indoorAtmo-outdoorAtmo'] = -93

In [12]:
train_df.loc[train_df['indoorHum-outdoorHum'] > 15, 'indoorHum-outdoorHum'] = 15
test_df.loc[test_df['indoorHum-outdoorHum'] > 15, 'indoorHum-outdoorHum'] = 15

In [13]:
data_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

del train_df, test_df
gc.collect()

60

In [14]:
numerical_features = ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo', 'indoorHum-outdoorHum', 'indoorAtmo-outdoorAtmo']
diff_features = ['{}_diff'.format(i) for i in numerical_features]
numerical_diff_features = numerical_features + diff_features

In [15]:
for i in tqdm(numerical_features):
    data_df['{}_diff'.format(i)] = data_df[i].diff()

100%|██████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1169.96it/s]


In [16]:
data_df.fillna(method='bfill', inplace=True)

In [17]:
data_df['datetime'] = data_df['time'].apply(timestamp2string)

In [18]:
for i in numerical_diff_features:
    for j in ['1D', '2D', '3D']:
        data_df.set_index('datetime', inplace=True)
        tmp = data_df.groupby('hour')[i].rolling(j, closed='left', min_periods=2).agg({
            '{}_{}_rolling_mean'.format(i, j): 'mean',
            '{}_{}_rolling_median'.format(i, j): 'median',
            '{}_{}_rolling_max'.format(i, j): 'max',
            '{}_{}_rolling_min'.format(i, j): 'min',
            '{}_{}_rolling_sum'.format(i, j): 'sum',
            '{}_{}_rolling_std'.format(i, j): 'std',
            '{}_{}_rolling_skew'.format(i, j): 'skew'
        })
        tmp.reset_index(inplace=True)
        data_df.reset_index(inplace=True)
        data_df = data_df.merge(tmp, on=['datetime', 'hour'], how='left')
        del tmp
        gc.collect()


for i in numerical_diff_features:
    data_df.set_index('datetime', inplace=True)
    tmp = data_df.groupby('hour')[i].expanding(min_periods=2).agg({
        '{}_expanding_mean'.format(i): 'mean',
        '{}_expanding_median'.format(i): 'median',
        '{}_expanding_max'.format(i): 'max',
        '{}_expanding_min'.format(i): 'min',
        '{}_expanding_sum'.format(i): 'sum',
        '{}_expanding_std'.format(i): 'std',
        '{}_expanding_skew'.format(i): 'skew',
    })
    tmp.reset_index(inplace=True)
    data_df.reset_index(inplace=True)
    data_df = data_df.merge(tmp, on=['datetime', 'hour'], how='left')
    del tmp
    gc.collect()

In [19]:
data_df.drop('datetime', axis=1, inplace=True)

In [20]:
data_df.fillna(method='bfill', inplace=True)

In [21]:
# 基本聚合特征
group_feats = []
for f in tqdm(numerical_diff_features):
    data_df['MDH_{}_medi'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('median')
    data_df['MDH_{}_mean'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('mean')
    data_df['MDH_{}_max'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('max')
    data_df['MDH_{}_min'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('min')
    data_df['MDH_{}_sum'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('sum')
    data_df['MDH_{}_std'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('std')
    data_df['MDH_{}_skew'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('skew')
    
    data_df['MD_{}_medi'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('median')
    data_df['MD_{}_mean'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('mean')
    data_df['MD_{}_max'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('max')
    data_df['MD_{}_min'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('min')
    data_df['MD_{}_sum'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('sum')
    data_df['MD_{}_std'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('std')
    data_df['MD_{}_skew'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('skew')

    group_feats.append('MDH_{}_medi'.format(f))
    group_feats.append('MDH_{}_mean'.format(f))
#     group_feats.append('MDH_{}_max'.format(f))
#     group_feats.append('MDH_{}_min'.format(f))
#     group_feats.append('MDH_{}_sum'.format(f))
#     group_feats.append('MDH_{}_std'.format(f))
#     group_feats.append('MDH_{}_skew'.format(f))
    
    group_feats.append('MD_{}_medi'.format(f))
    group_feats.append('MD_{}_mean'.format(f))
#     group_feats.append('MD_{}_max'.format(f))
#     group_feats.append('MD_{}_min'.format(f))
#     group_feats.append('MD_{}_sum'.format(f))
#     group_feats.append('MD_{}_std'.format(f))
#     group_feats.append('MD_{}_skew'.format(f))

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:18<00:00,  1.31s/it]


In [22]:
cross_features = numerical_features + group_feats

# 基本交叉特征
for f1 in tqdm(cross_features):
    for f2 in cross_features:
        if f1 != f2:          
            colname_multiply = '{}_{}_multyply'.format(f1, f2)
            colname_ratio = '{}_{}_ratio'.format(f1, f2)
            
            data_df[colname_multiply] = data_df[f1].values / (data_df[f2].values + 0.001)
            data_df[colname_ratio] = data_df[f1].values * data_df[f2].values

100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [01:03<00:00,  1.01s/it]


In [23]:
exclude_cols = ['indoorHum_outdoorHum_subtract', 'outdoorHum_indoorHum_subtract',
                'indoorAtmo_outdoorAtmo_subtract', 'outdoorAtmo_indoorAtmo_subtract']
for i in tqdm(range(len(cross_features))):
    for j in range(i + 1, len(cross_features)):
        # 加
        colname_add = '{}_{}_add'.format(f1, f2)
        data_df[colname_add] = data_df[f1].values + data_df[f2].values
        
        # 减
        if '{}_{}_subtract'.format(cross_features[i], cross_features[j]) not in exclude_cols:
            colname_substract = '{}_{}_subtract'.format(cross_features[i], cross_features[j])
            data_df[colname_substract] = data_df[cross_features[i]].values - data_df[cross_features[j]].values

100%|██████████████████████████████████████████████████████████████████████████████████| 63/63 [00:32<00:00,  1.92it/s]


In [24]:
data_df.fillna(method='bfill', inplace=True)

In [25]:
# 历史信息提取
# data_df['dt'] = data_df['day'].values + (data_df['month'].values - 3) * 31

In [26]:
# def get_t_sts(f):
#     tmp_df = pd.DataFrame()
#     for t in tqdm(range(15, 45)):
#         tmp_data = data_df.loc[data_df['dt'] < t, :]
#         tmp = tmp_data.groupby(['hour'], as_index=False)[f].agg({
#             'hit_{}_mean'.format(f): 'mean',
#             'hit_{}_median'.format(f): 'median',
#             'hit_{}_max'.format(f): 'max',
#             'hit_{}_min'.format(f): 'min',
#             'hit_{}_sum'.format(f): 'sum',
#             'hit_{}_std'.format(f): 'std',
#             'hit_{}_skew'.format(f): 'skew'
#         })
#         tmp['dt'] = t
#         tmp_df = tmp_df.append(tmp)
#         del tmp
#         gc.collect()
    
#     data_df = data_df.merge(tmp_df, on=['dt', 'hour'], how='left')
#     del tmp_df
#     gc.collect()
    
# Parallel(n_jobs=4)(delayed(get_t_sts)(f) for f in tqdm(numerical_diff_features + ['temperature']))

In [27]:
# def get_t_1_sts(f):
#     tmp_df = pd.DataFrame()
#     for t in tqdm(range(15, 45)):
#         tmp = data_df.loc[(data_df['dt'] < t) & (data_df['dt'] >= t - 1), :].groupby(['hour'], as_index=False)[f].agg({
#             'hit_t_1_{}_mean'.format(f): 'mean',
#             'hit_t_1_{}_median'.format(f): 'median',
#             'hit_t_1_{}_max'.format(f): 'max',
#             'hit_t_1_{}_min'.format(f): 'min',
#             'hit_t_1_{}_sum'.format(f): 'sum',
#             'hit_t_1_{}_std'.format(f): 'std',
#             'hit_t_1_{}_skew'.format(f): 'skew'
#         })
#         tmp['dt'] = t
#         tmp_df = tmp_df.append(tmp)
#         del tmp
#         gc.collect()

#     data_df = data_df.merge(tmp_df, on=['dt', 'hour'], how='left')
#     del tmp_df
#     gc.collect()

# Parallel(n_jobs=4)(delayed(get_t_1_sts)(f) for f in tqdm(numerical_diff_features + ['temperature']))


In [28]:
# def get_t_2_sts(f):
#     tmp_df = pd.DataFrame()
#     for t in tqdm(range(15, 45)):
#         tmp = data_df.loc[(data_df['dt'] < t) & (data_df['dt'] >= t - 2), :].groupby(['hour'], as_index=False)[f].agg({
#             'hit_t_1_{}_mean'.format(f): 'mean',
#             'hit_t_1_{}_median'.format(f): 'median',
#             'hit_t_1_{}_max'.format(f): 'max',
#             'hit_t_1_{}_min'.format(f): 'min',
#             'hit_t_1_{}_sum'.format(f): 'sum',
#             'hit_t_1_{}_std'.format(f): 'std',
#             'hit_t_1_{}_skew'.format(f): 'skew'
#         })
#         tmp['dt'] = t
#         tmp_df = tmp_df.append(tmp)
#         del tmp
#         gc.collect()

#     data_df = data_df.merge(tmp_df, on=['dt', 'hour'], how='left')
#     del tmp_df
#     gc.collect()

# Parallel(n_jobs=4)(delayed(get_t_2_sts)(f) for f in tqdm(numerical_diff_features + ['temperature']))

In [29]:
data_df.fillna(method='bfill', inplace=True)

In [30]:
# 离散化
for f in numerical_features:
    data_df[f + '_20_bin'] = pd.cut(data_df[f], 20, duplicates='drop').apply(lambda x: x.left).astype(int)
    data_df[f + '_50_bin'] = pd.cut(data_df[f], 50, duplicates='drop').apply(lambda x: x.left).astype(int)
    data_df[f + '_100_bin'] = pd.cut(data_df[f], 100, duplicates='drop').apply(lambda x: x.left).astype(int)
    data_df[f + '_200_bin'] = pd.cut(data_df[f], 200, duplicates='drop').apply(lambda x: x.left).astype(int)

In [31]:
# for f1 in tqdm(['outdoorTemp_20_bin', 'outdoorHum_20_bin', 'outdoorAtmo_20_bin', 'indoorHum_20_bin', 'indoorAtmo_20_bin']):
#     for f2 in ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']:
#         data_df['{}_{}_medi'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('median')
#         data_df['{}_{}_mean'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('mean')
#         data_df['{}_{}_max'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('max')
#         data_df['{}_{}_min'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('min')
#         data_df['{}_{}_sum'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('sum')
#         data_df['{}_{}_std'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('std')
#         data_df['{}_{}_skew'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('skew')

# for f1 in tqdm(['outdoorTemp_50_bin', 'outdoorHum_50_bin', 'outdoorAtmo_50_bin', 'indoorHum_50_bin', 'indoorAtmo_50_bin']):
#     for f2 in ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']:
#         data_df['{}_{}_medi'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('median')
#         data_df['{}_{}_mean'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('mean')
#         data_df['{}_{}_max'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('max')
#         data_df['{}_{}_min'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('min')
#         data_df['{}_{}_sum'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('sum')
#         data_df['{}_{}_std'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('std')
#         data_df['{}_{}_skew'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('skew')

# for f1 in tqdm(['outdoorTemp_100_bin', 'outdoorHum_100_bin', 'outdoorAtmo_100_bin', 'indoorHum_100_bin',
#                 'indoorAtmo_100_bin']):
#     for f2 in ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']:
#         data_df['{}_{}_medi'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('median')
#         data_df['{}_{}_mean'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('mean')
#         data_df['{}_{}_max'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('max')
#         data_df['{}_{}_min'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('min')
#         data_df['{}_{}_sum'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('sum')
#         data_df['{}_{}_std'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('std')
#         data_df['{}_{}_skew'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('skew')

# for f1 in tqdm(['outdoorTemp_200_bin', 'outdoorHum_200_bin', 'outdoorAtmo_200_bin', 'indoorHum_200_bin',
#                 'indoorAtmo_200_bin']):
#     for f2 in ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']:
#         data_df['{}_{}_medi'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('median')
#         data_df['{}_{}_mean'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('mean')
#         data_df['{}_{}_max'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('max')
#         data_df['{}_{}_min'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('min')
#         data_df['{}_{}_sum'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('sum')
#         data_df['{}_{}_std'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('std')
#         data_df['{}_{}_skew'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('skew')

In [32]:
for f1 in tqdm(['{}_20_bin'.format(i) for i in numerical_features] +
               ['{}_50_bin'.format(i) for i in numerical_features] +
               ['{}_100_bin'.format(i) for i in numerical_features] +
               ['{}_200_bin'.format(i) for i in numerical_features]):
    for f2 in numerical_features:
        tmp = data_df.groupby(f1, as_index=False)[f2].agg({
            '{}_{}_medi'.format(f1, f2): 'median',
            '{}_{}_mean'.format(f1, f2): 'mean',
            '{}_{}_max'.format(f1, f2): 'max',
            '{}_{}_min'.format(f1, f2): 'min',
            '{}_{}_sum'.format(f1, f2): 'sum',
            '{}_{}_std'.format(f1, f2): 'std',
            '{}_{}_skew'.format(f1, f2): 'skew'
        })
        data_df = data_df.merge(tmp, on=f1, how='left')
        del tmp
        gc.collect()

100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [17:23<00:00, 37.28s/it]


In [33]:
train_df = data_df[:train_count].copy().reset_index(drop=True)
test_df = data_df[train_count:].copy().reset_index(drop=True)

del data_df
gc.collect()

20

In [34]:
drop_columns = ["time", "year", "sec", "temperature"]

features = train_df[:1].drop(drop_columns, axis=1).columns
x_train = train_df[features]
x_test = test_df[features]

In [35]:
psi_res = Parallel(n_jobs=4)(delayed(get_psi)(c) for c in tqdm(features))
psi_df = pd.concat(psi_res)

100%|████████████████████████████████████████████████████████████████████████████| 11770/11770 [05:30<00:00, 35.62it/s]


In [36]:
features = list(psi_df[psi_df['PSI'] <= 0.2]['变量名'].values) + ['outdoorTemp']

In [37]:
x_train = x_train[features]
x_test = x_test[features]
gc.collect()

40

In [38]:
col_corr = correlation(x_train, 0.98)
print(col_corr)

{'indoorAtmo-outdoorAtmo_200_bin_indoorAtmo-outdoorAtmo_sum', 'MDH_indoorHum-outdoorHum_mean_MDH_outdoorAtmo_mean_ratio', 'MD_indoorAtmo-outdoorAtmo_diff_medi_MD_outdoorHum_mean_multyply', 'MD_outdoorAtmo_diff_medi_MDH_indoorAtmo_diff_medi_ratio', 'MD_outdoorHum_medi_MDH_indoorHum-outdoorHum_medi_multyply', 'MD_outdoorAtmo_diff_medi_MDH_outdoorAtmo_diff_mean_ratio', 'outdoorHum_200_bin_outdoorTemp_max', 'indoorHum-outdoorHum_20_bin_indoorHum-outdoorHum_mean', 'indoorHum-outdoorHum_200_bin_outdoorHum_mean', 'outdoorAtmo_MDH_indoorHum_mean_multyply', 'indoorHum-outdoorHum_50_bin_outdoorTemp_mean', 'indoorAtmo-outdoorAtmo_MD_outdoorAtmo_diff_mean_subtract', 'outdoorAtmo_MDH_indoorHum-outdoorHum_mean_multyply', 'MD_indoorHum-outdoorHum_mean_MDH_outdoorTemp_mean_ratio', 'MDH_indoorHum-outdoorHum_medi_MDH_indoorHum-outdoorHum_diff_medi_subtract', 'outdoorHum_50_bin_outdoorTemp_min', 'outdoorHum_20_bin_indoorHum-outdoorHum_mean', 'indoorAtmo_indoorHum-outdoorHum_ratio', 'MD_outdoorHum_mean_MD

In [39]:
x_train.drop(list(col_corr), axis=1, inplace=True)
x_test.drop(list(col_corr), axis=1, inplace=True)

In [40]:
nums = int(x_train.shape[0] * 0.8)

trn_x, trn_y, val_x, val_y = x_train[:nums], y_train[:nums], x_train[nums:], y_train[nums:]

train_matrix = xgb.DMatrix(trn_x, label=trn_y, missing=np.nan)
valid_matrix = xgb.DMatrix(val_x, label=val_y, missing=np.nan)
train_all_matrix = xgb.DMatrix(x_train, y_train, missing=np.nan)
test_matrix = xgb.DMatrix(x_test, label=val_y, missing=np.nan)

params = {
    'booster': 'gbtree',
    'eval_metric': 'rmse',
    'min_child_weight': 5,
    'max_depth': 8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'eta': 0.01,
    'seed': 2020,
    'nthread': 36,
    'silent': 1
}

watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

model_eval = xgb.train(params,
                       train_matrix,
                       num_boost_round=50000,
                       evals=watchlist,
                       verbose_eval=500,
                       early_stopping_rounds=1000)
val_pred = model_eval.predict(valid_matrix, ntree_limit=model_eval.best_ntree_limit).reshape(-1, 1)

[0]	train-rmse:1.00052	eval-rmse:0.41800
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 1000 rounds.
[500]	train-rmse:0.06485	eval-rmse:0.18130
[1000]	train-rmse:0.04748	eval-rmse:0.17904
[1500]	train-rmse:0.03846	eval-rmse:0.17895
[2000]	train-rmse:0.03219	eval-rmse:0.17879
[2500]	train-rmse:0.02752	eval-rmse:0.17887
Stopping. Best iteration:
[1828]	train-rmse:0.03411	eval-rmse:0.17872



In [41]:
mse = mean_squared_error(val_y, val_pred)
print("mse_score:", mse)
print("mse_score:", str(mse)[2:7])

mse_score: 0.03193901899096151
mse_score: 03193


In [42]:
feat_imp_dict = model_eval.get_score(importance_type='gain')

In [43]:
feat_imp = pd.Series(feat_imp_dict).sort_values(ascending=False).reset_index().rename(columns={'index': 'feature',
                                                                                               0: 'importance'})
feat_imp['normalized_importance'] = feat_imp['importance'] / feat_imp['importance'].sum()
feat_imp['cumulative_importance'] = np.cumsum(feat_imp['normalized_importance'])
record_low_importance = feat_imp[feat_imp['cumulative_importance'] > 0.95]

to_drop = list(record_low_importance['feature'])
print(to_drop)

['indoorHum-outdoorHum_MDH_indoorAtmo-outdoorAtmo_mean_subtract', 'MDH_indoorHum-outdoorHum_medi_MD_indoorAtmo-outdoorAtmo_mean_multyply', 'outdoorHum_MDH_outdoorAtmo_mean_subtract', 'MDH_indoorHum-outdoorHum_diff_mean_MDH_indoorAtmo-outdoorAtmo_diff_mean_multyply', 'indoorHum_50_bin_indoorAtmo_max', 'MD_outdoorTemp_diff_mean_MDH_indoorHum_diff_mean_multyply', 'MDH_indoorHum_diff_mean_MDH_outdoorTemp_diff_mean_multyply', 'outdoorTemp_20_bin_outdoorAtmo_skew', 'MD_indoorHum_diff_mean_MDH_outdoorTemp_diff_mean_multyply', 'MDH_outdoorHum_diff_mean_MDH_outdoorAtmo_diff_mean_multyply', 'outdoorTemp_200_bin_outdoorTemp_skew', 'outdoorAtmo_50_bin', 'MDH_indoorAtmo_diff_mean_MDH_outdoorHum_diff_mean_multyply', 'MDH_outdoorHum_diff_mean_MD_outdoorAtmo_diff_mean_multyply', 'outdoorTemp_100_bin_outdoorAtmo_skew', 'outdoorHum_50_bin_indoorAtmo-outdoorAtmo_skew', 'MDH_outdoorTemp_diff_sum', 'MDH_outdoorAtmo_diff_mean_MD_outdoorTemp_diff_mean_multyply', 'outdoorTemp_100_bin_indoorAtmo-outdoorAtmo_ma

In [44]:
x_train.drop(to_drop, axis=1, inplace=True)
x_test.drop(to_drop, axis=1, inplace=True)

In [45]:
train_all_matrix = xgb.DMatrix(x_train, y_train, missing=np.nan)
test_matrix = xgb.DMatrix(x_test, label=val_y, missing=np.nan)

model = xgb.train(params,
                  train_all_matrix,
                  num_boost_round=model_eval.best_ntree_limit + 20)

test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)

In [46]:
sub['temperature'] = test_pred[:, 0] + test_df['outdoorTemp'].values
sub.to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), str(mse)[2:7]), index=False)