In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import gc
import xgboost as xgb
import math
from joblib import Parallel, delayed

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings

warnings.filterwarnings('ignore')

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
def timestamp2string(timeStamp):
    try:
        d = datetime.fromtimestamp(timeStamp)
        str1 = d.strftime("%Y-%m-%d %H:%M:%S")
        # 2015-08-28 16:43:37'
        return datetime.strptime(str1,'%Y-%m-%d %H:%M:%S')
    except Exception as e:
        return ''

In [3]:
def get_psi(c):
    psi_res = pd.DataFrame()
    psi_dict={}
    # for c in tqdm(f_cols):
    try:
        t_train = x_train[c].fillna(-998)
        t_test = x_test[c].fillna(-998)
        #获取切分点
        bins=[]
        for i in np.arange(0,1.1,0.2):
            bins.append(t_train.quantile(i))
        bins=sorted(set(bins))
        bins[0]=-np.inf
        bins[-1]=np.inf
        #计算psi
        t_psi = pd.DataFrame()
        t_psi['train'] = pd.cut(t_train,bins).value_counts().sort_index()
        t_psi['test'] = pd.cut(t_test,bins).value_counts()
        t_psi.index=[str(x) for x in t_psi.index]
        t_psi.loc['总计',:] = t_psi.sum()
        t_psi['train_rate'] = t_psi['train']/t_psi.loc['总计','train']
        t_psi['test_rate'] = t_psi['test']/t_psi.loc['总计','test']
        t_psi['psi'] = (t_psi['test_rate']-t_psi['train_rate'])*(np.log(t_psi['test_rate'])-np.log(t_psi['train_rate']))
        t_psi.loc['总计','psi'] = t_psi['psi'].sum()
        t_psi.index.name=c
        #汇总
        t_res = pd.DataFrame([[c,t_psi.loc['总计','psi']]],
                             columns=['变量名','PSI'])
        psi_res = pd.concat([psi_res,t_res])
        psi_dict[c]=t_psi
        print(c,'done')
    except:
        print(c,'error')
    return psi_res,psi_dict

In [4]:
def correlation(df, threshold=0.98):
    """
    特征相关性计算
    @param df: 
    @param threshold: 
    @return: 
    """
    col_corr = set()
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colName = corr_matrix.columns[i]
                col_corr.add(colName)
    return col_corr

In [5]:
train_df = pd.read_hdf('../input/train.h5')
test_df = pd.read_hdf('../input/test.h5')
sub = pd.DataFrame(test_df['time'])

In [6]:
train_df = train_df[train_df['temperature'].notnull()]
train_df = train_df.fillna(method='bfill')
test_df = test_df.fillna(method='bfill')
gc.collect()

64

In [7]:
train_df.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec', 'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                    'indoorHum', 'indoorAtmo', 'temperature']
test_df.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec', 'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                   'indoorHum', 'indoorAtmo']

In [8]:
print('train_df.shape: ', train_df.shape)
train_df = train_df.loc[(train_df['outdoorTemp'] >= test_df['outdoorTemp'].min()) & (train_df['outdoorTemp'] <= test_df['outdoorTemp'].max())]
print('处理后 train_df.shape: ', train_df.shape)

train_df.shape:  (24807, 13)
处理后 train_df.shape:  (19338, 13)


In [9]:
data_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [10]:
data_df.head(10).append(data_df.tail(10))

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature
0,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4
1,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4
2,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4
3,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4
4,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3
5,1552496745,2019,3,14,1,5,45,14.6,85.0,993.1,80.0,992.9,15.3
6,1552496804,2019,3,14,1,6,44,14.5,85.0,993.6,80.0,409.6,15.3
7,1552496862,2019,3,14,1,7,42,14.5,85.0,993.6,80.0,992.8,15.3
8,1552496925,2019,3,14,1,8,45,14.5,85.0,993.4,80.0,993.2,15.3
9,1552496982,2019,3,14,1,9,42,14.5,85.0,993.8,80.0,993.2,15.3


In [11]:
data_df['datetime'] = data_df['time'].apply(timestamp2string)

In [12]:
numerical_features = ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']
diff_features = ['{}_diff'.format(i) for i in numerical_features]
numerical_diff_features = numerical_features + diff_features

In [13]:
for i in tqdm(numerical_features):
    data_df['{}_diff'.format(i)] = data_df[i].diff()

100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 1260.69it/s]


In [14]:
data_df.fillna(method='bfill', inplace=True)

In [15]:
data_df.set_index('datetime', inplace=True)

for i in numerical_diff_features:
    for j in ['90s', '2h', '1D', '2D', '3D']:
        data_df['{}_{}_mean'.format(i, j)] = data_df[i].rolling(j, closed='left', min_periods=2).mean()

In [16]:
data_df.fillna(method='bfill', inplace=True)

In [17]:
data_df.head(10).append(data_df.tail(10))

Unnamed: 0_level_0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature,outdoorTemp_diff,outdoorHum_diff,outdoorAtmo_diff,indoorHum_diff,indoorAtmo_diff,outdoorTemp_1h_mean,outdoorTemp_2h_mean,outdoorTemp_1D_mean,outdoorTemp_2D_mean,outdoorTemp_3D_mean,outdoorHum_1h_mean,outdoorHum_2h_mean,outdoorHum_1D_mean,outdoorHum_2D_mean,outdoorHum_3D_mean,outdoorAtmo_1h_mean,outdoorAtmo_2h_mean,outdoorAtmo_1D_mean,outdoorAtmo_2D_mean,outdoorAtmo_3D_mean,indoorHum_1h_mean,indoorHum_2h_mean,indoorHum_1D_mean,indoorHum_2D_mean,indoorHum_3D_mean,indoorAtmo_1h_mean,indoorAtmo_2h_mean,indoorAtmo_1D_mean,indoorAtmo_2D_mean,indoorAtmo_3D_mean,outdoorTemp_diff_1h_mean,outdoorTemp_diff_2h_mean,outdoorTemp_diff_1D_mean,outdoorTemp_diff_2D_mean,outdoorTemp_diff_3D_mean,outdoorHum_diff_1h_mean,outdoorHum_diff_2h_mean,outdoorHum_diff_1D_mean,outdoorHum_diff_2D_mean,outdoorHum_diff_3D_mean,outdoorAtmo_diff_1h_mean,outdoorAtmo_diff_2h_mean,outdoorAtmo_diff_1D_mean,outdoorAtmo_diff_2D_mean,outdoorAtmo_diff_3D_mean,indoorHum_diff_1h_mean,indoorHum_diff_2h_mean,indoorHum_diff_1D_mean,indoorHum_diff_2D_mean,indoorHum_diff_3D_mean,indoorAtmo_diff_1h_mean,indoorAtmo_diff_2h_mean,indoorAtmo_diff_1D_mean,indoorAtmo_diff_2D_mean,indoorAtmo_diff_3D_mean
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
2019-03-14 01:00:43,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4,0.1,-1.0,-0.1,0.0,0.2,14.633333,14.633333,14.633333,14.633333,14.633333,84.666667,84.666667,84.666667,84.666667,84.666667,992.966667,992.966667,992.966667,992.966667,992.966667,80.0,80.0,80.0,80.0,80.0,992.65,992.65,992.65,992.65,992.65,0.016667,0.016667,0.016667,0.016667,0.016667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.016667,-0.016667,-0.016667,-0.016667,-0.016667,0.0,0.0,0.0,0.0,0.0,0.116667,0.116667,0.116667,0.116667,0.116667
2019-03-14 01:01:43,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4,0.1,-1.0,-0.1,0.0,0.2,14.633333,14.633333,14.633333,14.633333,14.633333,84.666667,84.666667,84.666667,84.666667,84.666667,992.966667,992.966667,992.966667,992.966667,992.966667,80.0,80.0,80.0,80.0,80.0,992.65,992.65,992.65,992.65,992.65,0.016667,0.016667,0.016667,0.016667,0.016667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.016667,-0.016667,-0.016667,-0.016667,-0.016667,0.0,0.0,0.0,0.0,0.0,0.116667,0.116667,0.116667,0.116667,0.116667
2019-03-14 01:02:45,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4,0.0,0.0,-0.3,0.0,0.2,14.633333,14.633333,14.633333,14.633333,14.633333,84.666667,84.666667,84.666667,84.666667,84.666667,992.966667,992.966667,992.966667,992.966667,992.966667,80.0,80.0,80.0,80.0,80.0,992.65,992.65,992.65,992.65,992.65,0.016667,0.016667,0.016667,0.016667,0.016667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.016667,-0.016667,-0.016667,-0.016667,-0.016667,0.0,0.0,0.0,0.0,0.0,0.116667,0.116667,0.116667,0.116667,0.116667
2019-03-14 01:03:44,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4,-0.1,1.0,0.5,0.0,-0.2,14.633333,14.633333,14.633333,14.633333,14.633333,84.666667,84.666667,84.666667,84.666667,84.666667,992.966667,992.966667,992.966667,992.966667,992.966667,80.0,80.0,80.0,80.0,80.0,992.65,992.65,992.65,992.65,992.65,0.016667,0.016667,0.016667,0.016667,0.016667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.016667,-0.016667,-0.016667,-0.016667,-0.016667,0.0,0.0,0.0,0.0,0.0,0.116667,0.116667,0.116667,0.116667,0.116667
2019-03-14 01:04:42,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3,0.0,0.0,-0.5,0.0,0.0,14.633333,14.633333,14.633333,14.633333,14.633333,84.666667,84.666667,84.666667,84.666667,84.666667,992.966667,992.966667,992.966667,992.966667,992.966667,80.0,80.0,80.0,80.0,80.0,992.65,992.65,992.65,992.65,992.65,0.016667,0.016667,0.016667,0.016667,0.016667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.016667,-0.016667,-0.016667,-0.016667,-0.016667,0.0,0.0,0.0,0.0,0.0,0.116667,0.116667,0.116667,0.116667,0.116667
2019-03-14 01:05:45,1552496745,2019,3,14,1,5,45,14.6,85.0,993.1,80.0,992.9,15.3,0.0,0.0,0.4,0.0,0.3,14.633333,14.633333,14.633333,14.633333,14.633333,84.666667,84.666667,84.666667,84.666667,84.666667,992.966667,992.966667,992.966667,992.966667,992.966667,80.0,80.0,80.0,80.0,80.0,992.65,992.65,992.65,992.65,992.65,0.016667,0.016667,0.016667,0.016667,0.016667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.016667,-0.016667,-0.016667,-0.016667,-0.016667,0.0,0.0,0.0,0.0,0.0,0.116667,0.116667,0.116667,0.116667,0.116667
2019-03-14 01:06:44,1552496804,2019,3,14,1,6,44,14.5,85.0,993.6,80.0,409.6,15.3,-0.1,0.0,0.5,0.0,-583.3,14.633333,14.633333,14.633333,14.633333,14.633333,84.666667,84.666667,84.666667,84.666667,84.666667,992.966667,992.966667,992.966667,992.966667,992.966667,80.0,80.0,80.0,80.0,80.0,992.65,992.65,992.65,992.65,992.65,0.016667,0.016667,0.016667,0.016667,0.016667,-0.166667,-0.166667,-0.166667,-0.166667,-0.166667,-0.016667,-0.016667,-0.016667,-0.016667,-0.016667,0.0,0.0,0.0,0.0,0.0,0.116667,0.116667,0.116667,0.116667,0.116667
2019-03-14 01:07:42,1552496862,2019,3,14,1,7,42,14.5,85.0,993.6,80.0,992.8,15.3,0.0,0.0,0.0,0.0,583.2,14.614286,14.614286,14.614286,14.614286,14.614286,84.714286,84.714286,84.714286,84.714286,84.714286,993.057143,993.057143,993.057143,993.057143,993.057143,80.0,80.0,80.0,80.0,80.0,909.357143,909.357143,909.357143,909.357143,909.357143,0.0,0.0,0.0,0.0,0.0,-0.142857,-0.142857,-0.142857,-0.142857,-0.142857,0.057143,0.057143,0.057143,0.057143,0.057143,0.0,0.0,0.0,0.0,0.0,-83.228571,-83.228571,-83.228571,-83.228571,-83.228571
2019-03-14 01:08:45,1552496925,2019,3,14,1,8,45,14.5,85.0,993.4,80.0,993.2,15.3,0.0,0.0,-0.2,0.0,0.4,14.6,14.6,14.6,14.6,14.6,84.75,84.75,84.75,84.75,84.75,993.125,993.125,993.125,993.125,993.125,80.0,80.0,80.0,80.0,80.0,919.7875,919.7875,919.7875,919.7875,919.7875,0.0,0.0,0.0,0.0,0.0,-0.125,-0.125,-0.125,-0.125,-0.125,0.05,0.05,0.05,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.075,0.075,0.075,0.075,0.075
2019-03-14 01:09:42,1552496982,2019,3,14,1,9,42,14.5,85.0,993.8,80.0,993.2,15.3,0.0,0.0,0.4,0.0,0.0,14.588889,14.588889,14.588889,14.588889,14.588889,84.777778,84.777778,84.777778,84.777778,84.777778,993.155556,993.155556,993.155556,993.155556,993.155556,80.0,80.0,80.0,80.0,80.0,927.944444,927.944444,927.944444,927.944444,927.944444,0.0,0.0,0.0,0.0,0.0,-0.111111,-0.111111,-0.111111,-0.111111,-0.111111,0.022222,0.022222,0.022222,0.022222,0.022222,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.111111,0.111111,0.111111


In [18]:
# 基本聚合特征
group_feats = []
for f in tqdm(numerical_diff_features):
    data_df['MDH_{}_medi'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('median')
    data_df['MDH_{}_mean'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('mean')
    data_df['MDH_{}_max'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('max')
    data_df['MDH_{}_min'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('min')
    data_df['MDH_{}_sum'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('sum')
    data_df['MDH_{}_std'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('std')
    data_df['MDH_{}_skew'.format(f)] = data_df.groupby(['month', 'day', 'hour'])[f].transform('skew')
    
    data_df['MD_{}_medi'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('median')
    data_df['MD_{}_mean'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('mean')
    data_df['MD_{}_max'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('max')
    data_df['MD_{}_min'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('min')
    data_df['MD_{}_sum'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('sum')
    data_df['MD_{}_std'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('std')
    data_df['MD_{}_skew'.format(f)] = data_df.groupby(['month', 'day'])[f].transform('skew')

    group_feats.append('MDH_{}_medi'.format(f))
    group_feats.append('MDH_{}_mean'.format(f))
    group_feats.append('MDH_{}_max'.format(f))
    group_feats.append('MDH_{}_min'.format(f))
#     group_feats.append('MDH_{}_sum'.format(f))
#     group_feats.append('MDH_{}_std'.format(f))
#     group_feats.append('MDH_{}_skew'.format(f))
    
    group_feats.append('MD_{}_medi'.format(f))
    group_feats.append('MD_{}_mean'.format(f))
    group_feats.append('MD_{}_max'.format(f))
    group_feats.append('MD_{}_min'.format(f))
#     group_feats.append('MD_{}_sum'.format(f))
#     group_feats.append('MD_{}_std'.format(f))
#     group_feats.append('MD_{}_skew'.format(f))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.89it/s]


In [13]:
# 基本交叉特征
for f1 in tqdm(numerical_diff_features + group_feats):
    for f2 in numerical_diff_features + group_feats:
        if f1 != f2:
            colname_substract = '{}_{}_subtract'.format(f1, f2)
            colname_add = '{}_{}_add'.format(f1, f2)            
            colname_multiply = '{}_{}_multyply'.format(f1, f2)
            colname_ratio = '{}_{}_ratio'.format(f1, f2)
            
            data_df[colname_substract] = data_df[f1].values - data_df[f2].values
            data_df[colname_add] = data_df[f1].values + data_df[f2].values
            data_df[colname_multiply] = data_df[f1].values / (data_df[f2].values + 0.001)
            data_df[colname_ratio] = data_df[f1].values * data_df[f2].values

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [16:45<00:00, 11.17s/it]


In [14]:
data_df.fillna(method='bfill', inplace=True)

In [15]:
# 历史信息提取
data_df['dt'] = data_df['day'].values + (data_df['month'].values - 3) * 31

In [16]:
def get_t_sts(f):
    tmp_df = pd.DataFrame()
    for t in tqdm(range(15, 45)):
        tmp_data = data_df.loc[data_df['dt'] < t, :]
        tmp = tmp_data.groupby(['hour'], as_index=False)[f].agg({
            'hit_{}_mean'.format(f): 'mean',
            'hit_{}_median'.format(f): 'median',
            'hit_{}_max'.format(f): 'max',
            'hit_{}_min'.format(f): 'min',
            'hit_{}_sum'.format(f): 'sum',
            'hit_{}_std'.format(f): 'std',
            'hit_{}_skew'.format(f): 'skew'
        })
        tmp['dt'] = t
        tmp_df = tmp_df.append(tmp)
        del tmp
        gc.collect()
    
    data_df = data_df.merge(tmp_df, on=['dt', 'hour'], how='left')
    del tmp_df
    gc.collect()
    
Parallel(n_jobs=4)(delayed(get_t_sts)(f) for f in tqdm(numerical_diff_features + ['temperature']))

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  7.26it/s]


UnboundLocalError: local variable 'data_df' referenced before assignment

In [None]:
def get_t_1_sts(f):
    tmp_df = pd.DataFrame()
    for t in tqdm(range(15, 45)):
        tmp = data_df.loc[(data_df['dt'] < t) & (data_df['dt'] >= t - 1), :].groupby(['hour'], as_index=False)[f].agg({
            'hit_t_1_{}_mean'.format(f): 'mean',
            'hit_t_1_{}_median'.format(f): 'median',
            'hit_t_1_{}_max'.format(f): 'max',
            'hit_t_1_{}_min'.format(f): 'min',
            'hit_t_1_{}_sum'.format(f): 'sum',
            'hit_t_1_{}_std'.format(f): 'std',
            'hit_t_1_{}_skew'.format(f): 'skew'
        })
        tmp['dt'] = t
        tmp_df = tmp_df.append(tmp)
        del tmp
        gc.collect()

    data_df = data_df.merge(tmp_df, on=['dt', 'hour'], how='left')
    del tmp_df
    gc.collect()

Parallel(n_jobs=4)(delayed(get_t_1_sts)(f) for f in tqdm(numerical_diff_features + ['temperature']))


In [None]:
def get_t_2_sts(f):
    tmp_df = pd.DataFrame()
    for t in tqdm(range(15, 45)):
        tmp = data_df.loc[(data_df['dt'] < t) & (data_df['dt'] >= t - 2), :].groupby(['hour'], as_index=False)[f].agg({
            'hit_t_1_{}_mean'.format(f): 'mean',
            'hit_t_1_{}_median'.format(f): 'median',
            'hit_t_1_{}_max'.format(f): 'max',
            'hit_t_1_{}_min'.format(f): 'min',
            'hit_t_1_{}_sum'.format(f): 'sum',
            'hit_t_1_{}_std'.format(f): 'std',
            'hit_t_1_{}_skew'.format(f): 'skew'
        })
        tmp['dt'] = t
        tmp_df = tmp_df.append(tmp)
        del tmp
        gc.collect()

    data_df = data_df.merge(tmp_df, on=['dt', 'hour'], how='left')
    del tmp_df
    gc.collect()

Parallel(n_jobs=4)(delayed(get_t_2_sts)(f) for f in tqdm(numerical_diff_features + ['temperature']))

In [None]:
data_df.fillna(method='bfill', inplace=True)

In [None]:
# 离散化
for f in ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']:
    data_df[f + '_20_bin'] = pd.cut(data_df[f], 20, duplicates='drop').apply(lambda x: x.left).astype(int)
    data_df[f + '_50_bin'] = pd.cut(data_df[f], 50, duplicates='drop').apply(lambda x: x.left).astype(int)
    data_df[f + '_100_bin'] = pd.cut(data_df[f], 100, duplicates='drop').apply(lambda x: x.left).astype(int)
    data_df[f + '_200_bin'] = pd.cut(data_df[f], 200, duplicates='drop').apply(lambda x: x.left).astype(int)

In [None]:
def get_bin_sts(f1):
    for f2 in numerical_features:
        data_df['{}_{}_medi'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('median')
        data_df['{}_{}_mean'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('mean')
        data_df['{}_{}_max'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('max')
        data_df['{}_{}_min'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('min')
        data_df['{}_{}_sum'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('sum')
        data_df['{}_{}_std'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('std')
        data_df['{}_{}_skew'.format(f1, f2)] = data_df.groupby([f1])[f2].transform('skew')

Parallel(n_jobs=4)(delayed(get_bin_sts)(f1) for f1 in tqdm(['{}_20_bin'.format(i) for i in numerical_features] +
                                                           ['{}_50_bin'.format(i) for i in numerical_features] +
                                                           ['{}_100_bin'.format(i) for i in numerical_features] +
                                                           ['{}_200_bin'.format(i) for i in numerical_features]))

In [None]:
train_count = train_df.shape[0]
train_df = data_df[:train_count].copy().reset_index(drop=True)
test_df = data_df[train_count:].copy().reset_index(drop=True)

y_train = train_df['temperature'].values - train_df['outdoorTemp'].values

del data_df
gc.collect()

In [None]:
drop_columns = ["time", "year", "sec", "temperature"]

features = train_df[:1].drop(drop_columns, axis=1).columns
x_train = train_df[features]
x_test = test_df[features]

In [None]:
psi_res, psi_dict = Parallel(n_jobs=4)(delayed(get_psi)(c) for c in tqdm(features))

In [None]:
features = list(psi_res[psi_res['PSI'] <= 0.2]['变量名'].values) + ['outdoorTemp']

In [None]:
x_train = x_train[features]
x_test = x_test[features]
gc.collect()

In [None]:
col_corr = correlation(x_train, 0.98)
print(col_corr)

In [None]:
x_train.drop(list(col_corr), axis=1, inplace=True)
x_test.drop(list(col_corr), axis=1, inplace=True)

In [None]:
nums = int(x_train.shape[0] * 0.8)

trn_x, trn_y, val_x, val_y = x_train[:nums], y_train[:nums], x_train[nums:], y_train[nums:]

train_matrix = xgb.DMatrix(trn_x, label=trn_y, missing=np.nan)
valid_matrix = xgb.DMatrix(val_x, label=val_y, missing=np.nan)
train_all_matrix = xgb.DMatrix(x_train, y_train, missing=np.nan)
test_matrix = xgb.DMatrix(x_test, label=val_y, missing=np.nan)

params = {
    'booster': 'gbtree',
    'eval_metric': 'mae',
    'min_child_weight': 5,
    'max_depth': 8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'eta': 0.05,
    'seed': 2020,
    'nthread': 36,
    'silent': 1
}

watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

model_eval = xgb.train(params,
                       train_matrix,
                       num_boost_round=50000,
                       evals=watchlist,
                       verbose_eval=500,
                       early_stopping_rounds=1000)
val_pred = model_eval.predict(valid_matrix, ntree_limit=model_eval.best_ntree_limit).reshape(-1, 1)

In [None]:
mse = mean_squared_error(val_y, val_pred)
print("mse_score:", mse)
print("mse_score:", str(mse)[2:7])

In [None]:
feat_imp_dict = model_eval.get_score(importance_type='gain')

In [None]:
feat_imp = pd.Series(feat_imp_dict).sort_values(ascending=False).reset_index().rename(columns={'index': 'feature',
                                                                                               0: 'importance'})
feat_imp['normalized_importance'] = feat_imp['importance'] / feat_imp['importance'].sum()
feat_imp['cumulative_importance'] = np.cumsum(feat_imp['normalized_importance'])
record_low_importance = feat_imp[feat_imp['cumulative_importance'] > 0.95]

to_drop = list(record_low_importance['feature'])
print(to_drop)

In [None]:
x_train.drop(to_drop, axis=1, inplace=True)
x_test.drop(to_drop, axis=1, inplace=True)

In [None]:
train_all_matrix = xgb.DMatrix(x_train, y_train, missing=np.nan)
test_matrix = xgb.DMatrix(x_test, label=val_y, missing=np.nan)

model = xgb.train(params,
                  train_all_matrix,
                  num_boost_round=model_eval.best_ntree_limit + 20)

test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)

In [None]:
sub['temperature'] = test_pred[:, 0] + test_df['outdoorTemp'].values
sub.to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), str(mse)[2:7]), index=False)