In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import gc
import math
from joblib import Parallel, delayed

from tqdm import tqdm
import time
import warnings

# Suppress every warning so cell output stays readable. NOTE: this is a blanket
# filter, so pandas deprecation and SettingWithCopy warnings are hidden as well.
warnings.filterwarnings('ignore')

# Never truncate rows/columns when a DataFrame is displayed. The fully-qualified
# option names are used because the bare patterns 'max_columns' / 'max_rows'
# match more than one option on recent pandas releases and raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def timestamp2string(timeStamp):
    """Convert a Unix timestamp into a second-resolution ``datetime``.

    The timestamp is interpreted by ``datetime.fromtimestamp`` (local time),
    rendered as ``YYYY-MM-DD HH:MM:SS`` and parsed back, which discards any
    sub-second part.

    Returns the resulting ``datetime``, or the empty string ``''`` when any
    step of the conversion raises.
    """
    layout = "%Y-%m-%d %H:%M:%S"  # e.g. 2015-08-28 16:43:37
    try:
        rendered = datetime.fromtimestamp(timeStamp).strftime(layout)
        return datetime.strptime(rendered, layout)
    except Exception:
        # Best effort: callers receive '' instead of an exception.
        return ''

In [3]:
def get_psi(c, train=None, test=None):
    """Compute the Population Stability Index (PSI) of one feature.

    The feature is binned on the training quantiles at 0, 0.2, ..., 1.0 (the
    outer edges are widened to -inf / +inf), and the PSI is the sum over bins
    of ``(test_rate - train_rate) * (ln(test_rate) - ln(train_rate))``.
    A bin that is empty on one side therefore yields an infinite PSI.

    Parameters
    ----------
    c : column label
        Feature to evaluate; must exist in both frames.
    train, test : pandas.DataFrame, optional
        Frames to compare. When omitted, the module-level ``x_train`` and
        ``x_test`` are used, so existing ``get_psi(c)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``['变量名', 'PSI']`` (feature name, PSI value),
        or an empty DataFrame if the computation fails for this feature
        (e.g. the feature is constant or non-numeric, so no bins can be built).
    """
    try:
        if train is None:
            train = x_train
        if test is None:
            test = x_test

        # Missing values are replaced by a sentinel so they are binned rather
        # than dropped (presumably below every real value -- TODO confirm).
        t_train = train[c].fillna(-998)
        t_test = test[c].fillna(-998)

        # Cut points: de-duplicated training quantiles, open-ended at both sides
        # so that test values outside the training range still land in a bin.
        bins = sorted({t_train.quantile(q) for q in np.arange(0, 1.1, 0.2)})
        bins[0] = -np.inf
        bins[-1] = np.inf

        # Share of rows per bin. value_counts() on the categorical result of
        # pd.cut reports every bin, including empty ones.
        train_bins = pd.cut(t_train, bins).value_counts().sort_index()
        test_bins = pd.cut(t_test, bins).value_counts().sort_index()
        train_rate = train_bins / train_bins.sum()
        test_rate = test_bins / test_bins.sum()

        # Series.sum() skips NaN terms (bins that are empty on both sides).
        psi = ((test_rate - train_rate) * (np.log(test_rate) - np.log(train_rate))).sum()

        print(c, 'done')
        return pd.DataFrame([[c, psi]], columns=['变量名', 'PSI'])
    except Exception as exc:
        # Keep the best-effort contract (one bad feature must not abort the
        # whole run) but report why it failed instead of hiding the cause.
        print(c, 'error', repr(exc))
        return pd.DataFrame()

In [4]:
def correlation(df, threshold=0.98):
    """
    Find columns that are highly correlated with an earlier column.

    Computes ``df.corr()`` and flags every column whose absolute correlation
    with at least one column positioned before it is strictly greater than
    ``threshold``. The earlier column of each pair is never flagged because
    of that pair.

    @param df: DataFrame whose columns are compared
    @param threshold: absolute-correlation cut-off
    @return: set with the names of the flagged columns
    """
    corr_matrix = df.corr()
    strength = corr_matrix.abs().values
    return {
        name
        for position, name in enumerate(corr_matrix.columns)
        # Row slice [:position] is the strictly-lower triangle of this column.
        if (strength[position, :position] > threshold).any()
    }

In [5]:
# Load the train and test sets from HDF5 files; the paths are relative to the
# notebook's working directory.
train_df = pd.read_hdf('../../input/train.h5')
test_df = pd.read_hdf('../../input/test.h5')
# sub = pd.DataFrame(test_df['time'])

In [6]:
# Rows without a target value cannot be used for training.
train_df = train_df[train_df['temperature'].notnull()]
# Back-fill remaining gaps with the next valid observation. `.bfill()` replaces
# the deprecated `fillna(method='bfill')` spelling and gives the same result.
# NOTE: NaNs with no later valid value in their column are left as-is.
train_df = train_df.bfill()
test_df = test_df.bfill()

In [7]:
# Assign column names purely by position. The train frame has one extra
# trailing column, the target `temperature`, which the test frame lacks.
shared_columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec',
                  'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                  'indoorHum', 'indoorAtmo']
train_df.columns = shared_columns + ['temperature']
test_df.columns = shared_columns

In [8]:
# Remove the `year` and `sec` columns from both frames.
unused_time_parts = ['year', 'sec']
train_df = train_df.drop(columns=unused_time_parts)
test_df = test_df.drop(columns=unused_time_parts)

In [9]:
print('train_df.shape: ', train_df.shape)
# Keep only the training rows whose outdoor temperature lies inside the range
# observed in the test set (both bounds inclusive).
lowest = test_df['outdoorTemp'].min()
highest = test_df['outdoorTemp'].max()
in_test_range = train_df['outdoorTemp'].between(lowest, highest)
train_df = train_df.loc[in_test_range]
print('处理后 train_df.shape: ', train_df.shape)

train_df.shape:  (24807, 11)
处理后 train_df.shape:  (19338, 11)


In [10]:
# Number of training rows; used later to split the combined frame back into
# its train and test parts.
train_count = len(train_df)
# Regression target: `temperature` minus `outdoorTemp`, as a plain array.
target_temp = train_df['temperature'].values
outdoor_temp = train_df['outdoorTemp'].values
y_train = target_temp - outdoor_temp

In [11]:
# Display the (rows, columns) of the training frame after filtering.
train_df.shape

(19338, 11)

In [12]:
# Stack train on top of test with a fresh 0..n-1 index so the features below are
# computed over both sets at once. The test frame has no `temperature` column,
# so that column is NaN for the test rows.
data_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# The separate frames are no longer needed; release their memory.
del train_df, test_df
gc.collect()

20

In [13]:
# Raw sensor columns that the window features are derived from.
numerical_features = ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']
# Names of the row-to-row difference columns, one per raw column.
diff_features = ['{}_diff'.format(i) for i in numerical_features]
# Every column that receives rolling/expanding statistics.
numerical_diff_features = numerical_features + diff_features

In [14]:
# First-order difference (current row minus previous row) of each raw column.
# NOTE(review): diff() runs over the concatenated frame, so the first test row
# is differenced against the last training row, and the very first row is NaN.
for i in tqdm(numerical_features):
    data_df['{}_diff'.format(i)] = data_df[i].diff()

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 835.49it/s]


In [15]:
# Back-fill NaNs with the next valid value in each column (e.g. the first row of
# every *_diff column, which diff() leaves empty). `.bfill()` replaces the
# deprecated `fillna(method='bfill')` spelling and gives the same result.
data_df = data_df.bfill()

## Rolling and expanding window features

In [16]:
# Convert the Unix `time` column to second-resolution datetimes for the
# time-based windows below. timestamp2string uses datetime.fromtimestamp (local
# time) and yields '' for values it cannot convert.
data_df['datetime'] = data_df['time'].apply(timestamp2string)

In [17]:
def _attach_hourly_stats(frame, column, prefix, open_window):
    """Return ``frame`` with seven per-hour window statistics of ``column`` added.

    The rows are indexed by ``datetime`` and grouped by ``hour``;
    ``open_window`` turns the grouped column into a rolling/expanding window.
    The mean, median, max, min, sum, std and skew of each window are merged
    back on ``['datetime', 'hour']`` as columns named ``<prefix>_<stat>``.
    """
    stat_names = ['mean', 'median', 'max', 'min', 'sum', 'std', 'skew']
    spec = {'{}_{}'.format(prefix, stat): stat for stat in stat_names}

    by_time = frame.set_index('datetime')
    stats = open_window(by_time.groupby('hour')[column]).agg(spec).reset_index()
    # NOTE(review): the left merge assumes (datetime, hour) identifies a row;
    # duplicated keys would multiply rows -- confirm against the data.
    return by_time.reset_index().merge(stats, on=['datetime', 'hour'], how='left')


# Time-based rolling windows of 1, 2 and 3 days. closed='left' excludes the
# current row; windows with fewer than two observations produce NaN.
for column in numerical_diff_features:
    for span in ['1D', '2D', '3D']:
        data_df = _attach_hourly_stats(
            data_df, column, '{}_{}_rolling'.format(column, span),
            lambda grouped, span=span: grouped.rolling(span, closed='left', min_periods=2),
        )
        gc.collect()


# Expanding windows over all rows seen so far within each hour group
# (at least two observations required).
for column in numerical_diff_features:
    data_df = _attach_hourly_stats(
        data_df, column, '{}_expanding'.format(column),
        lambda grouped: grouped.expanding(min_periods=2),
    )
    gc.collect()

In [18]:
# `datetime` was only needed as the index/merge key for the window features.
data_df.drop('datetime', axis=1, inplace=True)

In [19]:
# Windows with fewer than two observations produced NaN (min_periods=2);
# back-fill them with the next valid value in each column. `.bfill()` replaces
# the deprecated `fillna(method='bfill')` spelling and gives the same result.
data_df = data_df.bfill()

## 计算 PSI(Population Stability Index)

In [20]:
# Undo the earlier concat by position: the first `train_count` rows are the
# training set, the rest is the test set. Each part gets a fresh 0-based index.
train_df = data_df.iloc[:train_count].reset_index(drop=True)
test_df = data_df.iloc[train_count:].reset_index(drop=True)

# The combined frame is no longer needed; release its memory.
del data_df
gc.collect()

60

In [21]:
# Keep the timestamps aside; `time` is excluded from the feature matrices below
# and re-attached before saving.
train_time = train_df['time']
test_time = test_df['time']

In [22]:
# Columns that are not model features: the raw timestamp and the target.
drop_columns = ["time", "temperature"]

# Every remaining column is a candidate feature.
features = train_df[:1].drop(drop_columns, axis=1).columns
# Explicit copies: both frames are modified in place further down (columns
# dropped, `time` re-attached), which must not act on a selection of
# train_df / test_df (SettingWithCopy).
x_train = train_df[features].copy()
x_test = test_df[features].copy()

In [23]:
# Number of candidate features before selection.
print(len(features))

294


In [24]:
# Compute the PSI of every candidate feature using 4 joblib workers.
# get_psi(c) reads the module-level x_train / x_test defined above.
psi_res = Parallel(n_jobs=4)(delayed(get_psi)(c) for c in tqdm(features))
# Each call returns a small DataFrame; stack them into one table.
psi_df = pd.concat(psi_res)

100%|████████████████████████████████████████████████████████████████████████████████| 294/294 [00:05<00:00, 50.61it/s]


In [25]:
# Select the features whose PSI exceeds 0.2. NOTE: from here on `features`
# holds the columns to DROP (they are removed from x_train / x_test below),
# not the columns to keep.
features = list(psi_df[psi_df['PSI'] > 0.2]['变量名'].values)

In [26]:
# List the features with PSI > 0.2 (the ones about to be removed).
print(features)

['day', 'outdoorTemp', 'outdoorAtmo', 'indoorAtmo', 'outdoorTemp_diff', 'outdoorHum_diff', 'outdoorAtmo_diff', 'indoorHum_diff', 'indoorAtmo_diff', 'outdoorTemp_1D_rolling_mean', 'outdoorTemp_1D_rolling_median', 'outdoorTemp_1D_rolling_max', 'outdoorTemp_1D_rolling_min', 'outdoorTemp_1D_rolling_sum', 'outdoorTemp_1D_rolling_std', 'outdoorTemp_1D_rolling_skew', 'outdoorTemp_2D_rolling_mean', 'outdoorTemp_2D_rolling_median', 'outdoorTemp_2D_rolling_max', 'outdoorTemp_2D_rolling_min', 'outdoorTemp_2D_rolling_sum', 'outdoorTemp_2D_rolling_skew', 'outdoorTemp_3D_rolling_mean', 'outdoorTemp_3D_rolling_median', 'outdoorTemp_3D_rolling_max', 'outdoorTemp_3D_rolling_min', 'outdoorTemp_3D_rolling_sum', 'outdoorHum_1D_rolling_mean', 'outdoorHum_1D_rolling_median', 'outdoorHum_1D_rolling_max', 'outdoorHum_1D_rolling_min', 'outdoorHum_1D_rolling_sum', 'outdoorHum_1D_rolling_std', 'outdoorHum_1D_rolling_skew', 'outdoorHum_2D_rolling_sum', 'outdoorHum_2D_rolling_skew', 'outdoorHum_3D_rolling_sum', 'o

In [27]:
# Number of features with PSI > 0.2.
print(len(features))

240


In [28]:
# Remove the features selected by the PSI filter (here `features` is the list
# of columns with PSI > 0.2) from both matrices, then reclaim memory.
x_train = x_train.drop(columns=features)
x_test = x_test.drop(columns=features)
gc.collect()

665

In [29]:
# Display the (rows, columns) of the training matrix after the PSI filter.
x_train.shape

(19338, 54)

In [30]:
# Preview the first and last ten training rows. pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
pd.concat([x_train.head(10), x_train.tail(10)])

Unnamed: 0,month,hour,min,outdoorHum,indoorHum,outdoorTemp_2D_rolling_std,outdoorTemp_3D_rolling_std,outdoorTemp_3D_rolling_skew,outdoorHum_2D_rolling_mean,outdoorHum_2D_rolling_median,outdoorHum_2D_rolling_max,outdoorHum_2D_rolling_min,outdoorHum_2D_rolling_std,outdoorHum_3D_rolling_mean,outdoorHum_3D_rolling_median,outdoorHum_3D_rolling_max,outdoorHum_3D_rolling_min,outdoorHum_3D_rolling_std,outdoorHum_3D_rolling_skew,indoorHum_2D_rolling_mean,indoorHum_2D_rolling_median,indoorHum_2D_rolling_min,indoorHum_2D_rolling_std,indoorHum_3D_rolling_mean,indoorHum_3D_rolling_median,indoorHum_3D_rolling_min,indoorHum_3D_rolling_std,indoorHum_3D_rolling_skew,indoorAtmo_2D_rolling_mean,indoorAtmo_3D_rolling_mean,outdoorTemp_diff_1D_rolling_max,outdoorTemp_diff_2D_rolling_sum,outdoorTemp_diff_2D_rolling_skew,outdoorTemp_diff_3D_rolling_sum,outdoorTemp_diff_3D_rolling_skew,outdoorHum_diff_1D_rolling_sum,outdoorHum_diff_2D_rolling_sum,outdoorHum_diff_3D_rolling_sum,outdoorAtmo_diff_2D_rolling_sum,outdoorAtmo_diff_3D_rolling_sum,indoorHum_diff_2D_rolling_sum,indoorHum_diff_3D_rolling_sum,indoorHum_diff_3D_rolling_skew,indoorAtmo_diff_2D_rolling_sum,indoorAtmo_diff_3D_rolling_sum,indoorAtmo_expanding_max,outdoorTemp_diff_expanding_mean,outdoorHum_diff_expanding_mean,outdoorHum_diff_expanding_skew,outdoorAtmo_diff_expanding_skew,indoorHum_diff_expanding_mean,indoorAtmo_diff_expanding_mean,indoorAtmo_diff_expanding_median,indoorAtmo_diff_expanding_skew
0,3,1,0,85.0,80.0,0.070711,0.070711,-1.732051,84.5,84.5,85.0,84.0,0.707107,84.5,84.5,85.0,84.0,0.707107,1.732051,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,992.5,992.5,0.1,0.2,-1.732051,0.2,-1.732051,-2.0,-2.0,-2.0,-0.2,-0.2,0.0,0.0,3.605551,0.4,0.4,992.6,0.1,-1.0,1.732051,-1.732051,0.0,0.2,0.2,-2.0
1,3,1,1,84.0,80.0,0.070711,0.070711,-1.732051,84.5,84.5,85.0,84.0,0.707107,84.5,84.5,85.0,84.0,0.707107,1.732051,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,992.5,992.5,0.1,0.2,-1.732051,0.2,-1.732051,-2.0,-2.0,-2.0,-0.2,-0.2,0.0,0.0,3.605551,0.4,0.4,992.6,0.1,-1.0,1.732051,-1.732051,0.0,0.2,0.2,-2.0
2,3,1,2,84.0,80.0,0.070711,0.070711,-1.732051,84.5,84.5,85.0,84.0,0.707107,84.5,84.5,85.0,84.0,0.707107,1.732051,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,992.5,992.5,0.1,0.2,-1.732051,0.2,-1.732051,-2.0,-2.0,-2.0,-0.2,-0.2,0.0,0.0,3.605551,0.4,0.4,992.8,0.066667,-0.666667,1.732051,-1.732051,0.0,0.2,0.2,-2.0
3,3,1,3,85.0,80.0,0.057735,0.057735,-1.732051,84.333333,84.0,85.0,84.0,0.57735,84.333333,84.0,85.0,84.0,0.57735,1.732051,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,992.6,992.6,0.1,0.2,-1.732051,0.2,-1.732051,-2.0,-2.0,-2.0,-0.5,-0.5,0.0,0.0,3.605551,0.6,0.6,992.8,0.025,-0.25,0.854563,1.539601,0.0,0.1,0.2,-2.0
4,3,1,4,85.0,80.0,0.057735,0.057735,1.380341e-08,84.5,84.5,85.0,84.0,0.57735,84.5,84.5,85.0,84.0,0.57735,0.0,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,992.6,992.6,0.1,0.1,-0.854563,0.1,-0.854563,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,3.605551,0.4,0.4,992.8,0.02,-0.2,0.512241,1.145405,0.0,0.08,0.2,-1.257788
5,3,1,5,85.0,80.0,0.054772,0.054772,0.6085807,84.6,85.0,85.0,84.0,0.547723,84.6,85.0,85.0,84.0,0.547723,-0.608581,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,992.6,992.6,0.1,0.1,-0.512241,0.1,-0.512241,-1.0,-1.0,-1.0,-0.5,-0.5,0.0,0.0,3.605551,0.4,0.4,992.9,0.016667,-0.166667,0.31257,0.365772,0.0,0.116667,0.2,-1.235703
6,3,1,6,85.0,80.0,0.05164,0.05164,0.9682458,84.666667,85.0,85.0,84.0,0.516398,84.666667,85.0,85.0,84.0,0.516398,-0.968246,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,992.65,992.65,0.1,0.1,-0.31257,0.1,-0.31257,-1.0,-1.0,-1.0,-0.1,-0.1,0.0,0.0,3.605551,0.7,0.7,992.9,0.0,-0.142857,0.173897,-0.042172,0.0,-83.228571,0.2,-2.645748
7,3,1,7,85.0,80.0,0.069007,0.069007,-0.1738965,84.714286,85.0,85.0,84.0,0.48795,84.714286,85.0,85.0,84.0,0.48795,-1.229634,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,909.357143,909.357143,0.1,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.4,0.4,0.0,0.0,3.605551,-582.6,-582.6,992.9,0.0,-0.125,0.067843,0.031749,0.0,0.075,0.2,-0.001604
8,3,1,8,85.0,80.0,0.075593,0.075593,8.823872e-10,84.75,85.0,85.0,84.0,0.46291,84.75,85.0,85.0,84.0,0.46291,-1.440165,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,919.7875,919.7875,0.1,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.4,0.4,0.0,0.0,3.605551,0.6,0.6,993.2,0.0,-0.111111,-0.018287,0.273395,0.0,0.111111,0.2,-0.002131
9,3,1,9,85.0,80.0,0.078174,0.078174,0.2159696,84.777778,85.0,85.0,84.0,0.440959,84.777778,85.0,85.0,84.0,0.440959,-1.619848,80.0,80.0,80.0,0.0,80.0,80.0,80.0,0.0,3.605551,927.944444,927.944444,0.1,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.2,0.2,0.0,0.0,3.605551,1.0,1.0,993.2,0.0,-0.1,-0.09112,0.002096,0.0,0.1,0.2,-0.002046


In [31]:
# Preview the first rows of the test matrix.
x_test.head()

Unnamed: 0,month,hour,min,outdoorHum,indoorHum,outdoorTemp_2D_rolling_std,outdoorTemp_3D_rolling_std,outdoorTemp_3D_rolling_skew,outdoorHum_2D_rolling_mean,outdoorHum_2D_rolling_median,outdoorHum_2D_rolling_max,outdoorHum_2D_rolling_min,outdoorHum_2D_rolling_std,outdoorHum_3D_rolling_mean,outdoorHum_3D_rolling_median,outdoorHum_3D_rolling_max,outdoorHum_3D_rolling_min,outdoorHum_3D_rolling_std,outdoorHum_3D_rolling_skew,indoorHum_2D_rolling_mean,indoorHum_2D_rolling_median,indoorHum_2D_rolling_min,indoorHum_2D_rolling_std,indoorHum_3D_rolling_mean,indoorHum_3D_rolling_median,indoorHum_3D_rolling_min,indoorHum_3D_rolling_std,indoorHum_3D_rolling_skew,indoorAtmo_2D_rolling_mean,indoorAtmo_3D_rolling_mean,outdoorTemp_diff_1D_rolling_max,outdoorTemp_diff_2D_rolling_sum,outdoorTemp_diff_2D_rolling_skew,outdoorTemp_diff_3D_rolling_sum,outdoorTemp_diff_3D_rolling_skew,outdoorHum_diff_1D_rolling_sum,outdoorHum_diff_2D_rolling_sum,outdoorHum_diff_3D_rolling_sum,outdoorAtmo_diff_2D_rolling_sum,outdoorAtmo_diff_3D_rolling_sum,indoorHum_diff_2D_rolling_sum,indoorHum_diff_3D_rolling_sum,indoorHum_diff_3D_rolling_skew,indoorAtmo_diff_2D_rolling_sum,indoorAtmo_diff_3D_rolling_sum,indoorAtmo_expanding_max,outdoorTemp_diff_expanding_mean,outdoorHum_diff_expanding_mean,outdoorHum_diff_expanding_skew,outdoorAtmo_diff_expanding_skew,indoorHum_diff_expanding_mean,indoorAtmo_diff_expanding_mean,indoorAtmo_diff_expanding_median,indoorAtmo_diff_expanding_skew
0,4,1,0,91.0,88.0,0.120877,0.235342,-0.688199,82.59322,83.0,83.0,81.0,0.560746,85.102564,83.0,91.0,81.0,3.001105,0.538118,80.169492,80.0,79.0,0.460605,81.282051,81.0,79.0,1.536039,0.703722,991.627119,988.876068,0.2,0.3,0.176742,-1.4,-6.438394,-4.0,0.0,1.0,2.3,7.9,4.0,5.0,-1.267147,1.8,6.5,995.8,-0.047023,0.192366,10.523273,-0.345797,0.177099,0.018168,0.0,-0.004077
1,4,1,30,91.0,88.0,0.117958,0.252035,-0.570844,82.580645,82.0,91.0,81.0,1.668816,84.876404,83.0,91.0,81.0,3.309148,0.734793,80.645161,80.0,80.0,1.45025,81.314607,80.0,79.0,1.787555,1.135313,991.625806,989.862921,0.2,-2.0,-5.393795,-1.9,-8.544548,-4.0,12.0,17.0,5.4,6.9,14.0,19.0,8.041896,6.6,5.9,995.8,-0.046799,0.192073,10.531475,-0.346022,0.176829,0.017683,0.0,-0.00406
2,4,2,0,92.0,89.0,0.107304,0.296221,0.009143,82.4,82.0,83.0,81.0,0.643086,86.680672,83.0,92.0,81.0,4.382037,0.01542,80.583333,81.0,80.0,0.530164,82.865546,82.0,80.0,2.386134,0.051572,990.843333,983.503361,0.2,-0.4,-1.129685,-0.6,-0.299898,-4.0,2.0,3.0,-1.1,-1.4,0.0,1.0,0.06957,-0.8,0.7,995.8,-0.0063,0.033926,0.675681,-0.004801,0.025848,-0.008239,0.0,0.000459
3,4,2,30,92.0,89.0,0.062562,0.287648,-0.59424,82.645161,82.0,92.0,81.0,1.835727,85.344444,83.0,92.0,81.0,4.232462,0.687464,80.935484,81.0,80.0,1.590293,82.166667,81.0,80.0,2.352383,0.807142,990.609677,989.276667,0.2,-0.2,-0.352053,-0.5,-0.456159,-4.0,3.0,4.0,0.2,-0.5,2.0,1.0,0.083174,1.1,-0.2,995.8,-0.006452,0.033871,0.676566,-0.00482,0.025806,-0.007581,0.0,0.000425
4,4,3,0,91.0,88.0,0.183646,0.160623,1.339179,85.9,87.0,88.0,83.0,1.580603,88.084034,88.0,91.0,83.0,2.512949,-0.533343,82.433333,82.5,81.0,0.889995,83.907563,84.0,81.0,1.780267,0.114274,990.656667,988.213445,0.2,-0.4,-0.323357,-0.3,-1.025059,-4.0,4.0,2.0,0.5,1.8,3.0,1.0,0.042123,-1.5,-1.2,995.7,-0.002708,0.001805,-0.808475,0.000218,0.00361,-0.008123,0.0,-0.002081


## 计算相关性

In [32]:
# Flag training columns whose absolute correlation with an earlier column
# exceeds 0.98, and report how many were flagged.
col_corr = correlation(x_train, 0.98)
print(len(col_corr))

9


In [33]:
# Remove the columns flagged as highly correlated from both matrices.
redundant_columns = list(col_corr)
x_train = x_train.drop(columns=redundant_columns)
x_test = x_test.drop(columns=redundant_columns)

In [34]:
# Re-attach the timestamps saved earlier. The assignment aligns on the index;
# both sides come from train_df / test_df, whose index was reset at the split.
x_train['time'] = train_time
x_test['time'] = test_time

In [35]:
# Preview the final training matrix (now including `time`).
x_train.head()

Unnamed: 0,month,hour,min,outdoorHum,outdoorTemp_2D_rolling_std,outdoorTemp_3D_rolling_std,outdoorTemp_3D_rolling_skew,outdoorHum_2D_rolling_mean,outdoorHum_2D_rolling_median,outdoorHum_2D_rolling_max,outdoorHum_2D_rolling_min,outdoorHum_2D_rolling_std,outdoorHum_3D_rolling_mean,outdoorHum_3D_rolling_median,outdoorHum_3D_rolling_max,outdoorHum_3D_rolling_min,outdoorHum_3D_rolling_std,outdoorHum_3D_rolling_skew,indoorHum_3D_rolling_skew,indoorAtmo_2D_rolling_mean,indoorAtmo_3D_rolling_mean,outdoorTemp_diff_1D_rolling_max,outdoorTemp_diff_2D_rolling_sum,outdoorTemp_diff_2D_rolling_skew,outdoorTemp_diff_3D_rolling_sum,outdoorTemp_diff_3D_rolling_skew,outdoorHum_diff_1D_rolling_sum,outdoorHum_diff_2D_rolling_sum,outdoorHum_diff_3D_rolling_sum,outdoorAtmo_diff_2D_rolling_sum,outdoorAtmo_diff_3D_rolling_sum,indoorHum_diff_2D_rolling_sum,indoorHum_diff_3D_rolling_sum,indoorHum_diff_3D_rolling_skew,indoorAtmo_diff_2D_rolling_sum,indoorAtmo_diff_3D_rolling_sum,indoorAtmo_expanding_max,outdoorTemp_diff_expanding_mean,outdoorHum_diff_expanding_mean,outdoorHum_diff_expanding_skew,outdoorAtmo_diff_expanding_skew,indoorHum_diff_expanding_mean,indoorAtmo_diff_expanding_mean,indoorAtmo_diff_expanding_median,indoorAtmo_diff_expanding_skew,time
0,3,1,0,85.0,0.070711,0.070711,-1.732051,84.5,84.5,85.0,84.0,0.707107,84.5,84.5,85.0,84.0,0.707107,1.732051,3.605551,992.5,992.5,0.1,0.2,-1.732051,0.2,-1.732051,-2.0,-2.0,-2.0,-0.2,-0.2,0.0,0.0,3.605551,0.4,0.4,992.6,0.1,-1.0,1.732051,-1.732051,0.0,0.2,0.2,-2.0,1552496443
1,3,1,1,84.0,0.070711,0.070711,-1.732051,84.5,84.5,85.0,84.0,0.707107,84.5,84.5,85.0,84.0,0.707107,1.732051,3.605551,992.5,992.5,0.1,0.2,-1.732051,0.2,-1.732051,-2.0,-2.0,-2.0,-0.2,-0.2,0.0,0.0,3.605551,0.4,0.4,992.6,0.1,-1.0,1.732051,-1.732051,0.0,0.2,0.2,-2.0,1552496503
2,3,1,2,84.0,0.070711,0.070711,-1.732051,84.5,84.5,85.0,84.0,0.707107,84.5,84.5,85.0,84.0,0.707107,1.732051,3.605551,992.5,992.5,0.1,0.2,-1.732051,0.2,-1.732051,-2.0,-2.0,-2.0,-0.2,-0.2,0.0,0.0,3.605551,0.4,0.4,992.8,0.066667,-0.666667,1.732051,-1.732051,0.0,0.2,0.2,-2.0,1552496565
3,3,1,3,85.0,0.057735,0.057735,-1.732051,84.333333,84.0,85.0,84.0,0.57735,84.333333,84.0,85.0,84.0,0.57735,1.732051,3.605551,992.6,992.6,0.1,0.2,-1.732051,0.2,-1.732051,-2.0,-2.0,-2.0,-0.5,-0.5,0.0,0.0,3.605551,0.6,0.6,992.8,0.025,-0.25,0.854563,1.539601,0.0,0.1,0.2,-2.0,1552496624
4,3,1,4,85.0,0.057735,0.057735,1.380341e-08,84.5,84.5,85.0,84.0,0.57735,84.5,84.5,85.0,84.0,0.57735,0.0,3.605551,992.6,992.6,0.1,0.1,-0.854563,0.1,-0.854563,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,3.605551,0.4,0.4,992.8,0.02,-0.2,0.512241,1.145405,0.0,0.08,0.2,-1.257788,1552496682


In [36]:
# Preview the final test matrix (now including `time`).
x_test.head()

Unnamed: 0,month,hour,min,outdoorHum,outdoorTemp_2D_rolling_std,outdoorTemp_3D_rolling_std,outdoorTemp_3D_rolling_skew,outdoorHum_2D_rolling_mean,outdoorHum_2D_rolling_median,outdoorHum_2D_rolling_max,outdoorHum_2D_rolling_min,outdoorHum_2D_rolling_std,outdoorHum_3D_rolling_mean,outdoorHum_3D_rolling_median,outdoorHum_3D_rolling_max,outdoorHum_3D_rolling_min,outdoorHum_3D_rolling_std,outdoorHum_3D_rolling_skew,indoorHum_3D_rolling_skew,indoorAtmo_2D_rolling_mean,indoorAtmo_3D_rolling_mean,outdoorTemp_diff_1D_rolling_max,outdoorTemp_diff_2D_rolling_sum,outdoorTemp_diff_2D_rolling_skew,outdoorTemp_diff_3D_rolling_sum,outdoorTemp_diff_3D_rolling_skew,outdoorHum_diff_1D_rolling_sum,outdoorHum_diff_2D_rolling_sum,outdoorHum_diff_3D_rolling_sum,outdoorAtmo_diff_2D_rolling_sum,outdoorAtmo_diff_3D_rolling_sum,indoorHum_diff_2D_rolling_sum,indoorHum_diff_3D_rolling_sum,indoorHum_diff_3D_rolling_skew,indoorAtmo_diff_2D_rolling_sum,indoorAtmo_diff_3D_rolling_sum,indoorAtmo_expanding_max,outdoorTemp_diff_expanding_mean,outdoorHum_diff_expanding_mean,outdoorHum_diff_expanding_skew,outdoorAtmo_diff_expanding_skew,indoorHum_diff_expanding_mean,indoorAtmo_diff_expanding_mean,indoorAtmo_diff_expanding_median,indoorAtmo_diff_expanding_skew,time
0,4,1,0,91.0,0.120877,0.235342,-0.688199,82.59322,83.0,83.0,81.0,0.560746,85.102564,83.0,91.0,81.0,3.001105,0.538118,0.703722,991.627119,988.876068,0.2,0.3,0.176742,-1.4,-6.438394,-4.0,0.0,1.0,2.3,7.9,4.0,5.0,-1.267147,1.8,6.5,995.8,-0.047023,0.192366,10.523273,-0.345797,0.177099,0.018168,0.0,-0.004077,1554224413
1,4,1,30,91.0,0.117958,0.252035,-0.570844,82.580645,82.0,91.0,81.0,1.668816,84.876404,83.0,91.0,81.0,3.309148,0.734793,1.135313,991.625806,989.862921,0.2,-2.0,-5.393795,-1.9,-8.544548,-4.0,12.0,17.0,5.4,6.9,14.0,19.0,8.041896,6.6,5.9,995.8,-0.046799,0.192073,10.531475,-0.346022,0.176829,0.017683,0.0,-0.00406,1554226217
2,4,2,0,92.0,0.107304,0.296221,0.009143,82.4,82.0,83.0,81.0,0.643086,86.680672,83.0,92.0,81.0,4.382037,0.01542,0.051572,990.843333,983.503361,0.2,-0.4,-1.129685,-0.6,-0.299898,-4.0,2.0,3.0,-1.1,-1.4,0.0,1.0,0.06957,-0.8,0.7,995.8,-0.0063,0.033926,0.675681,-0.004801,0.025848,-0.008239,0.0,0.000459,1554228020
3,4,2,30,92.0,0.062562,0.287648,-0.59424,82.645161,82.0,92.0,81.0,1.835727,85.344444,83.0,92.0,81.0,4.232462,0.687464,0.807142,990.609677,989.276667,0.2,-0.2,-0.352053,-0.5,-0.456159,-4.0,3.0,4.0,0.2,-0.5,2.0,1.0,0.083174,1.1,-0.2,995.8,-0.006452,0.033871,0.676566,-0.00482,0.025806,-0.007581,0.0,0.000425,1554229823
4,4,3,0,91.0,0.183646,0.160623,1.339179,85.9,87.0,88.0,83.0,1.580603,88.084034,88.0,91.0,83.0,2.512949,-0.533343,0.114274,990.656667,988.213445,0.2,-0.4,-0.323357,-0.3,-1.025059,-4.0,4.0,2.0,0.5,1.8,3.0,1.0,0.042123,-1.5,-1.2,995.7,-0.002708,0.001805,-0.808475,0.000218,0.00361,-0.008123,0.0,-0.002081,1554231625


In [37]:
# List the names of the columns that survived both filters.
print(x_train.columns.to_list())

['month', 'hour', 'min', 'outdoorHum', 'outdoorTemp_2D_rolling_std', 'outdoorTemp_3D_rolling_std', 'outdoorTemp_3D_rolling_skew', 'outdoorHum_2D_rolling_mean', 'outdoorHum_2D_rolling_median', 'outdoorHum_2D_rolling_max', 'outdoorHum_2D_rolling_min', 'outdoorHum_2D_rolling_std', 'outdoorHum_3D_rolling_mean', 'outdoorHum_3D_rolling_median', 'outdoorHum_3D_rolling_max', 'outdoorHum_3D_rolling_min', 'outdoorHum_3D_rolling_std', 'outdoorHum_3D_rolling_skew', 'indoorHum_3D_rolling_skew', 'indoorAtmo_2D_rolling_mean', 'indoorAtmo_3D_rolling_mean', 'outdoorTemp_diff_1D_rolling_max', 'outdoorTemp_diff_2D_rolling_sum', 'outdoorTemp_diff_2D_rolling_skew', 'outdoorTemp_diff_3D_rolling_sum', 'outdoorTemp_diff_3D_rolling_skew', 'outdoorHum_diff_1D_rolling_sum', 'outdoorHum_diff_2D_rolling_sum', 'outdoorHum_diff_3D_rolling_sum', 'outdoorAtmo_diff_2D_rolling_sum', 'outdoorAtmo_diff_3D_rolling_sum', 'indoorHum_diff_2D_rolling_sum', 'indoorHum_diff_3D_rolling_sum', 'indoorHum_diff_3D_rolling_skew', 'ind

In [38]:
# Stack the training rows on top of the test rows for storage. Each part keeps
# its own 0-based index, so index labels repeat across the two parts.
data_df = pd.concat([x_train, x_test])
# `key` is passed by keyword: positional arguments after the path are
# deprecated in recent pandas releases.
data_df.to_hdf('../../input/features/rolling.h5', key='df')