In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import gc
import xgboost as xgb
import math
from joblib import Parallel, delayed

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings

# Suppress every Python-level warning so the long training logs stay readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Disable pandas' column/row truncation when frames are displayed.
# The fully-qualified option names are required: the bare patterns
# 'max_columns' / 'max_rows' also match 'styler.render.max_columns' /
# 'styler.render.max_rows' on pandas >= 1.4 and raise
# OptionError("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def correlation(df, threshold=0.98):
    """
    Find columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``df`` is computed with ``df.corr()``.
    A column is reported when the absolute correlation between it and at least
    one column that precedes it (in correlation-matrix order) is strictly
    greater than ``threshold``. The first column of such a pair is never
    reported on account of that pair.

    @param df: DataFrame whose columns are compared.
    @param threshold: absolute-correlation cut-off (exclusive).
    @return: set of column labels flagged as redundant.
    """
    corr_matrix = df.corr()
    # NaN correlations compare as False, so they are never flagged.
    above_cutoff = np.abs(corr_matrix.values) > threshold
    # Strictly-lower triangle == pairs (i, j) with j < i.
    has_earlier_twin = np.tril(above_cutoff, k=-1).any(axis=1)
    return set(corr_matrix.columns[has_earlier_twin])

In [3]:
# Raw train / test tables stored as HDF5.
train_df = pd.read_hdf('../../input/train.h5')
test_df = pd.read_hdf('../../input/test.h5')
# Submission skeleton: one row per test record, keyed by its 'time' value.
sub = test_df['time'].to_frame()

In [4]:
# Rows without a target value cannot be used for training.
train_df = train_df[train_df['temperature'].notnull()]
# Fill remaining gaps with the next valid observation. DataFrame.bfill() is
# the supported spelling; fillna(method='bfill') is deprecated in current
# pandas releases.
train_df = train_df.bfill()
test_df = test_df.bfill()
gc.collect()

64

In [5]:
# Replace the original headers with short English names. The train table
# carries one extra trailing column, the target 'temperature'.
train_df.columns = [
    'time', 'year', 'month', 'day', 'hour', 'min', 'sec',
    'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
    'indoorHum', 'indoorAtmo',
    'temperature',
]
# The test table has the same columns minus the target.
test_df.columns = list(train_df.columns[:-1])

In [6]:
# The model learns the indoor-minus-outdoor temperature difference; the
# outdoor temperature of the test rows is kept so it can be added back to
# the predictions later.
y_train = train_df['temperature'].values - train_df['outdoorTemp'].values
test_temp = test_df['outdoorTemp'].values

# Number of training rows, used further down to split the precomputed feature
# table back into train / test. Derived from the data instead of the former
# hard-coded 24807 so the targets and the feature slice cannot drift apart.
# NOTE(review): assumes the feature files list the train rows first — confirm.
train_count = len(y_train)

del train_df, test_df
gc.collect()

40

In [7]:
# Precomputed feature tables (rolling-window, aggregate and combination
# features, going by the file names); each one is joined on 'time' below.
rolling, agg, combine = (
    pd.read_hdf('../../input/features/rolling.h5'),
    pd.read_hdf('../../input/features/agg.h5'),
    pd.read_hdf('../../input/features/combine.h5'),
)

In [8]:
# Sanity check: one (rows, columns) tuple per feature table, one per line.
print(rolling.shape, agg.shape, combine.shape, sep='\n')

(25213, 62)
(25213, 275)
(25213, 148)


In [9]:
# Join the three feature tables on the shared 'time' key, freeing each input
# as soon as it has been consumed to keep peak memory down.
data_df = rolling.merge(agg, on='time')
del rolling, agg
gc.collect()

data_df = data_df.merge(combine, on='time')
del combine
gc.collect()

0

In [10]:
# The first `train_count` rows of the merged table are the training rows,
# everything after that is the test set. Both halves get a fresh 0-based index.
train_df = data_df.iloc[:train_count].copy().reset_index(drop=True)
test_df = data_df.iloc[train_count:].copy().reset_index(drop=True)

del data_df
gc.collect()

20

In [11]:
# 'time' was only needed as the join key; it is not a model feature.
drop_columns = ['time']

x_train = train_df.drop(columns=drop_columns)
x_test = test_df.drop(columns=drop_columns)

del train_df, test_df
gc.collect()

20

In [12]:
# Columns whose absolute correlation with an earlier column exceeds 0.98.
col_corr = correlation(x_train, threshold=0.98)
print(col_corr)

{'indoorAtmo-outdoorAtmo_y', 'hour_y', 'min_y', 'month_y', 'indoorHum-outdoorHum_y', 'indoorHum_y'}


In [13]:
# Number of columns flagged as redundant by the correlation filter.
print(len(col_corr))

6


In [14]:
# Remove the redundant columns from both matrices so train and test keep
# identical feature sets.
x_train = x_train.drop(columns=list(col_corr))
x_test = x_test.drop(columns=list(col_corr))
gc.collect()

40

In [15]:
# Hold-out split by row position: the first 80% of the rows train the model,
# the last 20% are used for validation / early stopping.
nums = int(x_train.shape[0] * 0.8)

trn_x, trn_y, val_x, val_y = x_train[:nums], y_train[:nums], x_train[nums:], y_train[nums:]

train_matrix = xgb.DMatrix(trn_x, label=trn_y, missing=np.nan)
valid_matrix = xgb.DMatrix(val_x, label=val_y, missing=np.nan)
train_all_matrix = xgb.DMatrix(x_train, label=y_train, missing=np.nan)
# The test rows have no targets. The previous `label=val_y` attached the
# validation targets (a different number of rows) to the test matrix.
test_matrix = xgb.DMatrix(x_test, missing=np.nan)

params = {
    'booster': 'gbtree',
    'eval_metric': 'mae',
    'min_child_weight': 5,
    'max_depth': 8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'eta': 0.01,
    'seed': 2020,
    # NOTE(review): thread count is machine-specific.
    'nthread': 36,
    # `silent` has been removed from XGBoost; `verbosity: 0` is its replacement.
    'verbosity': 0
}

# The last entry of the watchlist ('eval') is the one early stopping monitors.
watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

# Up to 50000 rounds, logging every 500, stopping once the validation MAE has
# not improved for 1000 consecutive rounds.
model_eval = xgb.train(params,
                       train_matrix,
                       num_boost_round=50000,
                       evals=watchlist,
                       verbose_eval=500,
                       early_stopping_rounds=1000)
# Predict the hold-out rows with the best iteration only; column vector (n, 1).
val_pred = model_eval.predict(valid_matrix, ntree_limit=model_eval.best_ntree_limit).reshape(-1, 1)

[0]	train-mae:0.63441	eval-mae:0.33368
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 1000 rounds.
[500]	train-mae:0.05305	eval-mae:0.16394
[1000]	train-mae:0.04030	eval-mae:0.16246
[1500]	train-mae:0.03351	eval-mae:0.16178
[2000]	train-mae:0.02870	eval-mae:0.16120
[2500]	train-mae:0.02498	eval-mae:0.16090
[3000]	train-mae:0.02203	eval-mae:0.16074
[3500]	train-mae:0.01970	eval-mae:0.16086
[4000]	train-mae:0.01778	eval-mae:0.16068
[4500]	train-mae:0.01615	eval-mae:0.16051
[5000]	train-mae:0.01474	eval-mae:0.16034
[5500]	train-mae:0.01354	eval-mae:0.16033
[6000]	train-mae:0.01249	eval-mae:0.16032
[6500]	train-mae:0.01156	eval-mae:0.16013
[7000]	train-mae:0.01074	eval-mae:0.16005
[7500]	train-mae:0.01001	eval-mae:0.16002
[8000]	train-mae:0.00934	eval-mae:0.16003
Stopping. Best iteration:
[7241]	train-mae:0.01039	eval-mae:0.15997



In [16]:
# Hold-out mean squared error of the predicted temperature difference.
mse = mean_squared_error(y_true=val_y, y_pred=val_pred)
print("mse_score:", mse)
# Characters 2..6 of the score's string form (for a score below 1 these are
# the first five decimals); the same tag is used in the submission file name.
print("mse_score:", str(mse)[2:7])

mse_score: 0.04100635758796118
mse_score: 04100


In [17]:
# Per-feature 'gain' importance taken from the hold-out model (`model_eval`),
# returned as a {feature name: score} mapping.
feat_imp_dict = model_eval.get_score(importance_type='gain')

In [18]:
# Rank the features by gain. Booster.get_score() leaves out features that
# were never used in a split, so the scores are re-aligned on the full list of
# training columns with a zero for the missing ones. Without this, completely
# unused features could never reach `to_drop`, while weakly useful ones did.
feat_imp = (
    pd.Series(feat_imp_dict, dtype='float64')
    .reindex(x_train.columns, fill_value=0.0)
    .sort_values(ascending=False)
    .rename_axis('feature')
    .reset_index(name='importance')
)
feat_imp['normalized_importance'] = feat_imp['importance'] / feat_imp['importance'].sum()
feat_imp['cumulative_importance'] = feat_imp['normalized_importance'].cumsum()
# Everything past the point where the running share of gain exceeds 95% is
# treated as low importance (zero-gain features always fall in this tail).
record_low_importance = feat_imp[feat_imp['cumulative_importance'] > 0.95]

to_drop = list(record_low_importance['feature'])
print(to_drop)

['outdoorTemp_MDH_outdoorTemp_medi_multyply', 'outdoorTemp_200_bin_indoorAtmo_skew', 'outdoorTemp_50_bin_outdoorTemp_skew', 'outdoorTemp_200_bin_indoorAtmo-outdoorAtmo_sum', 'outdoorTemp_200_bin_indoorAtmo-outdoorAtmo_skew', 'outdoorHum_50_bin_indoorAtmo-outdoorAtmo_skew', 'MD_outdoorTemp_skew', 'indoorHum_20_bin_indoorHum-outdoorHum_max', 'MDH_indoorHum-outdoorHum_max_MDH_indoorAtmo-outdoorAtmo_medi_multyply', 'hit_t_4_outdoorTemp_skew', 'indoorHum-outdoorHum_20_bin_outdoorTemp_skew', 'MD_indoorHum-outdoorHum_medi_MDH_indoorAtmo-outdoorAtmo_medi_subtract', 'MDH_indoorAtmo-outdoorAtmo_medi_indoorHum-outdoorHum_multyply', 'indoorHum_4D_rolling_median', 'outdoorHum_20_bin_indoorAtmo-outdoorAtmo_min', 'outdoorTemp_100_bin_indoorAtmo-outdoorAtmo_skew', 'MDH_indoorAtmo-outdoorAtmo_medi_MDH_indoorHum-outdoorHum_max_multyply', 'outdoorHum_50_bin_outdoorTemp_skew', 'indoorAtmo_expanding_max', 'outdoorAtmo_50_bin_outdoorTemp_skew', 'indoorHum-outdoorHum_MDH_indoorAtmo_mean_subtract', 'outdoorHu

In [19]:
# Apply the importance-based selection to both matrices so their columns
# stay aligned.
x_train = x_train.drop(columns=to_drop)
x_test = x_test.drop(columns=to_drop)

In [20]:
# Rebuild the matrices on the reduced feature set.
train_all_matrix = xgb.DMatrix(x_train, label=y_train, missing=np.nan)
# The test rows have no targets. The previous `label=val_y` attached the
# validation targets (a different number of rows) to the test matrix.
test_matrix = xgb.DMatrix(x_test, missing=np.nan)

# Refit on every training row for a fixed number of rounds: the best round
# count found on the hold-out split plus 20 extra rounds.
# NOTE(review): the +20 is presumably a margin for the larger training set — confirm.
model = xgb.train(params,
                  train_all_matrix,
                  num_boost_round=model_eval.best_ntree_limit + 20)

# Column vector (n, 1) of predicted indoor-minus-outdoor differences.
test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)

In [22]:
# The model predicts the indoor-minus-outdoor difference, so the outdoor
# temperature is added back to obtain the indoor temperature.
sub['temperature'] = test_temp + test_pred[:, 0]
# The file name carries today's date and the hold-out MSE tag.
sub.to_csv(
    '../../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), str(mse)[2:7]),
    index=False,
)