In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import gc
import xgboost as xgb
import math
from joblib import Parallel, delayed

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings

# Notebook-wide configuration.
# NOTE(review): blanket-suppressing warnings also hides deprecation notices
# from pandas/xgboost; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Use fully-qualified option names: the short patterns 'max_columns' and
# 'max_rows' match more than one registered option on newer pandas releases
# (e.g. the styler options) and then raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Load the raw test table; its 'time' column becomes the first column of the
# submission frame that is filled in and written at the end of the notebook.
# The raw training table is not loaded here (disabled call kept for reference):
#   train_df = pd.read_hdf('../input/train.h5')
test_df = pd.read_hdf('../input/test.h5')
sub = test_df['time'].to_frame()
gc.collect()

11

In [3]:
# Assign canonical column names to the raw test table: seven timestamp fields
# followed by five sensor readings. (The disabled training-table variant used
# the same names plus a trailing 'temperature' column.)
timestamp_columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec']
sensor_columns = ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']
test_df.columns = timestamp_columns + sensor_columns

In [4]:
# Load the pre-computed feature matrices and the training target.
# NOTE(review): the "corr_95" suffix suggests a 0.95 correlation filter was
# applied upstream -- confirm against the feature-engineering notebook.
x_train, x_test, y_train = (
    pd.read_hdf(h5_path)
    for h5_path in (
        '../input/train_features_corr_95.h5',
        '../input/test_features_corr_95.h5',
        '../input/y_train.h5',
    )
)

In [5]:
# Sanity check: train and test must expose the same number of feature columns.
tuple(frame.shape for frame in (x_train, x_test))

((19338, 2300), (406, 2300))

In [6]:
# Booster configuration shared by the early-stopped evaluation run and the
# final fit on all training rows.
params = {
    'booster': 'gbtree',
    'eval_metric': 'mae',
    'min_child_weight': 5,
    'max_depth': 8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'eta': 0.01,
    'seed': 2020,
    # The former 'njobs': -1 entry is not an XGBoost parameter (the native API
    # spells it 'nthread'), so it was ignored. It is removed rather than
    # renamed: when 'nthread' is unset XGBoost already uses all available
    # threads, which is what -1 was meant to request.
    # 'silent' was superseded by 'verbosity'; 0 keeps the library quiet.
    'verbosity': 0,
}

In [7]:
# Hold-out evaluation: the first 80% of rows train the model and the last 20%
# drive early stopping. The split is positional, not shuffled.
# NOTE(review): presumably the rows are in time order, making this a
# chronological split -- confirm upstream.
nums = int(x_train.shape[0] * 0.8)

trn_x, val_x = x_train.iloc[:nums], x_train.iloc[nums:]
trn_y, val_y = y_train.iloc[:nums], y_train.iloc[nums:]

train_matrix = xgb.DMatrix(trn_x, label=trn_y, missing=np.nan)
valid_matrix = xgb.DMatrix(val_x, label=val_y, missing=np.nan)
# The full-training and test DMatrix objects are deliberately NOT built here.
# They were never used before being rebuilt after feature pruning, and the
# test matrix was wrongly given `val_y` (validation rows) as its label.

# The last entry of the watchlist is the one used for early stopping.
watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

model_eval = xgb.train(params,
                       train_matrix,
                       num_boost_round=50000,
                       evals=watchlist,
                       verbose_eval=500,
                       early_stopping_rounds=500)
# Score the hold-out rows with the best iteration only, as a column vector.
val_pred = model_eval.predict(valid_matrix, ntree_limit=model_eval.best_ntree_limit).reshape(-1, 1)

[0]	train-mae:0.72620	eval-mae:0.31547
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 500 rounds.
[500]	train-mae:0.05033	eval-mae:0.15396
[1000]	train-mae:0.03889	eval-mae:0.15277
[1500]	train-mae:0.03199	eval-mae:0.15218
Stopping. Best iteration:
[1491]	train-mae:0.03210	eval-mae:0.15214



In [8]:
# Hold-out mean squared error of the early-stopped model.
mse = mean_squared_error(val_y, val_pred)
print("mse_score:", mse)
# Short score tag: the first five fractional digits of the MSE. Formatting in
# fixed-point first keeps the tag meaningful when the float would otherwise
# print in scientific notation (e.g. 5e-05) or has several integer digits,
# both of which broke the previous `str(mse)[2:7]` slice.
print("mse_score:", '{:.10f}'.format(mse).split('.')[1][:5])

mse_score: 0.03916418754601699
mse_score: 03916


In [9]:
# Gain-based importance per feature from the early-stopped model.
# NOTE(review): per the XGBoost docs, features never used in a split are
# omitted from this mapping, so they cannot appear in any list derived from it.
feat_imp_dict = model_eval.get_score(
    importance_type='gain',
)

In [10]:
# Tabulate importances, highest gain first, one row per feature.
feat_imp = (
    pd.Series(feat_imp_dict)
    .sort_values(ascending=False)
    .rename_axis('feature')
    .reset_index(name='importance')
)

# Share of total gain per feature, and the running total down the ranking.
total_importance = feat_imp['importance'].sum()
feat_imp['normalized_importance'] = feat_imp['importance'] / total_importance
feat_imp['cumulative_importance'] = feat_imp['normalized_importance'].cumsum()

# Optional export, left disabled:
# feat_imp.to_csv('../input/feat_imp.csv', index=False)

In [11]:
# Features in the tail of the ranking, i.e. those lying beyond the point where
# the cumulative share of gain exceeds 98%, are marked for removal.
beyond_cutoff = feat_imp['cumulative_importance'] > 0.98
record_low_importance = feat_imp.loc[beyond_cutoff]
to_drop = record_low_importance['feature'].tolist()
print(to_drop)

['outdoorTemp_diff_3D_rolling_skew', 'indoorHum_50_bin_outdoorHum_skew', 'indoorHum_50_bin_indoorAtmo_mean', 'outdoorHum_100_bin_indoorAtmo_mean', 'MDH_indoorAtmo_mean_MDH_outdoorHum_diff_sum_subtract', 'outdoorAtmo_100_bin_outdoorHum_min', 'outdoorTemp_diff_expanding_mean', 'outdoorHum_100_bin_indoorAtmo_skew', 'MDH_outdoorAtmo_diff_sum_MDH_outdoorHum_diff_min_multyply', 'MDH_outdoorHum_diff_sum_MDH_indoorAtmo_diff_sum_multyply', 'indoorHum_50_bin_outdoorTemp_sum', 'MD_outdoorHum_diff_min_MDH_outdoorHum_diff_sum_multyply', 'indoorAtmo_100_bin_outdoorHum_std', 'indoorAtmo_MD_indoorHum_max_ratio', 'outdoorHum_20_bin_outdoorHum_skew', 'MDH_outdoorHum_diff_sum_MD_outdoorHum_diff_min_multyply', 'indoorHum_20_bin_outdoorAtmo_std', 'indoorHum_50_bin_indoorAtmo_min', 'indoorAtmo_diff_expanding_skew', 'MD_outdoorAtmo_diff_sum_MDH_outdoorHum_diff_sum_multyply', 'indoorHum_100_bin_outdoorHum_sum', 'outdoorTemp_3D_rolling_std', 'outdoorHum_50_bin_indoorAtmo_skew', 'outdoorAtmo_MD_indoorHum_max_ra

In [12]:
# Remove the low-importance features from both matrices.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: the in-place version raised KeyError on a second run
# because the columns were already gone.
x_train = x_train.drop(columns=to_drop, errors='ignore')
x_test = x_test.drop(columns=to_drop, errors='ignore')

In [13]:
# Shapes after pruning: both frames must still have matching column counts.
tuple(frame.shape for frame in (x_train, x_test))

((19338, 2127), (406, 2127))

In [14]:
# Final fit on every training row using the pruned feature set.
train_all_matrix = xgb.DMatrix(x_train, label=y_train, missing=np.nan)
# The test set has no labels. The previous `label=val_y` attached the
# validation-split targets, which have a different row count, to the test rows.
test_matrix = xgb.DMatrix(x_test, missing=np.nan)

# Train for the round count found by early stopping plus a margin of 20, since
# this fit sees more rows than the evaluation run did.
model = xgb.train(params,
                  train_all_matrix,
                  num_boost_round=model_eval.best_ntree_limit + 20)

# This model was trained without early stopping, so every tree is used; passing
# `ntree_limit=model.best_ntree_limit` was redundant.
# NOTE(review): that attribute may not exist on such a model in newer XGBoost
# releases -- confirm against the installed version.
test_pred = model.predict(test_matrix).reshape(-1, 1)

In [15]:
# The model output is added to the outdoor temperature to get the submitted
# value. NOTE(review): presumably the target was stored as an offset from
# outdoorTemp -- confirm against how y_train was built.
sub['temperature'] = test_pred[:, 0] + test_df['outdoorTemp'].values

# First five fractional digits of the hold-out MSE, formatted in fixed-point so
# the tag stays meaningful when the float would print in scientific notation.
score_tag = '{:.10f}'.format(mse).split('.')[1][:5]

# Create the output directory up front so a missing folder cannot discard the
# predictions after the long training run.
out_dir = '../sub'
os.makedirs(out_dir, exist_ok=True)
sub.to_csv('{}/sub_{}_{}.csv'.format(out_dir, time.strftime('%Y%m%d'), score_tag), index=False)