In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import gc
import xgboost as xgb
import math
from joblib import Parallel, delayed

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings

# Silence library warnings so the long training logs stay readable.
warnings.filterwarnings('ignore')

# Show every column and row when a frame is displayed.
# The fully qualified 'display.*' keys are used on purpose: the short patterns
# 'max_columns' / 'max_rows' match more than one option on newer pandas
# releases and make set_option raise an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# train_df = pd.read_hdf('../input/train.h5')
# Load the raw test frame and back-fill missing values (each gap takes the
# next valid observation below it).
# DataFrame.bfill() replaces fillna(method='bfill', inplace=True): the
# `method` argument is deprecated in recent pandas, and rebinding instead of
# mutating in place keeps the cell safe to re-run.
test_df = pd.read_hdf('../input/test.h5')
test_df = test_df.bfill()
# Submission skeleton: only the 'time' column for now; the prediction column
# is added once the final model has been scored.
sub = pd.DataFrame(test_df['time'])
gc.collect()

11

In [3]:
# train_df.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec', 'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
#                     'indoorHum', 'indoorAtmo', 'temperature']
# Assign the working column names to the test frame: timestamp parts first,
# then the outdoor readings, then the indoor readings.
test_df.columns = (
    ['time', 'year', 'month', 'day', 'hour', 'min', 'sec']
    + ['outdoorTemp', 'outdoorHum', 'outdoorAtmo']
    + ['indoorHum', 'indoorAtmo']
)

In [4]:
# Sanity check: count the missing outdoorTemp values left after back-filling.
test_df['outdoorTemp'].isna().sum()

0

In [5]:
# Load the precomputed train/test feature matrices and the training target,
# in that order.
x_train, x_test, y_train = map(
    pd.read_hdf,
    (
        '../input/train_features_corr.h5',
        '../input/test_features_corr.h5',
        '../input/y_train.h5',
    ),
)

In [6]:
# (rows, columns) of the train and test feature matrices.
tuple(frame.shape for frame in (x_train, x_test))

((19338, 2557), (406, 2557))

In [7]:
# Booster configuration shared by the validation run and the final fit.
params = {
    'booster': 'gbtree',
    'eval_metric': 'mae',
    'min_child_weight': 5,
    'max_depth': 8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'eta': 0.01,
    'seed': 2020,
    # 'njobs' is not a parameter the native xgboost API recognises, so it was
    # silently ignored; 'nthread' is the native name (-1 = use all cores).
    'nthread': -1,
    # 'silent' is deprecated; 'verbosity': 0 is its documented replacement.
    'verbosity': 0
}

In [8]:
# Ordered hold-out split: the first 80% of rows train the model and the last
# 20% are used for validation / early stopping (no shuffling).
nums = int(x_train.shape[0] * 0.8)

trn_x, trn_y, val_x, val_y = x_train[:nums], y_train[:nums], x_train[nums:], y_train[nums:]

train_matrix = xgb.DMatrix(trn_x, label=trn_y, missing=np.nan)
valid_matrix = xgb.DMatrix(val_x, label=val_y, missing=np.nan)
# Full-data and test matrices. They are not consumed in this cell and are
# rebuilt further down once the feature set has been pruned.
train_all_matrix = xgb.DMatrix(x_train, label=y_train, missing=np.nan)
# The test rows have no target, so no label is attached. The validation labels
# that used to be passed here belong to different rows and have a different
# length than x_test.
test_matrix = xgb.DMatrix(x_test, missing=np.nan)

watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

# Train with early stopping on the last watchlist entry ('eval').
model_eval = xgb.train(params,
                       train_matrix,
                       num_boost_round=50000,
                       evals=watchlist,
                       verbose_eval=500,
                       early_stopping_rounds=500)

# Score the hold-out rows using only the trees up to the best round.
# Newer xgboost releases take `iteration_range` (and dropped `ntree_limit`);
# older ones reject the unknown keyword with a TypeError, in which case the
# legacy argument is used instead.
try:
    val_pred = model_eval.predict(valid_matrix,
                                  iteration_range=(0, model_eval.best_iteration + 1))
except TypeError:
    val_pred = model_eval.predict(valid_matrix, ntree_limit=model_eval.best_ntree_limit)
val_pred = val_pred.reshape(-1, 1)

[0]	train-mae:0.72620	eval-mae:0.31621
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 500 rounds.
[500]	train-mae:0.04913	eval-mae:0.15253
[1000]	train-mae:0.03814	eval-mae:0.15209
[1500]	train-mae:0.03132	eval-mae:0.15193
Stopping. Best iteration:
[1319]	train-mae:0.03360	eval-mae:0.15176



In [9]:
# Mean squared error of the hold-out predictions.
mse = mean_squared_error(val_y, val_pred)
print("mse_score:", mse)
# Compact tag for file names: the MSE in units of 1e-5, truncated and
# zero-padded to at least five digits (0.0378163... -> '03781').
# Slicing str(mse) produced garbage whenever the float printed in scientific
# notation (very small values) or had a non-zero integer part.
mse_tag = '{:05d}'.format(int(mse * 1e5))
print("mse_score:", mse_tag)

mse_score: 0.037816330069916776
mse_score: 03781


In [10]:
# Per-feature importance of the validation model, measured by gain, as a
# mapping from feature name to score.
# NOTE(review): get_score appears to list only features that were used in at
# least one split, so never-used features would be absent from any ranking
# built from this dict — confirm against the xgboost documentation.
feat_imp_dict = model_eval.get_score(importance_type='gain')

In [11]:
# Rank features by importance (highest first) in a two-column frame:
# 'feature' holds the name, 'importance' the raw score.
feat_imp = (
    pd.Series(feat_imp_dict, name='importance')
    .sort_values(ascending=False)
    .rename_axis('feature')
    .reset_index()
)
# Share of the total importance per feature, and the running total of that
# share down the ranking.
feat_imp['normalized_importance'] = feat_imp['importance'].div(feat_imp['importance'].sum())
feat_imp['cumulative_importance'] = feat_imp['normalized_importance'].cumsum()

# feat_imp.to_csv('../input/feat_imp.csv', index=False)

In [12]:
# Rows of the ranking whose cumulative importance is already above 0.90;
# their feature names are the columns to discard.
record_low_importance = feat_imp.loc[feat_imp['cumulative_importance'] > 0.90]
to_drop = record_low_importance['feature'].tolist()
print(to_drop)

['MD_outdoorHum_medi_MD_indoorHum_mean_multyply', 'MD_outdoorTemp_diff_sum_MDH_outdoorTemp_diff_min_multyply', 'MDH_outdoorAtmo_mean_MD_outdoorTemp_diff_min_subtract', 'MD_outdoorHum_min_MDH_outdoorAtmo_diff_sum_multyply', 'MDH_outdoorHum_diff_max_MDH_indoorAtmo_diff_sum_ratio', 'MD_outdoorHum_diff_sum_MDH_indoorHum_diff_sum_ratio', 'outdoorTemp_200_bin_indoorAtmo_skew', 'outdoorHum_3D_rolling_min', 'MDH_indoorHum_diff_mean_MDH_outdoorTemp_diff_mean_multyply', 'outdoorHum_MDH_outdoorAtmo_medi_subtract', 'MD_outdoorHum_diff_max_MDH_indoorHum_diff_sum_multyply', 'MDH_indoorHum_diff_medi_MDH_indoorHum_diff_min_subtract', 'MDH_indoorHum_diff_sum_MDH_indoorAtmo_min_multyply', 'MD_outdoorTemp_diff_max_MD_indoorAtmo_diff_sum_ratio', 'MDH_outdoorHum_diff_sum_MD_outdoorAtmo_diff_sum_ratio', 'outdoorTemp_20_bin_outdoorAtmo_std', 'MD_outdoorHum_sum_MDH_outdoorAtmo_diff_mean_ratio', 'MDH_indoorHum_diff_mean_MD_outdoorTemp_diff_medi_multyply', 'MDH_outdoorTemp_diff_sum_MD_outdoorHum_diff_min_multyp

In [13]:
# Remove the selected low-importance columns from both feature matrices.
# The frames are rebound instead of mutated in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell is a no-op
# rather than a KeyError.
x_train = x_train.drop(columns=to_drop, errors='ignore')
x_test = x_test.drop(columns=to_drop, errors='ignore')

In [14]:
# (rows, columns) of the train and test feature matrices after the drop.
tuple(frame.shape for frame in (x_train, x_test))

((19338, 2003), (406, 2003))

In [15]:
# Rebuild the matrices from the pruned feature set.
train_all_matrix = xgb.DMatrix(x_train, label=y_train, missing=np.nan)
# The test rows have no target, so no label is attached. The validation labels
# that used to be passed here belong to different rows and have a different
# length than x_test.
test_matrix = xgb.DMatrix(x_test, missing=np.nan)

# Refit on all training rows for a fixed number of rounds: the round count
# chosen by early stopping (best_iteration is zero-based, hence the +1) plus
# the same 20 extra rounds as before. best_iteration is used because
# best_ntree_limit is not available on newer xgboost releases.
model = xgb.train(params,
                  train_all_matrix,
                  num_boost_round=model_eval.best_iteration + 1 + 20)

# This model was trained without early stopping, so every tree is used for
# prediction; no tree limit needs to be passed.
test_pred = model.predict(test_matrix).reshape(-1, 1)

In [17]:
# The submitted temperature is the model output added to the outdoor
# temperature of the same row.
sub['temperature'] = test_pred[:, 0] + test_df['outdoorTemp'].values

# File-name tag: the validation MSE in units of 1e-5, truncated and
# zero-padded to at least five digits. Slicing str(mse) produced garbage when
# the float printed in scientific notation or had a non-zero integer part.
mse_tag = '{:05d}'.format(int(mse * 1e5))

# Make sure the output directory exists before writing.
os.makedirs('../sub', exist_ok=True)
sub.to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), mse_tag), index=False)