In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import gc
import xgboost as xgb
import math
from joblib import Parallel, delayed

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings

# Silence library warnings so that the long training logs stay readable.
warnings.filterwarnings('ignore')

# Show every column/row when a DataFrame is displayed.
# The fully qualified 'display.*' keys are used because the bare 'max_columns' /
# 'max_rows' patterns are ambiguous on pandas versions that also define
# 'styler.render.max_columns' / 'styler.render.max_rows' (OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def correlation(df, threshold=0.98):
    """
    Find columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``df`` is computed with ``df.corr()``.
    A column is reported when the absolute correlation between it and at
    least one column positioned before it is strictly greater than
    ``threshold``. The first column of every correlated pair is therefore
    kept and only the later one is reported.

    @param df: DataFrame whose columns are compared.
    @param threshold: absolute-correlation cut-off (strict ``>``).
    @return: set with the names of the columns to discard.
    """
    corr_matrix = df.corr()
    # Strictly-lower triangle == "column i versus every earlier column j < i".
    # NaN correlations (e.g. constant columns) compare as False and are ignored.
    above = np.abs(corr_matrix.to_numpy()) > threshold
    redundant_mask = np.tril(above, k=-1).any(axis=1)
    return set(corr_matrix.columns[redundant_mask])

In [3]:
# Read the raw train/test tables from their HDF5 files.
train_df, test_df = (
    pd.read_hdf(h5_path)
    for h5_path in ('../../input/train.h5', '../../input/test.h5')
)
# Submission skeleton: one-column frame holding the test 'time' values.
sub = test_df['time'].to_frame()

In [4]:
# Keep only the training rows that have a target value.
train_df = train_df[train_df['temperature'].notnull()]
# Back-fill remaining gaps with the next valid observation in each column.
# `DataFrame.bfill()` is the direct equivalent of the deprecated
# `fillna(method='bfill')` spelling.
train_df = train_df.bfill()
test_df = test_df.bfill()
gc.collect()

64

In [5]:
# Rename the columns positionally. The train frame uses the same names as the
# test frame plus the trailing target column 'temperature'.
test_df.columns = [
    'time', 'year', 'month', 'day', 'hour', 'min', 'sec',
    'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
    'indoorHum', 'indoorAtmo',
]
train_df.columns = list(test_df.columns) + ['temperature']

In [6]:
print('train_df.shape: ', train_df.shape)
# Keep only the training rows whose outdoorTemp lies inside the closed
# [min, max] range observed in the test set.
train_df = train_df[
    train_df['outdoorTemp'].between(test_df['outdoorTemp'].min(),
                                    test_df['outdoorTemp'].max())
]
print('处理后 train_df.shape: ', train_df.shape)

train_df.shape:  (24807, 13)
处理后 train_df.shape:  (19338, 13)


In [7]:
# Number of training rows; used later to split the merged feature table.
train_count = len(train_df)
# The model learns the offset between the target temperature and outdoorTemp.
y_train = (train_df['temperature'] - train_df['outdoorTemp']).values
# Kept so the offset prediction can be converted back to a temperature.
test_temp = test_df['outdoorTemp'].values

# The raw frames are no longer needed once the arrays are extracted.
del train_df, test_df
gc.collect()

60

In [8]:
# Load the three precomputed feature tables from HDF5.
rolling, agg, combine = (
    pd.read_hdf(h5_path)
    for h5_path in (
        '../../input/features/rolling.h5',
        '../../input/features/agg.h5',
        '../../input/features/combine.h5',
    )
)

In [9]:
# Remove the calendar columns from `combine` before it is merged.
combine = combine.drop(columns=['month', 'hour', 'min'])
gc.collect()

150

In [10]:
# One shape per line: rolling, agg, combine.
print(rolling.shape, agg.shape, combine.shape, sep='\n')

(19744, 46)
(19744, 209)
(25213, 1561)


In [11]:
# Row count the merged table has to keep: it is later split into train/test
# purely by position, so a silently dropped or duplicated 'time' key would
# misalign the features with the targets.
expected_rows = rolling.shape[0]

# Join the feature tables on 'time'; free each input as soon as it is merged.
data_df = pd.merge(rolling, agg, on='time', how='inner')
del rolling, agg
gc.collect()

data_df = pd.merge(data_df, combine, on='time', how='inner')
del combine

assert data_df.shape[0] == expected_rows, (
    'merge on "time" changed the row count: expected {}, got {}'.format(
        expected_rows, data_df.shape[0])
)
gc.collect()

0

In [12]:
# Display the (rows, columns) of the merged feature table.
data_df.shape

(19744, 1814)

In [13]:
# Positional split: the first `train_count` rows are the training set and the
# remaining rows are the test set. Both parts get a fresh 0-based index.
train_df = data_df.iloc[:train_count].reset_index(drop=True).copy()
test_df = data_df.iloc[train_count:].reset_index(drop=True).copy()

del data_df
gc.collect()

40

In [14]:
# One shape per line: train, then test.
print(train_df.shape, test_df.shape, sep='\n')

(19338, 1814)
(406, 1814)


In [15]:
# Columns that are identifiers rather than model inputs.
drop_columns = ['time']

# Feature matrices for training and prediction.
x_train, x_test = (
    frame.drop(columns=drop_columns) for frame in (train_df, test_df)
)

del train_df, test_df
gc.collect()

40

In [16]:
# Collect the training columns whose absolute correlation with an earlier
# column exceeds 0.98, then show them.
col_corr = correlation(x_train, 0.98)
print(col_corr)

{'MDH_indoorHum_diff_min*MD_indoorHum_diff_sum', 'MDH_outdoorTemp_diff_sum/MDH_indoorAtmo_diff_sum', 'indoorAtmo*MDH_indoorAtmo_medi', 'MD_outdoorAtmo_diff_sum*MDH_indoorAtmo_diff_sum', 'MDH_outdoorTemp_diff_medi*MDH_outdoorTemp_diff_sum', 'indoorHum', 'MDH_outdoorTemp_diff_mean/MDH_outdoorAtmo_diff_min'}


In [17]:
# Number of columns flagged as highly correlated.
print(len(col_corr))

7


In [18]:
# Remove the highly correlated columns from both feature matrices.
x_train = x_train.drop(columns=list(col_corr))
x_test = x_test.drop(columns=list(col_corr))
gc.collect()

40

In [19]:
# Hold-out split by position: the first 80% of the rows train the model, the
# last 20% validate it (no shuffling).
nums = int(x_train.shape[0] * 0.8)

trn_x, val_x = x_train[:nums], x_train[nums:]
trn_y, val_y = y_train[:nums], y_train[nums:]

train_matrix = xgb.DMatrix(trn_x, label=trn_y, missing=np.nan)
valid_matrix = xgb.DMatrix(val_x, label=val_y, missing=np.nan)
train_all_matrix = xgb.DMatrix(x_train, y_train, missing=np.nan)
# The test set has no targets: it must not be given the validation labels,
# which belong to other rows and have a different length.
test_matrix = xgb.DMatrix(x_test, missing=np.nan)

# Booster hyper-parameters, shared by the hold-out model and the final model.
params = {
    'booster': 'gbtree',
    'eval_metric': 'rmse',
    'min_child_weight': 5,
    'max_depth': 8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'eta': 0.01,
    'seed': 2020,
    'nthread': 36,
    'silent': 1
}

# The last entry of the watchlist ('eval') drives early stopping.
watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

# Train with a large round budget and stop once the validation RMSE has not
# improved for 1000 rounds; progress is logged every 500 rounds.
model_eval = xgb.train(params,
                       train_matrix,
                       num_boost_round=50000,
                       evals=watchlist,
                       verbose_eval=500,
                       early_stopping_rounds=1000)
# Predict the hold-out rows with the best iteration only, as a column vector.
val_pred = model_eval.predict(valid_matrix, ntree_limit=model_eval.best_ntree_limit).reshape(-1, 1)

[0]	train-rmse:1.00065	eval-rmse:0.41772
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 1000 rounds.
[500]	train-rmse:0.07173	eval-rmse:0.20304
[1000]	train-rmse:0.05227	eval-rmse:0.20381
Stopping. Best iteration:
[417]	train-rmse:0.08016	eval-rmse:0.20286



In [20]:
# Mean squared error of the hold-out predictions.
mse = mean_squared_error(val_y, val_pred)
print("mse_score:", mse)
# First five decimals as a compact tag. Fixed-point formatting is used instead
# of str(mse) so that a very small score (which str() renders in scientific
# notation, e.g. '5e-05') still yields digits. Assumes 0 <= mse < 1.
print("mse_score:", '{:.15f}'.format(mse)[2:7])

mse_score: 0.041154087522244774
mse_score: 04115


In [21]:
# Mean absolute error of the hold-out predictions.
mae = mean_absolute_error(val_y, val_pred)
print("mae_score:", mae)
# First five decimals as a compact tag. Fixed-point formatting is used instead
# of str(mae) so that a very small score (which str() renders in scientific
# notation) still yields digits. Assumes 0 <= mae < 1.
print("mae_score:", '{:.15f}'.format(mae)[2:7])

mae_score: 0.16277104511533425
mae_score: 16277


In [22]:
# Per-feature importance of the hold-out model, measured by 'gain'.
feat_imp_dict = model_eval.get_score(importance_type='gain')

In [23]:
# Build an importance table over EVERY feature the hold-out model was trained
# on. `get_score` only reports features that were used in at least one split,
# so the never-used ones are added back with importance 0; otherwise they
# would escape the low-importance filter below.
feat_imp = (
    pd.Series(feat_imp_dict, dtype='float64')
    .reindex(train_matrix.feature_names, fill_value=0.0)
    .sort_values(ascending=False, kind='mergesort')  # stable order for ties
    .rename_axis('feature')
    .reset_index(name='importance')
)
feat_imp['normalized_importance'] = feat_imp['importance'] / feat_imp['importance'].sum()
feat_imp['cumulative_importance'] = feat_imp['normalized_importance'].cumsum()
# Features lying beyond the first 95% of cumulative importance are discarded.
record_low_importance = feat_imp[feat_imp['cumulative_importance'] > 0.95]

to_drop = list(record_low_importance['feature'])
print(to_drop)

['outdoorTemp*MD_outdoorTemp_max', 'MDH_indoorHum_diff_sum+MD_outdoorAtmo_diff_sum', 'MDH_indoorAtmo_sum*MD_indoorHum_sum', 'outdoorHum*MDH_outdoorHum_medi', 'MDH_outdoorHum_diff_sum*MD_outdoorTemp_diff_sum', 'MDH_indoorAtmo_diff_sum*MDH_indoorHum_diff_sum', 'outdoorTemp*MDH_indoorAtmo_diff_sum', 'outdoorHum_20_bin_outdoorTemp_skew', 'indoorHum_20_bin_outdoorAtmo_min', 'outdoorTemp*MDH_outdoorTemp_diff_sum', 'indoorHum_20_bin_outdoorHum_sum', 'indoorHum_50_bin_outdoorHum_min', 'indoorHum*MDH_outdoorTemp_diff_sum', 'indoorHum*MD_outdoorAtmo_diff_sum', 'MDH_outdoorHum_diff_min*MDH_indoorHum_diff_min', 'MD_outdoorTemp_diff_sum*MDH_outdoorTemp_diff_sum', 'MD_outdoorTemp_diff_mean*MD_outdoorAtmo_diff_mean', 'MDH_outdoorAtmo_diff_sum/MD_outdoorAtmo_diff_sum', 'MDH_outdoorHum_diff_mean*MDH_indoorHum_diff_mean', 'MDH_indoorAtmo_diff_mean*MDH_outdoorTemp_diff_mean', 'outdoorTemp_20_bin_outdoorAtmo_skew', 'MDH_indoorHum_diff_sum-MDH_outdoorHum_diff_sum', 'indoorHum_20_bin_outdoorHum_std', 'indoo

In [24]:
# Remove the low-importance features from both feature matrices.
x_train = x_train.drop(columns=to_drop)
x_test = x_test.drop(columns=to_drop)

In [25]:
# Rebuild the matrices on the reduced feature set.
train_all_matrix = xgb.DMatrix(x_train, y_train, missing=np.nan)
# The test set has no targets: it must not be given the validation labels,
# which belong to other rows and have a different length.
test_matrix = xgb.DMatrix(x_test, missing=np.nan)

# Refit on all training rows for a fixed number of rounds: the best round
# count found on the hold-out split plus 20 extra rounds.
model = xgb.train(params,
                  train_all_matrix,
                  num_boost_round=model_eval.best_ntree_limit + 20)

# No early stopping was used for this model, so every tree is used for
# prediction; the result is shaped as a column vector.
test_pred = model.predict(test_matrix).reshape(-1, 1)

In [26]:
# The model predicts (temperature - outdoorTemp); add outdoorTemp back.
sub['temperature'] = test_pred[:, 0] + test_temp

# Make sure the output directory exists before writing.
os.makedirs('../../sub', exist_ok=True)
# First five decimals of the hold-out MSE as a file-name tag. Fixed-point
# formatting avoids the scientific notation str(mse) produces for very small
# values. Assumes 0 <= mse < 1.
score_tag = '{:.15f}'.format(mse)[2:7]
sub.to_csv('../../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), score_tag), index=False)