In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import datetime
import time

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sub = pd.DataFrame(test['time'])

In [3]:
train.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec',
                 'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                 'indoorHum', 'indoorAtmo', 'temperature']
test.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec',
                'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                'indoorHum', 'indoorAtmo']

In [4]:
train = train[train['temperature'].notnull()]
train = train.sort_values(by='time')
test = test.sort_values(by='time')

In [5]:
train.head().append(train.tail())

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature
0,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4
1,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4
2,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4
3,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4
4,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3
25492,1554224151,2019,4,3,0,55,51,17.3,78.0,983.8,75.0,982.2,17.4
25493,1554224212,2019,4,3,0,56,52,17.3,77.0,983.4,75.0,982.5,17.4
25494,1554224272,2019,4,3,0,57,52,17.3,77.0,983.1,75.0,982.2,17.4
25495,1554224332,2019,4,3,0,58,52,17.3,77.0,983.3,75.0,982.5,17.4
25496,1554224391,2019,4,3,0,59,51,17.2,77.0,982.9,75.0,982.4,17.4


In [6]:
test.head(5).append(test.tail())

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo
0,1554224413,2019,4,3,1,0,13,15.3,91.0,988.0,88.0,988.3
1,1554226217,2019,4,3,1,30,17,15.4,91.0,987.8,88.0,988.0
2,1554228020,2019,4,3,2,0,20,15.3,92.0,988.0,89.0,988.3
3,1554229823,2019,4,3,2,30,23,15.2,92.0,988.1,89.0,988.7
4,1554231625,2019,4,3,3,0,25,15.5,91.0,987.9,88.0,987.9
401,1555081072,2019,4,12,22,57,52,21.9,66.0,980.8,66.0,978.6
402,1555082935,2019,4,12,23,28,55,21.6,66.0,981.7,65.0,979.6
403,1555084737,2019,4,12,23,58,57,22.1,65.0,979.8,65.0,980.5
404,1555086487,2019,4,13,0,28,7,26.7,44.0,975.9,48.0,976.8
405,1555088229,2019,4,13,0,57,9,27.1,43.0,975.7,47.0,976.8


In [7]:
train.fillna(method='bfill', inplace=True)
test.fillna(method='bfill', inplace=True)

data = pd.concat([train, test])

In [8]:
data.head()

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature
0,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4
1,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4
2,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4
3,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4
4,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3


In [9]:
data['datetime'] = data.apply(lambda x: datetime(x['year'].astype(int), x['month'].astype(int), x['day'].astype(int), x['hour'].astype(int), x['min'].astype(int), x['sec'].astype(int)), axis=1)

In [10]:
data.head()

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature,datetime
0,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4,2019-03-14 01:00:43
1,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4,2019-03-14 01:01:43
2,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4,2019-03-14 01:02:45
3,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4,2019-03-14 01:03:44
4,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3,2019-03-14 01:04:42


In [11]:
numerical_features = ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']

In [12]:
# data['indoorHum-outdoorHum'] = data['indoorHum'] - data['outdoorHum']
# data['indoorAtmo-outdoorAtmo'] = data['indoorAtmo'] - data['outdoorAtmo']

In [13]:
# data.set_index('datetime', inplace=True)

# for i in numerical_features:
#     for j in ['60s', '90s', '120s']:
#         data['{}_{}_mean'.format(i, j)] = data[i].rolling(j).mean()

In [14]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'min_child_weight': 5,
    'num_leaves': 2 ** 8,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'learning_rate': 0.01,
    'seed': 2020
}

In [15]:
time_cols = ['year', 'month', 'day', 'hour', 'min', 'sec'] + ['time', 'datetime']

In [16]:
train_cols = [i for i in data.columns if i not in time_cols]
y = 'temperature'

train = data[data['temperature'].notnull()]
test = data[data['temperature'].isnull()]

train_train = train.loc[train['datetime'] < '2019-03-30 00:00:00', :]
train_valid = train.loc[train['datetime'] >= '2019-03-30 00:00:00', :]
print('train_train.shape: ', train_train.shape)
print('train_valid.shape: ', train_valid.shape)

X_train = train_train[train_cols]
y_train = train_train[y]
X_valid = train_valid[train_cols]
y_valid = train_valid[y]

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
train_all = lgb.Dataset(train[train_cols], label=train[y], reference=train_data)

train_train.shape:  (19537, 14)
train_valid.shape:  (5270, 14)


In [17]:
train_cols

['outdoorTemp',
 'outdoorHum',
 'outdoorAtmo',
 'indoorHum',
 'indoorAtmo',
 'temperature']

In [18]:
model_valid = lgb.train(params,
                        train_data,
                        valid_sets=[train_data, valid_data],
                        num_boost_round=100000,
                        early_stopping_rounds=200,
                        verbose_eval=200)

Training until validation scores don't improve for 200 rounds
[200]	training's l2: 0.310361	valid_1's l2: 0.224557
[400]	training's l2: 0.00981576	valid_1's l2: 0.188822
[600]	training's l2: 0.00267759	valid_1's l2: 0.181695
[800]	training's l2: 0.00176421	valid_1's l2: 0.171019
[1000]	training's l2: 0.00132552	valid_1's l2: 0.163552
[1200]	training's l2: 0.00105565	valid_1's l2: 0.157784
[1400]	training's l2: 0.000869325	valid_1's l2: 0.153316
[1600]	training's l2: 0.000731895	valid_1's l2: 0.149622
[1800]	training's l2: 0.000625256	valid_1's l2: 0.146572
[2000]	training's l2: 0.000547167	valid_1's l2: 0.144107
[2200]	training's l2: 0.000482866	valid_1's l2: 0.141842
[2400]	training's l2: 0.000430032	valid_1's l2: 0.140011
[2600]	training's l2: 0.000387503	valid_1's l2: 0.138501
[2800]	training's l2: 0.000349941	valid_1's l2: 0.136701
[3000]	training's l2: 0.000317939	valid_1's l2: 0.135262
[3200]	training's l2: 0.000291199	valid_1's l2: 0.133902
[3400]	training's l2: 0.000267856	vali

[28600]	training's l2: 9.99524e-06	valid_1's l2: 0.106769
[28800]	training's l2: 9.89046e-06	valid_1's l2: 0.106737
[29000]	training's l2: 9.78783e-06	valid_1's l2: 0.106676
[29200]	training's l2: 9.68768e-06	valid_1's l2: 0.106646
[29400]	training's l2: 9.59065e-06	valid_1's l2: 0.106608
[29600]	training's l2: 9.49431e-06	valid_1's l2: 0.106572
[29800]	training's l2: 9.39652e-06	valid_1's l2: 0.106538
[30000]	training's l2: 9.30679e-06	valid_1's l2: 0.106515
[30200]	training's l2: 9.21774e-06	valid_1's l2: 0.106468
[30400]	training's l2: 9.12899e-06	valid_1's l2: 0.106439
[30600]	training's l2: 9.04051e-06	valid_1's l2: 0.106403
[30800]	training's l2: 8.95409e-06	valid_1's l2: 0.106363
[31000]	training's l2: 8.87154e-06	valid_1's l2: 0.106331
[31200]	training's l2: 8.78856e-06	valid_1's l2: 0.106302
[31400]	training's l2: 8.70571e-06	valid_1's l2: 0.106271
[31600]	training's l2: 8.62332e-06	valid_1's l2: 0.106225
[31800]	training's l2: 8.54263e-06	valid_1's l2: 0.106193
[32000]	traini

[57000]	training's l2: 4.08218e-06	valid_1's l2: 0.103778
[57200]	training's l2: 4.06701e-06	valid_1's l2: 0.103769
[57400]	training's l2: 4.05233e-06	valid_1's l2: 0.103759
[57600]	training's l2: 4.03826e-06	valid_1's l2: 0.103752
[57800]	training's l2: 4.02404e-06	valid_1's l2: 0.103736
[58000]	training's l2: 4.0107e-06	valid_1's l2: 0.103728
[58200]	training's l2: 3.99698e-06	valid_1's l2: 0.103718
[58400]	training's l2: 3.98365e-06	valid_1's l2: 0.103702
[58600]	training's l2: 3.96933e-06	valid_1's l2: 0.103696
[58800]	training's l2: 3.95555e-06	valid_1's l2: 0.103686
[59000]	training's l2: 3.9427e-06	valid_1's l2: 0.103674
[59200]	training's l2: 3.92901e-06	valid_1's l2: 0.103664
[59400]	training's l2: 3.91567e-06	valid_1's l2: 0.103659
[59600]	training's l2: 3.90296e-06	valid_1's l2: 0.10365
[59800]	training's l2: 3.88871e-06	valid_1's l2: 0.103641
[60000]	training's l2: 3.87569e-06	valid_1's l2: 0.103637
[60200]	training's l2: 3.86302e-06	valid_1's l2: 0.103622
[60400]	training'

In [19]:
pred_valid = model_valid.predict(X_valid)

mse = int(np.round(mean_squared_error(y_valid, pred_valid), 5) * 10000)
mse

1035

In [20]:
model = lgb.train(params,
                  train_all,
                  num_boost_round=model_valid.best_iteration + 20)

In [21]:
pred = model.predict(test[train_cols])

In [22]:
sub['temperature'] = pred
sub.to_csv('../sub/sub{}{}.csv'.format(mse, time.strftime('%Y%m%d')), index=False)