In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sub = pd.DataFrame(test['time'])

In [3]:
train.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec',
                 'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                 'indoorHum', 'indoorAtmo', 'temperature']
test.columns = ['time', 'year', 'month', 'day', 'hour', 'min', 'sec',
                'outdoorTemp', 'outdoorHum', 'outdoorAtmo',
                'indoorHum', 'indoorAtmo']

In [4]:
train = train[train['temperature'].notnull()]
train = train.sort_values(by='time')
test = test.sort_values(by='time')

In [5]:
train.head().append(train.tail())

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature
0,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4
1,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4
2,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4
3,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4
4,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3
25492,1554224151,2019,4,3,0,55,51,17.3,78.0,983.8,75.0,982.2,17.4
25493,1554224212,2019,4,3,0,56,52,17.3,77.0,983.4,75.0,982.5,17.4
25494,1554224272,2019,4,3,0,57,52,17.3,77.0,983.1,75.0,982.2,17.4
25495,1554224332,2019,4,3,0,58,52,17.3,77.0,983.3,75.0,982.5,17.4
25496,1554224391,2019,4,3,0,59,51,17.2,77.0,982.9,75.0,982.4,17.4


In [6]:
test.head(5).append(test.tail())

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo
0,1554224413,2019,4,3,1,0,13,15.3,91.0,988.0,88.0,988.3
1,1554226217,2019,4,3,1,30,17,15.4,91.0,987.8,88.0,988.0
2,1554228020,2019,4,3,2,0,20,15.3,92.0,988.0,89.0,988.3
3,1554229823,2019,4,3,2,30,23,15.2,92.0,988.1,89.0,988.7
4,1554231625,2019,4,3,3,0,25,15.5,91.0,987.9,88.0,987.9
401,1555081072,2019,4,12,22,57,52,21.9,66.0,980.8,66.0,978.6
402,1555082935,2019,4,12,23,28,55,21.6,66.0,981.7,65.0,979.6
403,1555084737,2019,4,12,23,58,57,22.1,65.0,979.8,65.0,980.5
404,1555086487,2019,4,13,0,28,7,26.7,44.0,975.9,48.0,976.8
405,1555088229,2019,4,13,0,57,9,27.1,43.0,975.7,47.0,976.8


In [9]:
train.fillna(method='bfill', inplace=True)
test.fillna(method='bfill', inplace=True)

data = pd.concat([train, test])

In [10]:
data.head()

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature
0,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4
1,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4
2,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4
3,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4
4,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3


In [12]:
data['datetime'] = data.apply(lambda x: datetime(x['year'].astype(int), x['month'].astype(int), x['day'].astype(int), x['hour'].astype(int), x['min'].astype(int), x['sec'].astype(int)), axis=1)

In [13]:
data.head()

Unnamed: 0,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature,datetime
0,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4,2019-03-14 01:00:43
1,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4,2019-03-14 01:01:43
2,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4,2019-03-14 01:02:45
3,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4,2019-03-14 01:03:44
4,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3,2019-03-14 01:04:42


In [14]:
numerical_features = ['outdoorTemp', 'outdoorHum', 'outdoorAtmo', 'indoorHum', 'indoorAtmo']

In [15]:
data.set_index('datetime', inplace=True)

In [None]:
data['indoorHum-outdoorHum'] = data['indoorHum'] - data['outdoorHum']
data['indoorAtmo-outdoorAtmo'] = data['indoorAtmo'] - data['outdoorAtmo']

In [16]:
# for i in numerical_features:
#     for j in ['60s', '90s', '120s']:
#         data['{}_{}_mean'.format(i, j)] = data[i].rolling(j).mean()

In [17]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'min_child_weight': 5,
    'num_leaves': 2 ** 8,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'learning_rate': 0.1,
    'seed': 2020
}

In [18]:
time_cols = ['year', 'month', 'day', 'hour', 'min', 'sec'] + ['time', 'datetime']

In [19]:
train_cols = [i for i in data.columns if i not in time_cols]
y = 'temperature'

data.reset_index(inplace=True)

train = data[data['temperature'].notnull()]
test = data[data['temperature'].isnull()]

X_train, X_valid, y_train, y_valid = train_test_split(train[train_cols], train[y],
                                                      test_size=0.2,
                                                      random_state=2020) 
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
train_all = lgb.Dataset(train[train_cols], label=train[y], reference=train_data)

In [26]:
data.head()

Unnamed: 0,datetime,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,...,outdoorHum_120s_mean,outdoorAtmo_60s_mean,outdoorAtmo_90s_mean,outdoorAtmo_120s_mean,indoorHum_60s_mean,indoorHum_90s_mean,indoorHum_120s_mean,indoorAtmo_60s_mean,indoorAtmo_90s_mean,indoorAtmo_120s_mean
0,2019-03-14 01:00:43,1552496443,2019,3,14,1,0,43,14.6,85.0,...,85.0,993.1,993.1,993.1,80.0,80.0,80.0,992.4,992.4,992.4
1,2019-03-14 01:01:43,1552496503,2019,3,14,1,1,43,14.7,84.0,...,84.5,993.0,993.05,993.05,80.0,80.0,80.0,992.6,992.5,992.5
2,2019-03-14 01:02:45,1552496565,2019,3,14,1,2,45,14.7,84.0,...,84.0,992.7,992.85,992.85,80.0,80.0,80.0,992.8,992.7,992.7
3,2019-03-14 01:03:44,1552496624,2019,3,14,1,3,44,14.6,85.0,...,84.5,992.95,992.95,992.95,80.0,80.0,80.0,992.7,992.7,992.7
4,2019-03-14 01:04:42,1552496682,2019,3,14,1,4,42,14.6,85.0,...,84.666667,992.95,992.95,992.866667,80.0,80.0,80.0,992.6,992.6,992.666667


In [27]:
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
data.head(60)

Unnamed: 0,datetime,time,year,month,day,hour,min,sec,outdoorTemp,outdoorHum,outdoorAtmo,indoorHum,indoorAtmo,temperature,outdoorTemp_60s_mean,outdoorTemp_90s_mean,outdoorTemp_120s_mean,outdoorHum_60s_mean,outdoorHum_90s_mean,outdoorHum_120s_mean,outdoorAtmo_60s_mean,outdoorAtmo_90s_mean,outdoorAtmo_120s_mean,indoorHum_60s_mean,indoorHum_90s_mean,indoorHum_120s_mean,indoorAtmo_60s_mean,indoorAtmo_90s_mean,indoorAtmo_120s_mean
0,2019-03-14 01:00:43,1552496443,2019,3,14,1,0,43,14.6,85.0,993.1,80.0,992.4,15.4,14.6,14.6,14.6,85.0,85.0,85.0,993.1,993.1,993.1,80.0,80.0,80.0,992.4,992.4,992.4
1,2019-03-14 01:01:43,1552496503,2019,3,14,1,1,43,14.7,84.0,993.0,80.0,992.6,15.4,14.7,14.65,14.65,84.0,84.5,84.5,993.0,993.05,993.05,80.0,80.0,80.0,992.6,992.5,992.5
2,2019-03-14 01:02:45,1552496565,2019,3,14,1,2,45,14.7,84.0,992.7,80.0,992.8,15.4,14.7,14.7,14.7,84.0,84.0,84.0,992.7,992.85,992.85,80.0,80.0,80.0,992.8,992.7,992.7
3,2019-03-14 01:03:44,1552496624,2019,3,14,1,3,44,14.6,85.0,993.2,80.0,992.6,15.4,14.65,14.65,14.65,84.5,84.5,84.5,992.95,992.95,992.95,80.0,80.0,80.0,992.7,992.7,992.7
4,2019-03-14 01:04:42,1552496682,2019,3,14,1,4,42,14.6,85.0,992.7,80.0,992.6,15.3,14.6,14.6,14.633333,85.0,85.0,84.666667,992.95,992.95,992.866667,80.0,80.0,80.0,992.6,992.6,992.666667
5,2019-03-14 01:05:45,1552496745,2019,3,14,1,5,45,14.6,85.0,993.1,80.0,992.9,15.3,14.6,14.6,14.6,85.0,85.0,85.0,993.1,992.9,992.9,80.0,80.0,80.0,992.9,992.75,992.75
6,2019-03-14 01:06:44,1552496804,2019,3,14,1,6,44,14.5,85.0,993.6,80.0,409.6,15.3,14.55,14.55,14.55,85.0,85.0,85.0,993.35,993.35,993.35,80.0,80.0,80.0,701.25,701.25,701.25
7,2019-03-14 01:07:42,1552496862,2019,3,14,1,7,42,14.5,85.0,993.6,80.0,992.8,15.3,14.5,14.5,14.533333,85.0,85.0,85.0,993.6,993.6,993.433333,80.0,80.0,80.0,701.2,701.2,798.433333
8,2019-03-14 01:08:45,1552496925,2019,3,14,1,8,45,14.5,85.0,993.4,80.0,993.2,15.3,14.5,14.5,14.5,85.0,85.0,85.0,993.4,993.5,993.5,80.0,80.0,80.0,993.2,993.0,993.0
9,2019-03-14 01:09:42,1552496982,2019,3,14,1,9,42,14.5,85.0,993.8,80.0,993.2,15.3,14.5,14.5,14.5,85.0,85.0,85.0,993.6,993.6,993.6,80.0,80.0,80.0,993.2,993.2,993.2


In [25]:
train.shape

(24807, 29)

In [24]:
train_cols

['outdoorTemp',
 'outdoorHum',
 'outdoorAtmo',
 'indoorHum',
 'indoorAtmo',
 'temperature',
 'outdoorTemp_60s_mean',
 'outdoorTemp_90s_mean',
 'outdoorTemp_120s_mean',
 'outdoorHum_60s_mean',
 'outdoorHum_90s_mean',
 'outdoorHum_120s_mean',
 'outdoorAtmo_60s_mean',
 'outdoorAtmo_90s_mean',
 'outdoorAtmo_120s_mean',
 'indoorHum_60s_mean',
 'indoorHum_90s_mean',
 'indoorHum_120s_mean',
 'indoorAtmo_60s_mean',
 'indoorAtmo_90s_mean',
 'indoorAtmo_120s_mean']

In [20]:
model_valid = lgb.train(params,
                        train_data,
                        valid_sets=[train_data, valid_data],
                        num_boost_round=100000,
                        early_stopping_rounds=200,
                        verbose_eval=200)

Training until validation scores don't improve for 200 rounds
[200]	training's l2: 0.000156642	valid_1's l2: 0.000702914
[400]	training's l2: 5.89784e-05	valid_1's l2: 0.00060594
[600]	training's l2: 3.04612e-05	valid_1's l2: 0.000572647
[800]	training's l2: 1.83483e-05	valid_1's l2: 0.000557668
[1000]	training's l2: 1.22435e-05	valid_1's l2: 0.000551739
[1200]	training's l2: 8.87489e-06	valid_1's l2: 0.000547117
[1400]	training's l2: 6.78607e-06	valid_1's l2: 0.000542942
[1600]	training's l2: 5.42172e-06	valid_1's l2: 0.000540853
[1800]	training's l2: 4.5055e-06	valid_1's l2: 0.000539424
[2000]	training's l2: 3.81845e-06	valid_1's l2: 0.000538201
[2200]	training's l2: 3.36744e-06	valid_1's l2: 0.000537288
[2400]	training's l2: 2.98255e-06	valid_1's l2: 0.000536518
[2600]	training's l2: 2.70894e-06	valid_1's l2: 0.000535733
[2800]	training's l2: 2.48372e-06	valid_1's l2: 0.000535737
[3000]	training's l2: 2.31674e-06	valid_1's l2: 0.000535161
[3200]	training's l2: 2.18673e-06	valid_1's 

In [21]:
model = lgb.train(params,
                  train_all,
                  num_boost_round=model_valid.best_iteration + 20)

In [22]:
pred = model.predict(test[train_cols])

In [23]:
sub['temperature'] = pred
sub.to_csv('../sub/sub_lgb_baseline.csv', index=False)