In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook
import itertools

import helpers as h
%matplotlib inline

In [2]:
# Load the raw train and test frames from their HDF5 files.
train_path = '../input/crypto_train.h5'
test_path = '../input/crypto_test.h5'

train = pd.read_hdf(train_path)
test = pd.read_hdf(test_path)

In [3]:
# Peek at the first five rows of the raw training frame.
train.head(n=5)

Unnamed: 0,id,datetime,ts,value
0,0,2017-09-30 22:00:00,0,59.744697
1,1,2017-09-30 23:00:00,0,59.697985
2,2,2017-10-01 00:00:00,0,59.242129
3,3,2017-10-01 01:00:00,0,58.624959
4,4,2017-10-01 02:00:00,0,60.135081


In [4]:
# Per-column flag: True when the column holds at least one missing value.
train.isna().any()

id          False
datetime    False
ts          False
value        True
dtype: bool

In [4]:
# Materialise whatever h.find_continous_series yields for the raw frame
# (presumably the ids of series with gaps -- confirm in helpers), then
# report how many there are followed by the full list.
not_continous = [ts_id for ts_id in h.find_continous_series(train)]
print(len(not_continous), not_continous, sep='\n')

HBox(children=(IntProgress(value=0, description='ts loop', max=1650), HTML(value='')))


150
[2, 13, 24, 35, 46, 57, 68, 79, 90, 101, 112, 123, 134, 145, 156, 167, 178, 189, 200, 211, 222, 233, 244, 255, 266, 277, 288, 299, 310, 321, 332, 343, 354, 365, 376, 387, 398, 409, 420, 431, 442, 453, 464, 475, 486, 497, 508, 519, 530, 541, 552, 563, 574, 585, 596, 607, 618, 629, 640, 651, 662, 673, 684, 695, 706, 717, 728, 739, 750, 761, 772, 783, 794, 805, 816, 827, 838, 849, 860, 871, 882, 893, 904, 915, 926, 937, 948, 959, 970, 981, 992, 1003, 1014, 1025, 1036, 1047, 1058, 1069, 1080, 1091, 1102, 1113, 1124, 1135, 1146, 1157, 1168, 1179, 1190, 1201, 1212, 1223, 1234, 1245, 1256, 1267, 1278, 1289, 1300, 1311, 1322, 1333, 1344, 1355, 1366, 1377, 1388, 1399, 1410, 1421, 1432, 1443, 1454, 1465, 1476, 1487, 1498, 1509, 1520, 1531, 1542, 1553, 1564, 1575, 1586, 1597, 1608, 1619, 1630, 1641]


In [5]:
# Materialise whatever h.with_nan_series yields for the raw frame
# (presumably the ids of series containing NaN values -- confirm in helpers),
# then report the count followed by the full list.
with_nan = [ts_id for ts_id in h.with_nan_series(train)]
print(len(with_nan), with_nan, sep='\n')

HBox(children=(IntProgress(value=0, description='ts loop', max=1650), HTML(value='')))


390
[2, 6, 10, 24, 28, 32, 35, 39, 43, 46, 50, 54, 57, 61, 65, 68, 72, 76, 79, 83, 87, 90, 94, 98, 101, 105, 109, 123, 127, 131, 134, 138, 142, 167, 171, 175, 189, 193, 197, 200, 204, 208, 211, 215, 219, 222, 226, 230, 233, 237, 241, 244, 248, 252, 255, 259, 263, 266, 270, 274, 277, 281, 285, 288, 292, 296, 299, 303, 307, 310, 314, 318, 321, 325, 329, 332, 336, 340, 343, 347, 351, 354, 358, 362, 365, 369, 373, 376, 380, 384, 387, 391, 395, 398, 402, 406, 409, 413, 417, 420, 424, 428, 431, 435, 439, 453, 457, 461, 464, 468, 472, 475, 479, 483, 497, 501, 505, 508, 512, 516, 519, 523, 527, 530, 534, 538, 541, 545, 549, 552, 556, 560, 563, 567, 571, 574, 578, 582, 585, 589, 593, 596, 600, 604, 607, 611, 615, 618, 622, 626, 629, 633, 637, 640, 644, 648, 651, 655, 659, 662, 666, 670, 673, 677, 681, 684, 688, 692, 695, 699, 703, 706, 710, 714, 728, 732, 736, 750, 754, 758, 761, 765, 769, 783, 787, 791, 794, 798, 802, 805, 809, 813, 816, 820, 824, 827, 831, 835, 849, 853, 857, 860, 864, 868, 

#### Filling empty values with interpolation

In [6]:
# Fill missing values by interpolation; limit_direction='both' also fills
# NaNs at the very start and end of the column.
# The result is assigned back instead of calling interpolate(inplace=True) on
# the column selection: that form is chained assignment, which raises a
# warning on recent pandas and leaves `train` unmodified under Copy-on-Write.
# NOTE(review): the fill runs over the whole column, so gaps at the edge of
# one `ts` series are interpolated using values from the neighbouring series
# -- confirm this is intended.
train['value'] = train['value'].interpolate(limit_direction='both')

In [7]:
# Re-run the NaN check after interpolation.
with_nan = list(h.with_nan_series(train))
print(len(with_nan))
print(with_nan)
# The original format string had no '{}' placeholder, so the computed flag was
# silently dropped and only the text 'is null' was printed. Collapse the
# per-column flags to a single bool and actually print it.
print('is null: {}'.format(train.isnull().any().any()))

HBox(children=(IntProgress(value=0, description='ts loop', max=1650), HTML(value='')))


0
[]
is null


Some time series are not continuous. I will resample them to add the missing values.

In [8]:
# Rebuild the training frame one series at a time. Series that
# h.is_series_continous rejects are resampled onto an hourly grid and the
# newly created rows are filled by interpolation.
#
# The per-series frames are collected in a list and concatenated once at the
# end: growing a DataFrame with .append() inside the loop copies the whole
# frame on every iteration (quadratic), and DataFrame.append no longer exists
# in pandas >= 2.0.
range_to_process = train.ts.unique()
fixed_parts = []

for ts_index in tqdm_notebook(range_to_process, desc='ts loop'):
    train_ts = train[train.ts == ts_index]
    if not h.is_series_continous(train_ts):
        print('fixing {}'.format(ts_index))
        # Resample only the columns that are kept; 'datetime' becomes the
        # index for resampling and is restored as a column by reset_index().
        train_ts = (
            train_ts.set_index('datetime')[['ts', 'value']]
            .resample('H').asfreq()
            .interpolate()
            .reset_index()
        )
    fixed_parts.append(train_ts[['datetime', 'ts', 'value']])

if fixed_parts:
    fixed_train = pd.concat(fixed_parts, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the expected schema instead.
    fixed_train = pd.DataFrame(columns=['datetime', 'ts', 'value'])

# Interpolation turns 'ts' into floats on the resampled series; restore ints.
fixed_train['ts'] = fixed_train['ts'].astype(int)

HBox(children=(IntProgress(value=0, description='ts loop', max=1650), HTML(value='')))

fixing 2
fixing 13
fixing 24
fixing 35
fixing 46
fixing 57
fixing 68
fixing 79
fixing 90
fixing 101
fixing 112
fixing 123
fixing 134
fixing 145
fixing 156
fixing 167
fixing 178
fixing 189
fixing 200
fixing 211
fixing 222
fixing 233
fixing 244
fixing 255
fixing 266
fixing 277
fixing 288
fixing 299
fixing 310
fixing 321
fixing 332
fixing 343
fixing 354
fixing 365
fixing 376
fixing 387
fixing 398
fixing 409
fixing 420
fixing 431
fixing 442
fixing 453
fixing 464
fixing 475
fixing 486
fixing 497
fixing 508
fixing 519
fixing 530
fixing 541
fixing 552
fixing 563
fixing 574
fixing 585
fixing 596
fixing 607
fixing 618
fixing 629
fixing 640
fixing 651
fixing 662
fixing 673
fixing 684
fixing 695
fixing 706
fixing 717
fixing 728
fixing 739
fixing 750
fixing 761
fixing 772
fixing 783
fixing 794
fixing 805
fixing 816
fixing 827
fixing 838
fixing 849
fixing 860
fixing 871
fixing 882
fixing 893
fixing 904
fixing 915
fixing 926
fixing 937
fixing 948
fixing 959
fixing 970
fixing 981
fixing 992
fixing 10

In [9]:
# Discard every row that still has a missing value in any column.
fixed_train = fixed_train.dropna(axis=0, how='any')

In [10]:
# Peek at the first five rows of the repaired frame.
fixed_train.head(n=5)

Unnamed: 0,datetime,ts,value
0,2017-09-30 22:00:00,0,59.744697
1,2017-09-30 23:00:00,0,59.697985
2,2017-10-01 00:00:00,0,59.242129
3,2017-10-01 01:00:00,0,58.624959
4,2017-10-01 02:00:00,0,60.135081


In [11]:
# Summarise the repaired frame: row count, per-column dtypes and non-null
# counts, and memory usage.
fixed_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 863850 entries, 0 to 863849
Data columns (total 3 columns):
datetime    863850 non-null datetime64[ns]
ts          863850 non-null int32
value       863850 non-null float64
dtypes: datetime64[ns](1), float64(1), int32(1)
memory usage: 23.1 MB


In [12]:
# Number of distinct series ids present in the repaired frame.
fixed_train['ts'].nunique(dropna=False)

1650

In [13]:
# Per-column flag: True when the column holds at least one missing value.
fixed_train.isna().any()

datetime    False
ts          False
value       False
dtype: bool

In [14]:
# Re-run the continuity helper on the repaired frame and report the count
# followed by the list of ids it yields.
not_cont = [ts_id for ts_id in h.find_continous_series(fixed_train)]
print(len(not_cont), not_cont, sep='\n')

HBox(children=(IntProgress(value=0, description='ts loop', max=1650), HTML(value='')))


0
[]


In [15]:
# Re-run the NaN helper on the repaired frame and report the count followed
# by the list of ids it yields.
with_nan = [ts_id for ts_id in h.with_nan_series(fixed_train)]
print(len(with_nan), with_nan, sep='\n')

HBox(children=(IntProgress(value=0, description='ts loop', max=1650), HTML(value='')))


0
[]


In [16]:
# Persist the repaired frame to HDF5 under the 'crypto' key.
fixed_train.to_hdf(
    '../input/crypto_fixed_train.h5',
    key='crypto',
    index=False,
)