In [1]:
import numpy as np 
import pandas as pd 
from scipy import signal
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the "train" table from the HDF5 store; the store is opened read-only
# and closed again as soon as the frame has been fetched.
with pd.HDFStore("./data/train.h5", mode="r") as store:
    df = store["train"]

In [3]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,...,technical_36,technical_37,technical_38,technical_39,technical_40,technical_41,technical_42,technical_43,technical_44,y
0,10,0,0.370326,-0.006316,0.222831,-0.21303,0.729277,-0.335633,0.113292,1.621238,...,0.775208,,,,-0.414776,,,-2.0,,-0.011753
1,11,0,0.014765,-0.038064,-0.017425,0.320652,-0.034134,0.004413,0.114285,-0.210185,...,0.02559,,,,-0.273607,,,-2.0,,-0.00124
2,12,0,-0.010622,-0.050577,3.379575,-0.157525,-0.06855,-0.155937,1.219439,-0.764516,...,0.151881,,,,-0.17571,,,-2.0,,-0.02094
3,25,0,,,,,,0.178495,,-0.007262,...,1.035936,,,,-0.211506,,,-2.0,,-0.015959
4,26,0,0.176693,-0.025284,-0.05768,0.0151,0.180894,0.139445,-0.125687,-0.018707,...,0.630232,,,,-0.001957,,,0.0,,-0.007338


In [4]:
# Column families: the part of each column name that precedes the first
# underscore (columns without an underscore contribute their whole name).
list({name.partition('_')[0] for name in df.columns})

[u'technical', u'timestamp', u'derived', u'fundamental', u'y', u'id']

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# columns of the full frame.
column_stats = df.describe()
column_stats

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,...,technical_36,technical_37,technical_38,technical_39,technical_40,technical_41,technical_42,technical_43,technical_44,y
count,1710756.0,1710756.0,1637797.0,1629727.0,1312105.0,1561285.0,1304298.0,1686809.0,1031686.0,1341916.0,...,1708204.0,1691591.0,1691591.0,1690740.0,1708520.0,1666567.0,1690755.0,1706070.0,1473977.0,1710756.0
mean,1093.858,945.6257,-4.536046,772943600000.0,-0.3320328,-0.5046012,18.01661,-0.02040938,-570375400.0,-0.1622954,...,-0.08584833,-0.09103397,-0.08156685,-0.07287001,0.04908321,0.005236218,-0.01699966,-0.9735299,0.0003881475,0.0002217509
std,630.8563,519.5685,249.7382,76206060000000.0,65.1981,102.0749,925.836,0.2494859,75023220000.0,3.66815,...,0.6125852,0.2471038,0.2346534,0.2235729,0.3102316,0.1133733,0.2116284,0.9605551,0.03011983,0.02240643
min,0.0,0.0,-20174.97,-0.07375435,-9848.88,-34341.76,-8551.914,-2.344957,-10437370000000.0,-1077.101,...,-1.687572,-1.0,-1.0,-1.0,-0.5250904,-0.4449529,-1.0,-2.0,-0.1265686,-0.08609413
25%,550.0,504.0,-0.144971,-0.02956479,-0.05967524,-0.1655826,-0.105705,-0.1996543,-0.196047,-0.2280967,...,-0.4050297,-0.0004651562,-0.0001992532,-2.203252e-05,-0.1521701,-0.07377038,-3.887695e-15,-2.0,-0.01998819,-0.009561389
50%,1098.0,956.0,-0.0008368272,0.005523058,0.02109505,0.002475614,0.01175234,-0.04064488,-0.007395084,-0.03029069,...,-0.08502064,-3.951567e-12,-1.418487e-13,-1.591224e-16,-0.01476793,9.782702e-05,0.0,-0.659754,1.117279e-05,-0.0001570681
75%,1657.0,1401.0,0.1199108,0.1078554,0.1952209,0.3037236,0.1556464,0.1303819,0.1832071,0.1764751,...,0.19096,-5.219879e-40,0.0,0.0,0.1772415,0.07855728,0.0,-5.188884e-08,0.02047074,0.00952099
max,2158.0,1812.0,3252.527,1.068448e+16,3823.001,1239.737,67859.65,1.378195,520.3165,76.77125,...,49.57758,0.0,0.0,0.0,1.569265,0.6844833,1.0,0.0,0.1435858,0.09349781


In [7]:
# Work on a narrower frame: the id/timestamp keys, the five derived_*
# features and the target y. Only columns are selected here -- every row
# (all timestamps) is kept.
subset_columns = [
    'id',
    'timestamp',
    'derived_0',
    'derived_1',
    'derived_2',
    'derived_3',
    'derived_4',
    'y',
]
ndf = df.loc[:, subset_columns]
ndf.describe()

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,y
count,1710756.0,1710756.0,1637797.0,1629727.0,1312105.0,1561285.0,1304298.0,1710756.0
mean,1093.858,945.6257,-4.536046,772943600000.0,-0.3320328,-0.5046012,18.01661,0.0002217509
std,630.8563,519.5685,249.7382,76206060000000.0,65.1981,102.0749,925.836,0.02240643
min,0.0,0.0,-20174.97,-0.07375435,-9848.88,-34341.76,-8551.914,-0.08609413
25%,550.0,504.0,-0.144971,-0.02956479,-0.05967524,-0.1655826,-0.105705,-0.009561389
50%,1098.0,956.0,-0.0008368272,0.005523058,0.02109505,0.002475614,0.01175234,-0.0001570681
75%,1657.0,1401.0,0.1199108,0.1078554,0.1952209,0.3037236,0.1556464,0.00952099
max,2158.0,1812.0,3252.527,1.068448e+16,3823.001,1239.737,67859.65,0.09349781


In [42]:
# Count how many rows of ndf belong to a single security id.
security_id = 10
# Series.sum() counts the True entries of the boolean mask in vectorized
# code; the builtin sum() would walk every row of the mask in Python.
n_rows = (ndf.id == security_id).sum()
# A single parenthesised argument prints identically under Python 2's print
# statement and Python 3's print function.
print("number of times security {} is traded: {}".format(security_id, n_rows))

number of times security 10 is traded: 116


In [47]:
# Report how many rows of ndf fall on each of a few sample timestamps.
# One loop replaces the copy-pasted lines; the mask is counted with the
# vectorized Series.sum() rather than the builtin sum(), which would
# iterate over every row in Python.
for ts in (0, 1, 25):
    n_rows = (ndf.timestamp == ts).sum()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("number of securities traded at timestamp {} are : {}".format(ts, n_rows))

number of securities traded at timestamp 0 are : 750
number of securities traded at timestamp 1 are : 750
number of securities traded at timestamp 25 are : 745


In [45]:
# Show the rows of ndf whose timestamp equals 0.
at_timestamp_zero = ndf.timestamp == 0
ndf[at_timestamp_zero]

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,y
0,10,0,0.370326,-0.006316,0.222831,-0.213030,0.729277,-0.011753
1,11,0,0.014765,-0.038064,-0.017425,0.320652,-0.034134,-0.001240
2,12,0,-0.010622,-0.050577,3.379575,-0.157525,-0.068550,-0.020940
3,25,0,,,,,,-0.015959
4,26,0,0.176693,-0.025284,-0.057680,0.015100,0.180894,-0.007338
5,27,0,0.346856,0.166239,-6.080701,-0.992249,-0.125916,0.031425
6,31,0,0.072036,0.014931,,0.014063,,-0.032895
7,38,0,0.300062,0.071251,-0.074451,-0.065292,-0.011286,0.015803
8,39,0,-0.003511,-0.034270,0.082372,-0.023937,-0.025750,-0.027593
9,40,0,-0.083330,0.081935,-2.048438,-0.206856,-0.839563,0.006662


In [46]:
# Show every row of ndf recorded for security id 10.
is_security_10 = ndf['id'].eq(10)
ndf.loc[is_security_10]

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,y
0,10,0,0.370326,-0.006316,0.222831,-0.213030,0.729277,-0.011753
750,10,1,0.370326,-0.008504,0.224193,-0.216165,0.729277,0.005850
1500,10,2,0.370326,-0.009375,0.224736,-0.217415,0.729277,-0.000476
2250,10,3,0.370326,-0.010127,0.225204,-0.218492,0.729277,0.005212
3000,10,4,0.370326,-0.011338,0.225958,-0.220228,0.729277,-0.077211
3750,10,5,0.370326,-0.011824,0.226261,-0.220925,0.729277,0.006086
4500,10,6,0.370326,-0.012245,0.226523,-0.221528,0.729277,-0.027747
5250,10,7,0.370326,-0.012610,0.226750,-0.222051,0.729277,-0.033304
6000,10,8,0.370326,-0.012926,0.226947,-0.222505,0.729277,-0.002235
6750,10,9,0.370326,-0.013200,0.227118,-0.222898,0.729277,0.067070


In [17]:
# Scratch cell: evaluates range(10) only so that the result is displayed;
# nothing is assigned from it.
# NOTE(review): looks like leftover experimentation unrelated to the data
# exploration -- consider removing.
range(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [27]:
# Histograms of the two key columns, one subplot per column.
id_and_timestamp = df.loc[:, ['id', 'timestamp']]
id_and_timestamp.hist()

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000000C7B7518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000000C7E6F28>]], dtype=object)

In [None]:
# Calls DataFrame.plot() with default arguments on the complete frame.
# NOTE(review): this cell carries no execution count, so it appears never to
# have been run; plotting the whole frame at once is presumably very slow and
# hard to read -- consider selecting specific columns/rows first.
df.plot()