## First look at train.csv

In [1]:
# Magic
%matplotlib inline

# Libraries in use
import math
import pylab as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import collections  as mc

In [2]:
# Load the training set into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [3]:
# Preview the first five rows of the training data
df.head(n=5)

Unnamed: 0,TIMESTAMP,ID,y,F00,F01,F02,F03,F04,F05,F06,...,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43
0,T0000,S0000,-0.004353,1.0,0.0,50.0,2.210819,-0.07526,0.618897,0.0,...,44.82035,0.035828,0.0,0.022465,78.98305,-0.097214,0.955017,0.648999,0.2955,0.868845
1,T0000,S0001,0.002971,0.0,1.0,89.0,3.047588,-0.006635,0.484964,0.0,...,52.39853,0.021581,0.0,0.006165,33.33333,-0.279424,0.355129,0.444151,0.3144,0.98849
2,T0000,S0002,0.002614,0.0,0.0,85.0,2.145732,-0.007309,0.516876,0.0,...,82.43592,0.10909,0.0,0.005787,44.95496,0.079846,,0.093355,0.3266,1.233096
3,T0000,S0003,-0.000188,0.0,0.0,90.0,1.573162,-0.166067,0.446393,0.0,...,-33.64397,0.209749,0.0,0.012024,73.79487,0.43085,2.596279,-0.570243,-0.1386,0.65734
4,T0000,S0004,-0.014813,0.0,0.0,78.0,2.738358,0.009562,0.526339,0.0,...,-144.092,0.186767,1.0,0.005475,64.69136,0.107902,33.438054,0.530521,0.2053,0.709597


### Some general information about the dataset

In [4]:
# Number of securities in use
# Series.nunique() counts distinct IDs in a vectorised way (and ignores missing
# values) instead of materialising every row into a Python set.
"Number of securities in use is {}".format(df['ID'].nunique()) # So universe consists of 1315 assets

'Number of securities in use is 1315'

In [5]:
# Number of days
# Series.nunique() counts distinct timestamps in a vectorised way (and ignores
# missing values) instead of materialising every row into a Python set.
"Number of days in train dataset {}".format(df['TIMESTAMP'].nunique()) # the first day is T0000, the last one is T1121

'Number of days in train dataset 1122'

In [6]:
# Dataframe shape
row_num, column_num = df.shape

# A complete panel would contain one row per (day, security) pair. Both counts
# are derived from the data rather than hard-coded, so the check stays correct
# if the input file changes.
num_days = df['TIMESTAMP'].nunique()
num_securities = df['ID'].nunique()

# Notice that (number of days * number of securities) is not equal to the number of rows:
# fewer rows than the full grid means some securities are missing on some days.
row_num < num_days * num_securities

True

### Going into more detail

In [7]:
# Summary statistics for every column (numeric and non-numeric alike);
# kept in df_desc so later cells can look up individual statistics.
df_desc = df.describe(include="all")
df_desc

Unnamed: 0,TIMESTAMP,ID,y,F00,F01,F02,F03,F04,F05,F06,...,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43
count,1120078,1120078,1120078.0,1119190.0,1119190.0,1099927.0,1087108.0,1119289.0,1111943.0,1119190.0,...,1063940.0,1106588.0,1119190.0,1110815.0,1111954.0,1119289.0,1064340.0,1120078.0,1110920.0,1086557.0
unique,1122,1315,,,,,,,,,...,,,,,,,,,,
top,T0998,S0994,,,,,,,,,...,,,,,,,,,,
freq,1041,1122,,,,,,,,,...,,,,,,,,,,
mean,,,1.407743e-05,0.0744744,0.04815894,56.18067,6.260496,0.0003567283,1.007768,0.1622423,...,-415.4879,-0.009782252,0.01437736,0.01017884,52.65977,0.04320451,1.648553,-0.0006335325,0.240076,0.9906896
std,,,0.01401112,0.2625415,0.2141021,26.10712,86.10672,0.09055803,0.4687993,0.3686731,...,9045.683,4.496607,0.1190406,0.008899725,17.47191,0.7512947,2.175945,0.5774946,0.5129332,0.2456936
min,,,-0.8114524,0.0,0.0,1.0,1.0,-0.5007085,0.09064174,0.0,...,-327277.4,-367.0323,0.0,0.00017124,0.0,-4.488682,0.00682615,-4.221995,-4.9967,0.2790323
25%,,,-0.005681481,0.0,0.0,36.0,1.882747,-0.04226077,0.7390148,0.0,...,6.200001,0.03425036,0.0,0.005460955,40.28571,-0.3672302,0.5094008,-0.260746,0.3052,0.8325
50%,,,-0.000191688,0.0,0.0,58.0,2.5492,-0.004557201,0.9136,0.0,...,45.9496,0.0793998,0.0,0.0079,52.78718,0.04397215,0.9992146,-3.245478e-05,0.3243,0.9514161
75%,,,0.005395227,0.0,0.0,79.0,3.895441,0.03588148,1.1511,0.0,...,94.1624,0.1375088,0.0,0.012,65.19395,0.4567529,1.961204,0.2593683,0.3406,1.101749


In [8]:
# Every column except the two key columns and the target is treated as a factor
factor_columns = [x for x in df.columns if x not in ('TIMESTAMP', 'ID', 'y')]

In [9]:
# Binary factors
# A factor is binary when its non-missing values are exactly {0, 1}.
# Inspecting the actual values (instead of the min / quartiles / max taken from
# describe()) also catches binary columns in which more than 25% of the rows
# are 1, and rejects columns that merely have min 0 and max 1 with fractional
# values in between.
bin_factors = [
    col for col in factor_columns
    if set(df[col].dropna().unique()) == {0, 1}
]
print("There are {} binary factors and they are:".format(len(bin_factors)))

for factor in bin_factors:
    print(factor)

There are 10 binary factors and they are:
F00
F01
F06
F09
F10
F24
F26
F28
F32
F36


In [10]:
# The number of stocks with data differs from day to day.
# groupby(...).size() returns the row count per TIMESTAMP directly; it avoids a
# Python-level apply and a lambda argument that shadowed the global `df`.
df.groupby("TIMESTAMP").size().tail()

TIMESTAMP
T1117    1026
T1118    1026
T1119    1026
T1120    1026
T1121    1030
dtype: int64

### Quick look at the test.csv

In [11]:
# Load the test set into a DataFrame
df_test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [12]:
# Preview the first five rows of the test data
df_test.head(n=5)

Unnamed: 0,TIMESTAMP,ID,F00,F01,F02,F03,F04,F05,F06,F07,...,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43
0,T1122,S0001,0.0,1.0,88.0,3.413485,-0.037251,0.911037,0.0,72.0,...,32.41207,0.099218,0.0,0.005583,53.65854,0.452007,1.220057,-0.629024,0.3523,0.891381
1,T1122,S0002,0.0,0.0,53.0,1.941026,-0.016867,1.543352,0.0,98.0,...,72.84467,0.145055,0.0,0.004368,55.13889,-0.605037,0.379833,-0.340378,0.3492,0.949277
2,T1122,S0003,0.0,0.0,87.0,2.024666,0.103946,0.783493,0.0,99.0,...,-39.6794,0.230158,0.0,0.006952,47.50219,-0.108841,2.342777,0.779514,0.344,0.907152
3,T1122,S0004,0.0,0.0,95.0,3.527144,0.023134,0.725113,0.0,85.0,...,-12.38947,0.184191,0.0,0.005457,54.01387,-0.739674,8.791357,0.305561,0.344,1.083814
4,T1122,S0005,0.0,0.0,35.0,2.38164,0.035584,2.277727,0.0,65.0,...,100.2469,0.132756,0.0,0.003509,48.49246,0.170616,0.951246,0.392146,0.3544,0.903084


In [13]:
# So test.csv contains the same kind of data as train.csv, except for the target column y