### Convert CL_1min live-stream json data into dataframe and store the data

In [1]:
import os
import time
import json
import pandas as pd
import numpy as np
import gc
import datetime
from pandas.io.json import json_normalize
import itertools
import pickle
import warnings
import subprocess
import sys

warnings.filterwarnings('ignore')

######################################
#       READ THE CORRECT FILE        #
######################################

filename = '/home/melgazar9/Trading/Data/CL/CL_TD_historical_data/CL_1min/CL_1min_historical_data_2018-11-30-09:44:PM.log'


######################################
#           READ JSON DATA           #
######################################

def read_data():
    # read the entire file into a python array
    with open(filename, 'rb') as f:
        data = f.readlines()
     # remove the trailing "\n" from each line
    data = map(lambda x: x.rstrip(), data)

    data_list = []
    try:
        #data = [json.loads(x) for x in data]
        for x in data:
            data_list.append(json.loads(x))
    except ValueError:
        print('THERE IS A NAN VALUE')

    return data_list


######################################
#        VERIFY DATA IS CORRECT      #
######################################

def verify_data_integrity(raw_data):
    json_response = raw_data[0]
    for k1 in json_response['response']:
        if (k1['command'] == 'LOGIN') and ((list(k1['content'].keys()) == ['msg', 'code']) or list(k1['content'].keys()) == ['code', 'msg']) and (k1['service'] == 'ADMIN'):
            print(True)
        else:
            print('REASON: RESPONSE ******Disconnecting from exchange****** REASON: RESPONSE')
            sys.exit()


    json_heartbeat = raw_data[1]
    for k2 in json_heartbeat['snapshot']:
        for k2a in k2.keys():
            if k2a == 'content':
                print(True)
            elif k2a == 'timestamp':
                print(True)
            elif k2a == 'command':
                print(True)
            elif k2a == 'service':
                print(True)
            else:
                print('REASON: NOTIFY ******Disconnecting from exchange****** REASON: NOTIFY')
                sys.exit()

    
    return


def get_curr_df(raw_data):

    cols = {'Key':'/CL', '0':'Datetime','1':'1minOpen','2':'1minHigh','3':'1minLow','4':'1minClose','5':'1minVolume'}

    data = pd.io.json.json_normalize(raw_data)['snapshot'].dropna()
    for lst1 in data:
        for lst2 in lst1:
            for lst3 in lst2['content']:
                df = pd.DataFrame(lst3['3'])
                
    df = df.rename(columns=cols)
    df['1minRange'] = df['1minHigh'] - df['1minLow']
    df['1minMove'] = df['1minClose'] - df['1minOpen']
    df['1minLowMove'] = df['1minLow'] - df['1minOpen']
    df['1minHighMove'] = df['1minHigh'] - df['1minOpen']
    df = df.set_index('Datetime')
    df.index = pd.to_datetime(df.index, unit='ms')
    df = df.sort_index()

    return df


raw_data = read_data()
verify_data_integrity(raw_data)
historical_1min = get_curr_df(raw_data)
historical_1min = historical_1min.sort_index()
print(historical_1min.shape)
historical_1min = historical_1min.astype('float32')
historical_1min.tail(2)

THERE IS A NAN VALUE
True
True
True
True
True
(40001, 9)


Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minVolume,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-30 21:59:00,50.709999,50.740002,50.650002,50.720001,92.0,0.09,0.01,-0.06,0.03
2018-11-30 23:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
historical_1min = historical_1min.drop_duplicates()

### Live Data

In [3]:
def read_live_data(filepath):
    # read the entire file into a python array
    with open(filepath, 'rb') as f:
        data = f.readlines()

    # remove the trailing "\n" from each line
    data = map(lambda x: x.rstrip(), data)

    data_list = []
    try:
        #data = [json.loads(x) for x in data]
        for x in data:
            data_list.append(json.loads(x))
    except ValueError:
        print('THERE IS A NAN VALUE')

    return data_list


######################################
#        VERIFY DATA IS CORRECT      #
######################################

def verify_live_data_integrity(raw_data):
    json_response = raw_data[0] # raw_data = read_data(filename)
    for k1 in json_response['response']:
        if (k1['command'] == 'LOGIN') and ((list(k1['content'].keys()) == ['msg', 'code']) or list(k1['content'].keys()) == ['code', 'msg']) and (k1['service'] == 'ADMIN'):
            print(True)
        else:
            print('REASON: RESPONSE ******Disconnecting from exchange****** REASON: RESPONSE')
            sys.exit()


    json_heartbeat = raw_data[1]
    for k2 in json_heartbeat['notify']:
        for k2a in k2.keys():
            if k2a == 'heartbeat':
                print(True)
            else:
                print('REASON: NOTIFY ******Disconnecting from exchange****** REASON: NOTIFY')
                sys.exit()

    json_response2 = raw_data[2]

    for k3 in json_response2['response']:
        if (k3['service'] == 'CHART_FUTURES') and (k3['command'] == 'SUBS') and ((list(k3['content'].keys()) == ['msg', 'code']) or (list(k3['content'].keys()) == ['code', 'msg'])):
            print(True)
        else:
            print('REASON: RESPONSE2 ******Disconnecting from exchange****** REASON: RESPONSE2')
            sys.exit()

    json_data = raw_data[3]
    for k4 in json_data['data']:
        if (k4['command'] == 'SUBS') and (k4['service'] == 'CHART_FUTURES'):
            print(True)
        else:
            print('REASON: DATA ISSUE ******Disconnecting from exchange****** REASON: DATA ISSUE')
            sys.exit()

    return

def get_live_df(raw_data):

    cols = {'1':'Datetime', '2':'1minOpen', '3':'1minHigh', '4':'1minLow', '5':'1minClose', '6':'1minVolume', 'key':'Symbol'}

    data_lst = []

    for lst1 in raw_data:
        if list(lst1.keys()) == ['data']:
            for lst2 in lst1['data']:
                for lst3 in lst2['content']:
                    data_lst.append(lst3)

    df = pd.DataFrame.from_dict(data_lst).rename(columns=cols).drop(['seq'], axis=1)
    df['1minRange'] = df['1minHigh'] - df['1minLow']
    df['1minMove'] = df['1minClose'] - df['1minOpen']
    df['1minLowMove'] = df['1minLow'] - df['1minOpen']
    df['1minHighMove'] = df['1minHigh'] - df['1minOpen']
    df = df.set_index('Datetime')
    df.index = pd.to_datetime(df.index, unit='ms')
    df = df.sort_index()

    return df

def get_live_df_init(df_1min, timeframe):

    df_1min = df_1min.drop('Symbol', axis=1)
    df_ohlc = df_1min.resample(timeframe).ohlc()
    df_volume = df_1min['1minVolume'].resample(timeframe).sum()

    df_resampled = pd.DataFrame()
    df_resampled[timeframe + 'Open'] = df_ohlc['1minOpen']['open']
    df_resampled[timeframe + 'High'] = df_ohlc['1minHigh']['high']
    df_resampled[timeframe + 'Low'] = df_ohlc['1minLow']['low']
    df_resampled[timeframe + 'Close'] = df_ohlc['1minClose']['close']
    df_resampled[timeframe + 'Move'] = df_ohlc['1minClose']['close'] - df_ohlc['1minOpen']['open']
    df_resampled[timeframe + 'Range'] = df_ohlc['1minHigh']['high'] - df_ohlc['1minLow']['low']
    df_resampled[timeframe + 'HighMove'] = df_ohlc['1minHigh']['high'] - df_ohlc['1minOpen']['open']
    df_resampled[timeframe + 'LowMove'] = df_ohlc['1minLow']['low'] - df_ohlc['1minOpen']['open']
    df_resampled[timeframe + 'Volume'] = df_volume

    return df_resampled


raw_live = read_live_data('/home/melgazar9/Trading/Live-Trading/CL/live-stream-data/CL_1min_live-logs/CL_1min_2018-11-25-10:50:PM.log')
verify_live_data_integrity(raw_live)
live_1min = get_live_df(raw_live).drop('Symbol', axis=1)
live_1min.index = pd.to_datetime(live_1min.index)
live_1min = live_1min.astype('float32')
live_1min.head()

THERE IS A NAN VALUE
True
True
True
True


Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minVolume,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-26 04:49:00,50.939999,50.939999,50.91,50.93,90.0,0.03,-0.01,-0.03,0.0
2018-11-26 04:50:00,50.93,50.93,50.880001,50.900002,105.0,0.05,-0.03,-0.05,0.0
2018-11-26 04:51:00,50.889999,50.900002,50.889999,50.889999,18.0,0.01,0.0,0.0,0.01
2018-11-26 04:52:00,50.900002,50.939999,50.889999,50.939999,263.0,0.05,0.04,-0.01,0.04
2018-11-26 04:53:00,50.939999,50.939999,50.91,50.919998,115.0,0.03,-0.02,-0.03,0.0


In [4]:
historical_subset = historical_1min[live_1min.index[0]:live_1min.index[-1]]
historical_subset.head()

Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minVolume,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-26 04:49:00,50.939999,50.939999,50.91,50.93,90.0,0.03,-0.01,-0.03,0.0
2018-11-26 04:50:00,50.93,50.93,50.880001,50.900002,105.0,0.05,-0.03,-0.05,0.0
2018-11-26 04:51:00,50.889999,50.900002,50.889999,50.889999,18.0,0.01,0.0,0.0,0.01
2018-11-26 04:52:00,50.900002,50.939999,50.889999,50.939999,262.0,0.05,0.04,-0.01,0.04
2018-11-26 04:53:00,50.939999,50.939999,50.91,50.919998,115.0,0.03,-0.02,-0.03,0.0


In [5]:
live_1min = live_1min.drop_duplicates()

In [6]:
merged = live_1min[live_1min.index.isin(historical_1min.index)]
merged.head()

Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minVolume,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-26 04:49:00,50.939999,50.939999,50.91,50.93,90.0,0.03,-0.01,-0.03,0.0
2018-11-26 04:50:00,50.93,50.93,50.880001,50.900002,105.0,0.05,-0.03,-0.05,0.0
2018-11-26 04:51:00,50.889999,50.900002,50.889999,50.889999,18.0,0.01,0.0,0.0,0.01
2018-11-26 04:52:00,50.900002,50.939999,50.889999,50.939999,263.0,0.05,0.04,-0.01,0.04
2018-11-26 04:53:00,50.939999,50.939999,50.91,50.919998,115.0,0.03,-0.02,-0.03,0.0


In [7]:
historical_1min['2018-11-26 04:49:00':'2018-11-27 11:58:00'].columns

Index(['1minOpen', '1minHigh', '1minLow', '1minClose', '1minVolume',
       '1minRange', '1minMove', '1minLowMove', '1minHighMove'],
      dtype='object')

In [8]:
merged.shape

(1806, 9)

In [9]:
historical_subset = historical_1min['2018-11-26 04:49:00':'2018-11-27 11:58:00']

In [10]:
merged['1minLow'][0].astype('float32')

50.91

In [11]:
historical_subset['1minLow'][0].astype('float32')

50.91

In [12]:
merged == historical_subset

Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minVolume,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-26 04:49:00,True,True,True,True,True,True,True,True,True
2018-11-26 04:50:00,True,True,True,True,True,True,True,True,True
2018-11-26 04:51:00,True,True,True,True,True,True,True,True,True
2018-11-26 04:52:00,True,True,True,True,False,True,True,True,True
2018-11-26 04:53:00,True,True,True,True,True,True,True,True,True
2018-11-26 04:54:00,True,True,True,True,True,True,True,True,True
2018-11-26 04:55:00,True,True,True,True,True,True,True,True,True
2018-11-26 04:56:00,True,True,True,True,True,True,True,True,True
2018-11-26 04:57:00,True,True,True,True,True,True,True,True,True
2018-11-26 04:58:00,True,True,True,True,True,True,True,True,True


In [43]:
subtracted = merged - historical_subset
subtracted[(subtracted >= 1).any(axis=1)]

Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minVolume,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-26 04:52:00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2018-11-26 07:12:00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2018-11-26 11:08:00,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
2018-11-26 11:20:00,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0
2018-11-27 03:31:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2018-11-27 07:13:00,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0
2018-11-27 07:25:00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2018-11-27 07:26:00,0.0,0.0,0.0,0.0,36.0,0.0,0.0,0.0,0.0
2018-11-27 11:16:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2018-11-27 11:20:00,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0


In [44]:
merged.drop('1minVolume', axis=1) == historical_subset.drop('1minVolume', axis=1)

Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-11-26 04:49:00,True,True,True,True,True,True,True,True
2018-11-26 04:50:00,True,True,True,True,True,True,True,True
2018-11-26 04:51:00,True,True,True,True,True,True,True,True
2018-11-26 04:52:00,True,True,True,True,True,True,True,True
2018-11-26 04:53:00,True,True,True,True,True,True,True,True
2018-11-26 04:54:00,True,True,True,True,True,True,True,True
2018-11-26 04:55:00,True,True,True,True,True,True,True,True
2018-11-26 04:56:00,True,True,True,True,True,True,True,True
2018-11-26 04:57:00,True,True,True,True,True,True,True,True
2018-11-26 04:58:00,True,True,True,True,True,True,True,True


In [13]:
pd.Series(live_1min.drop_duplicates().index) == pd.Series(historical_1min['2018-11-26 04:49:00':'2018-11-27 11:58:00'].drop_duplicates().index)

0       True
1       True
2       True
3       True
4       True
5       True
6       True
7       True
8       True
9       True
10      True
11      True
12      True
13      True
14      True
15      True
16      True
17      True
18      True
19      True
20      True
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29      True
        ... 
1776    True
1777    True
1778    True
1779    True
1780    True
1781    True
1782    True
1783    True
1784    True
1785    True
1786    True
1787    True
1788    True
1789    True
1790    True
1791    True
1792    True
1793    True
1794    True
1795    True
1796    True
1797    True
1798    True
1799    True
1800    True
1801    True
1802    True
1803    True
1804    True
1805    True
Name: Datetime, Length: 1806, dtype: bool

In [14]:
historical_1min['2018-11-26 04:49:00':'2018-11-27 11:59:00'].tail()

Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minVolume,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-27 11:55:00,51.73,51.740002,51.700001,51.709999,327.0,0.04,-0.02,-0.03,0.01
2018-11-27 11:56:00,51.700001,51.720001,51.689999,51.700001,254.0,0.03,0.0,-0.01,0.02
2018-11-27 11:57:00,51.700001,51.709999,51.68,51.689999,170.0,0.03,-0.01,-0.02,0.01
2018-11-27 11:58:00,51.700001,51.720001,51.689999,51.700001,222.0,0.03,0.0,-0.01,0.02
2018-11-27 11:59:00,51.689999,51.709999,51.669998,51.700001,195.0,0.04,0.01,-0.02,0.02


In [15]:
live_1min.tail()

Unnamed: 0_level_0,1minOpen,1minHigh,1minLow,1minClose,1minVolume,1minRange,1minMove,1minLowMove,1minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-27 11:54:00,51.73,51.77,51.720001,51.740002,203.0,0.05,0.01,-0.01,0.04
2018-11-27 11:55:00,51.73,51.740002,51.700001,51.709999,327.0,0.04,-0.02,-0.03,0.01
2018-11-27 11:56:00,51.700001,51.720001,51.689999,51.700001,254.0,0.03,0.0,-0.01,0.02
2018-11-27 11:57:00,51.700001,51.709999,51.68,51.689999,170.0,0.03,-0.01,-0.02,0.01
2018-11-27 11:58:00,51.700001,51.720001,51.689999,51.700001,222.0,0.03,0.0,-0.01,0.02


In [16]:
i=0
while i < len(historical_subset.index):
    if historical_subset.index[i-1] != historical_subset.index[i] - pd.Timedelta(minutes=1):
        print(historical_subset.index[i])
        i+=1
    else:
        i+=1

2018-11-26 04:49:00
2018-11-26 21:55:00
2018-11-26 23:00:00
2018-11-27 00:24:00
2018-11-27 00:29:00
2018-11-27 04:22:00


In [17]:
i=0
while i < len(live_1min.index):
    if live_1min.index[i-1] != live_1min.index[i] - pd.Timedelta(minutes=1):
        print(live_1min.index[i])
        i+=1
    else:
        i+=1

2018-11-26 04:49:00
2018-11-26 21:55:00
2018-11-26 23:00:00
2018-11-27 00:24:00
2018-11-27 00:29:00
2018-11-27 04:22:00


In [18]:
live_1min.loc['2018-11-26 23:23:00']

1minOpen        51.590000
1minHigh        51.599998
1minLow         51.590000
1minClose       51.599998
1minVolume       7.000000
1minRange        0.010000
1minMove         0.010000
1minLowMove      0.000000
1minHighMove     0.010000
Name: 2018-11-26 23:23:00, dtype: float32

In [19]:
live_1min.shape, live_1min.drop_duplicates().shape

((1806, 9), (1806, 9))

In [46]:
historical_subset['1minVolume'].resample('10min').sum()

Datetime
2018-11-26 04:40:00       90.0
2018-11-26 04:50:00      804.0
2018-11-26 05:00:00      626.0
2018-11-26 05:10:00      478.0
2018-11-26 05:20:00      506.0
2018-11-26 05:30:00      702.0
2018-11-26 05:40:00     1106.0
2018-11-26 05:50:00     2833.0
2018-11-26 06:00:00     3986.0
2018-11-26 06:10:00     1519.0
2018-11-26 06:20:00     1132.0
2018-11-26 06:30:00     2014.0
2018-11-26 06:40:00     2020.0
2018-11-26 06:50:00      852.0
2018-11-26 07:00:00     2621.0
2018-11-26 07:10:00     1296.0
2018-11-26 07:20:00     1564.0
2018-11-26 07:30:00     3175.0
2018-11-26 07:40:00     1534.0
2018-11-26 07:50:00     1925.0
2018-11-26 08:00:00     3445.0
2018-11-26 08:10:00     3998.0
2018-11-26 08:20:00     6309.0
2018-11-26 08:30:00     6528.0
2018-11-26 08:40:00     2336.0
2018-11-26 08:50:00     2814.0
2018-11-26 09:00:00     6612.0
2018-11-26 09:10:00     3477.0
2018-11-26 09:20:00     3542.0
2018-11-26 09:30:00     2104.0
                        ...   
2018-11-27 07:00:00     1442.0