## This script concatenates old data and new data
### Use the code below if both the old and new data are JSON files. If the old data is a DataFrame, scroll down.

In [1]:
import os
import time
import json
import pandas as pd
import numpy as np
import gc
import datetime
from pandas.io.json import json_normalize
import itertools
import pickle
import warnings
import subprocess
import sys

# os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
warnings.filterwarnings('ignore')

######################################
#       READ THE CORRECT FILE        #
######################################

def read_data(json_file):
    """Load a newline-delimited JSON file into a list of decoded records.

    Every non-blank line is parsed on its own with ``json.loads``. A line
    that cannot be parsed is skipped and reported; it no longer stops the
    read, so records that follow a malformed line are still returned.

    Parameters
    ----------
    json_file : str
        Path to a file holding one JSON document per line.

    Returns
    -------
    list
        Decoded records in file order, without blank or unparseable lines.
    """
    data_list = []
    bad_line_numbers = []

    # Stream the file instead of materialising every line up front.
    # Binary mode is kept from the original: json.loads receives the raw bytes.
    with open(json_file, 'rb') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            try:
                data_list.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass.
                bad_line_numbers.append(line_number)

    if bad_line_numbers:
        print('THERE IS A NAN VALUE: skipped {} unparseable line(s) in {}, first at line {}'.format(
            len(bad_line_numbers), json_file, bad_line_numbers[0]))

    return data_list


######################################
#        VERIFY DATA IS CORRECT      #
######################################

def verify_data_integrity(raw_data):
    """Sanity-check the first two decoded records of a data file.

    ``raw_data[0]['response']``: every entry must have command ``LOGIN``,
    service ``ADMIN`` and a ``content`` mapping whose keys are exactly
    ``msg`` and ``code``.
    ``raw_data[1]['snapshot']``: every entry may only use the keys
    ``content``, ``timestamp``, ``command`` and ``service``.

    ``True`` is printed for each check that passes. The first check that
    fails prints a disconnect banner and ends the process via ``sys.exit()``.
    Returns ``None``.
    """
    login_content_keys = {'msg', 'code'}
    snapshot_fields = ('content', 'timestamp', 'command', 'service')

    login_record = raw_data[0]
    for entry in login_record['response']:
        # Same evaluation order as a chained `and`: command, content keys, service.
        entry_ok = (
            entry['command'] == 'LOGIN'
            and set(entry['content'].keys()) == login_content_keys
            and entry['service'] == 'ADMIN'
        )
        if not entry_ok:
            print('REASON: RESPONSE ******Disconnecting from exchange****** REASON: RESPONSE')
            sys.exit()
        print(True)

    snapshot_record = raw_data[1]
    for message in snapshot_record['snapshot']:
        for field in message.keys():
            if field not in snapshot_fields:
                print('REASON: NOTIFY ******Disconnecting from exchange****** REASON: NOTIFY')
                sys.exit()
            print(True)

    return None

In [2]:
# Input files: the previously downloaded history and the newer download.
old_data = '/home/melgazar9/Trading/Data/CL/CL_TD_historical_data/CL_5min/CL_5min_historical_data_2018-09-24-09:15:PM.json'
new_data = '/home/melgazar9/Trading/Data/CL/CL_TD_historical_data/CL_5min/CL_5min_historical_data_2018-11-12-10_44_AM.log'

# Decode both files (old first, then new), then validate them in the same order.
raw_data_old, raw_data_new = read_data(old_data), read_data(new_data)

verify_data_integrity(raw_data_old)
verify_data_integrity(raw_data_new)

THERE IS A NAN VALUE
THERE IS A NAN VALUE
True
True
True
True
True
True
True
True
True
True


In [3]:
def get_5min_df(raw_data):
    """Turn decoded snapshot records into a bar DataFrame indexed by time.

    The function walks ``record['snapshot'] -> message['content'] -> entry['3']``
    for every record that carries a ``snapshot`` list. As in the original
    loop, only the LAST candle list found is used.
    NOTE(review): if a file can hold several snapshots that should be
    combined, this needs to concatenate instead -- confirm against the feed.

    Candle keys ``'0'``..``'5'`` are renamed to ``Datetime``, ``5minOpen``,
    ``5minHigh``, ``5minLow``, ``5minClose`` and ``5minVolume``; ``Datetime``
    is read as epoch milliseconds. Four derived columns are added:
    ``5minRange`` (high - low), ``5minMove`` (close - open),
    ``5minLowMove`` (low - open) and ``5minHighMove`` (high - open).
    The column names are fixed regardless of the bar size of the input.

    Parameters
    ----------
    raw_data : list
        Decoded records, e.g. the output of ``read_data``.

    Returns
    -------
    pandas.DataFrame
        Bars sorted by a ``Datetime`` DatetimeIndex.

    Raises
    ------
    ValueError
        If no record contains snapshot candle data.
    """
    cols = {'Key':'/ES', '0':'Datetime','1':'5minOpen','2':'5minHigh','3':'5minLow','4':'5minClose','5':'5minVolume'}

    candles = None
    for record in raw_data:
        # Records without a snapshot list (e.g. the login response) are ignored.
        snapshot = record.get('snapshot') if isinstance(record, dict) else None
        if not isinstance(snapshot, (list, tuple)):
            continue
        for message in snapshot:
            for entry in message['content']:
                candles = entry['3']

    if candles is None:
        raise ValueError('no snapshot candle data found in raw_data')

    # Build the frame once, from the candle list that won.
    df = pd.DataFrame(candles).rename(columns=cols)
    df['5minRange'] = df['5minHigh'] - df['5minLow']
    df['5minMove'] = df['5minClose'] - df['5minOpen']
    df['5minLowMove'] = df['5minLow'] - df['5minOpen']
    df['5minHighMove'] = df['5minHigh'] - df['5minOpen']
    df = df.set_index('Datetime')
    df.index = pd.to_datetime(df.index, unit='ms')
    df = df.sort_index()

    return df



In [4]:
# Bar frame for the older file, kept in chronological order.
df_5min_old = get_5min_df(raw_data_old).sort_index()

print(df_5min_old.shape)
df_5min_old.head()

(40000, 9)


Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-03-02 13:55:00,60.92,60.93,60.84,60.85,1950.0,0.09,-0.07,-0.08,0.01
2018-03-02 14:00:00,60.84,60.86,60.75,60.79,4258.0,0.11,-0.05,-0.09,0.02
2018-03-02 14:05:00,60.79,61.02,60.77,60.91,6172.0,0.25,0.12,-0.02,0.23
2018-03-02 14:10:00,60.91,60.93,60.75,60.77,4235.0,0.18,-0.14,-0.16,0.02
2018-03-02 14:15:00,60.77,60.89,60.68,60.74,4368.0,0.21,-0.03,-0.09,0.12


In [5]:
# Bar frame for the newer file, kept in chronological order; show its latest bars.
df_5min_new = get_5min_df(raw_data_new).sort_index()
df_5min_new.tail(2)

Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-12 16:35:00,60.86,60.91,60.84,60.84,3864.0,0.07,-0.02,-0.02,0.05
2018-11-12 16:40:00,60.83,60.94,60.83,60.89,2595.0,0.11,0.06,0.0,0.11


In [6]:
# Stack the old and new bars, then keep exactly one row per timestamp.
# De-duplication is done on the index because DataFrame.drop_duplicates() only
# compares column values: it would drop distinct bars that happen to share the
# same values and keep two rows for one timestamp whose values differ.
# Where both files contain a timestamp, the row from the new file is kept.
df_5min = pd.concat([df_5min_old, df_5min_new], axis=0)
df_5min = df_5min[~df_5min.index.duplicated(keep='last')].sort_index()
print(df_5min.isnull().sum())
df_5min.head(2)

5minOpen        0
5minHigh        0
5minLow         0
5minClose       0
5minVolume      0
5minRange       0
5minMove        0
5minLowMove     0
5minHighMove    0
dtype: int64


Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-03-02 13:55:00,60.92,60.93,60.84,60.85,1950.0,0.09,-0.07,-0.08,0.01
2018-03-02 14:00:00,60.84,60.86,60.75,60.79,4258.0,0.11,-0.05,-0.09,0.02


In [7]:
# Spot-check the last two rows of the combined frame.
df_5min.tail(2)

Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-12 16:35:00,60.86,60.91,60.84,60.84,3864.0,0.07,-0.02,-0.02,0.05
2018-11-12 16:40:00,60.83,60.94,60.83,60.89,2595.0,0.11,0.06,0.0,0.11


In [8]:
# Display the combined frame for a visual check.
df_5min

Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-03-02 13:55:00,60.92,60.93,60.84,60.85,1950.0,0.09,-0.07,-0.08,0.01
2018-03-02 14:00:00,60.84,60.86,60.75,60.79,4258.0,0.11,-0.05,-0.09,0.02
2018-03-02 14:05:00,60.79,61.02,60.77,60.91,6172.0,0.25,0.12,-0.02,0.23
2018-03-02 14:10:00,60.91,60.93,60.75,60.77,4235.0,0.18,-0.14,-0.16,0.02
2018-03-02 14:15:00,60.77,60.89,60.68,60.74,4368.0,0.21,-0.03,-0.09,0.12
2018-03-02 14:20:00,60.74,60.88,60.74,60.86,2812.0,0.14,0.12,0.00,0.14
2018-03-02 14:25:00,60.85,60.91,60.69,60.77,4190.0,0.22,-0.08,-0.16,0.06
2018-03-02 14:30:00,60.76,60.78,60.62,60.67,7676.0,0.16,-0.09,-0.14,0.02
2018-03-02 14:35:00,60.66,60.77,60.52,60.58,10729.0,0.25,-0.08,-0.14,0.11
2018-03-02 14:40:00,60.57,60.60,60.49,60.49,8120.0,0.11,-0.08,-0.08,0.03


### Save df_5min

In [9]:
import re

# Take the date stamp from the file name only, so a digit-hyphen run in a
# directory name can never be picked up instead, and fail with a clear message
# when the file name carries no date.
date_match = re.search(r'(\d+-\d+-\d+)', os.path.basename(new_data))
if date_match is None:
    raise ValueError('no date stamp found in file name: ' + new_data)
new_data_date = date_match.group(0)
df_5min.to_csv('/home/melgazar9/Trading/Data/CL/CL_TD_historical_data_DFs/CL_5min/CL_5min_historical-data_' + new_data_date + '.csv')

# Use the code below if the old data is a DataFrame

In [10]:
import os
import time
import json
import pandas as pd
import numpy as np
import gc
import datetime
from pandas.io.json import json_normalize
import itertools
import pickle
import warnings
import subprocess
import sys

# os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
warnings.filterwarnings('ignore')

######################################
#       READ THE CORRECT FILE        #
######################################

# Previously saved 5-minute frame. read_csv is called without parse_dates, so the
# 'Datetime' index is not converted to datetimes in this cell.
old_df = pd.read_csv('/home/melgazar9/Trading/Data/CL/CL_TD_historical_data_DFs/CL_5min/CL_5min_historical-data_2018-11-12.csv').set_index('Datetime')


# NOTE(review): this directory and file name point at CL_1min data, while old_df
# is the CL_5min history -- confirm that combining the two bar sizes is intended.
new_data_path = '/home/melgazar9/Trading/Data/CL/CL_TD_historical_data/CL_1min/'
new_data_filename = 'CL_1min_historical_data_2018-11-30-09:44:PM.log'
# From here on new_data holds the decoded records returned by read_data, not a path.
new_data = read_data(new_data_path + new_data_filename)

verify_data_integrity(new_data)
old_df.head(2)

THERE IS A NAN VALUE
True
True
True
True
True


Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-03-02 13:55:00,60.92,60.93,60.84,60.85,1950.0,0.09,-0.07,-0.08,0.01
2018-03-02 14:00:00,60.84,60.86,60.75,60.79,4258.0,0.11,-0.05,-0.09,0.02


In [11]:
# Convert the newly loaded records to a bar frame and look at its first rows.
new_df = get_5min_df(new_data)
new_df.head(2)

Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-10-22 13:37:00,68.61,68.61,68.56,68.59,1352.0,0.05,-0.02,-0.05,0.0
2018-10-22 13:38:00,68.58,68.58,68.48,68.51,3867.0,0.1,-0.07,-0.1,0.0


In [12]:
# Spot-check the last two rows of the new frame.
new_df.tail(2)

Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-30 21:59:00,50.71,50.74,50.65,50.72,92.0,0.09,0.01,-0.06,0.03
2018-11-30 23:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Append the new bars to the saved history.
# old_df comes from a CSV, so its index is converted to datetimes before the
# concat; otherwise the combined index mixes strings with timestamps.
# Then keep one row per timestamp (the row from new_df wins where both frames
# contain it) and sort chronologically. A plain concat keeps every overlapping
# timestamp twice.
df_5min_updated = pd.concat([old_df.set_index(pd.to_datetime(old_df.index)), new_df], axis=0)
df_5min_updated = df_5min_updated[~df_5min_updated.index.duplicated(keep='last')].sort_index()
df_5min_updated.head(2)

Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-03-02 13:55:00,60.92,60.93,60.84,60.85,1950.0,0.09,-0.07,-0.08,0.01
2018-03-02 14:00:00,60.84,60.86,60.75,60.79,4258.0,0.11,-0.05,-0.09,0.02


In [14]:
# Spot-check the last two rows of the updated frame.
df_5min_updated.tail(2)

Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-11-30 21:59:00,50.71,50.74,50.65,50.72,92.0,0.09,0.01,-0.06,0.03
2018-11-30 23:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Display the updated frame for a visual check.
df_5min_updated

Unnamed: 0_level_0,5minOpen,5minHigh,5minLow,5minClose,5minVolume,5minRange,5minMove,5minLowMove,5minHighMove
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-03-02 13:55:00,60.92,60.93,60.84,60.85,1950.0,0.09,-0.07,-0.08,0.01
2018-03-02 14:00:00,60.84,60.86,60.75,60.79,4258.0,0.11,-0.05,-0.09,0.02
2018-03-02 14:05:00,60.79,61.02,60.77,60.91,6172.0,0.25,0.12,-0.02,0.23
2018-03-02 14:10:00,60.91,60.93,60.75,60.77,4235.0,0.18,-0.14,-0.16,0.02
2018-03-02 14:15:00,60.77,60.89,60.68,60.74,4368.0,0.21,-0.03,-0.09,0.12
2018-03-02 14:20:00,60.74,60.88,60.74,60.86,2812.0,0.14,0.12,0.00,0.14
2018-03-02 14:25:00,60.85,60.91,60.69,60.77,4190.0,0.22,-0.08,-0.16,0.06
2018-03-02 14:30:00,60.76,60.78,60.62,60.67,7676.0,0.16,-0.09,-0.14,0.02
2018-03-02 14:35:00,60.66,60.77,60.52,60.58,10729.0,0.25,-0.08,-0.14,0.11
2018-03-02 14:40:00,60.57,60.60,60.49,60.49,8120.0,0.11,-0.08,-0.08,0.03


In [16]:
# Make sure the index holds datetimes, then order the rows chronologically.
df_5min_updated.index = pd.to_datetime(df_5min_updated.index)
df_5min_updated = df_5min_updated.sort_index()

In [17]:
import re

# Pull the date stamp out of the new file's name; the single capture group
# spans the whole pattern, so group 1 is the full match.
new_data_date = re.search(r'(\d+-\d+-\d+)', new_data_filename).group(1)
new_data_date

'2018-11-30'

In [18]:
# Write the updated frame to a CSV whose name ends with the date taken from the new file's name.
df_5min_updated.to_csv('/home/melgazar9/Trading/Data/CL/CL_TD_historical_data_DFs/CL_5min/CL_5min_historical-data_' + new_data_date + '.csv')