In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import os

In [2]:
# Absolute path of the directory the notebook is run from; the data paths
# used further down are built by appending to this string.
working_dir = os.path.abspath(os.curdir)

In [3]:
def get_one_contract_data_by_date(date_str, contract_pd):
    """Extract one trading day's minute bars from a contract's full history.

    Trading hours:
        Sunday – Friday, 7:00 p.m. – 7:45 a.m. CT and
        Monday – Friday, 8:30 a.m. – 1:20 p.m. CT
    The previous calendar day's rows between 19:00 and 23:59 together with
    today's rows between 0:00 and 13:20 are treated as today's trading hours.

    Args:
        date_str: str, the trading date to extract, e.g. "2016-01-04"
        contract_pd: pandas.DataFrame indexed by timestamp (DatetimeIndex) that
            has a cumulative "TotalVolume" column. It is NOT modified.

    Returns:
        pandas.DataFrame with the selected rows (previous evening first, then
        today) plus a "Volume" column holding the row-over-row difference of
        "TotalVolume". The first returned row's "Volume" is set to its own
        "TotalVolume". The frame is empty when no rows fall inside the two
        time windows.

    Raises:
        KeyError: propagated from pandas when the index has no rows at all for
            date_str or for the previous calendar day.
    """
    year, month, day = map(int, date_str.split('-'))
    last_date_str = (datetime(year, month, day) - timedelta(days=1)).strftime("%Y-%m-%d")

    # Derive a new frame instead of writing the column into the caller's
    # DataFrame, so repeated calls do not mutate shared input.
    all_data = contract_pd.assign(Volume=contract_pd["TotalVolume"].diff())

    last_day_data = all_data.loc[last_date_str].between_time("19:00", "23:59")
    today_data = all_data.loc[date_str].between_time("0:00", "13:20")

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data = pd.concat([last_day_data, today_data], ignore_index=False)
    if data.empty:
        # Nothing inside the trading windows: there is no first row to patch.
        return data

    # The diff of the first selected row is taken against a row outside this
    # trading day, so replace it with the row's own cumulative volume. Columns
    # are looked up by name rather than by hard-coded position.
    volume_pos = data.columns.get_loc("Volume")
    total_volume_pos = data.columns.get_loc("TotalVolume")
    data.iloc[0, volume_pos] = data.iloc[0, total_volume_pos]
    return data


In [4]:
def extract_data_from_unique_contract(contract_path, unique_dates):
    """Load one contract's CSV and stack its per-day trading data.

    Args:
        contract_path: str, path of the contract CSV. The first row is used as
            the header and the first column is parsed as the datetime index.
        unique_dates: iterable of str, the dates ("YYYY-MM-DD") to extract,
            in the order they should appear in the result.

    Returns:
        pandas.DataFrame with the rows returned by
        get_one_contract_data_by_date for each date, stacked in order. The
        columns "Open", "High", "Low", "Close", "TotalVolume", "Volume" come
        first, followed by any other columns present in the CSV. An empty
        frame with those six columns is returned when unique_dates is empty.
    """
    base_columns = ["Open", "High", "Low", "Close", "TotalVolume", "Volume"]
    contract_pd = pd.read_csv(contract_path, header=0, index_col=0, parse_dates=[0])

    # Collect the per-day frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and appending inside a loop re-copies everything
    # accumulated so far on every iteration.
    daily_frames = [
        get_one_contract_data_by_date(single_date, contract_pd)
        for single_date in unique_dates
    ]
    if not daily_frames:
        return pd.DataFrame(columns=base_columns)

    data = pd.concat(daily_frames, ignore_index=False)
    # Keep the documented column order first, then any extra CSV columns.
    extra_columns = [col for col in data.columns if col not in base_columns]
    return data.reindex(columns=base_columns + extra_columns)

In [5]:
def get_all_main_contract_data(main_contracts_path = working_dir+"/data/ZC/front.csv", contracts_root_path = working_dir+"/data/ZC/"):
    """Combine the daily main contract data into one DataFrame.

    Args:
        main_contracts_path: str, path of a header-less CSV whose two columns
            are read as (date_str, main_contract), i.e. the main contract code
            for each date.
        contracts_root_path: str, directory holding the per-contract files
            named "ZC<code>.csv". A trailing separator is optional.

    Returns:
        pandas.DataFrame whose leading columns are
        ["Open", "High", "Low", "Close", "TotalVolume", "Volume"]. Rows are
        grouped by contract in order of first appearance in the main-contract
        file, and within a contract follow that contract's dates in file order.
        An empty frame with those columns is returned when the main-contract
        file lists no contracts.

        NOTE: no "Change" column (main-contract roll-over marker) is produced
        by this function, although earlier documentation mentioned one.
    """
    base_columns = ["Open", "High", "Low", "Close", "TotalVolume", "Volume"]

    main_contracts = pd.read_csv(main_contracts_path, header=None, names=['date_str', 'main_contract'])

    # Collect one frame per contract and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and is quadratic when used inside a loop.
    contract_frames = []
    for single_contract in main_contracts.main_contract.unique():
        # os.path.join gives the same path as plain concatenation when the
        # root ends with a separator, and a correct one when it does not.
        contract_path = os.path.join(contracts_root_path, "ZC" + single_contract + ".csv")
        unique_dates = main_contracts.loc[main_contracts["main_contract"] == single_contract].date_str.unique()
        contract_frames.append(extract_data_from_unique_contract(contract_path, unique_dates))

    if not contract_frames:
        return pd.DataFrame(columns=base_columns)

    data = pd.concat(contract_frames, ignore_index=False)
    # Keep the documented column order first, then any extra CSV columns.
    extra_columns = [col for col in data.columns if col not in base_columns]
    return data.reindex(columns=base_columns + extra_columns)

In [6]:
# Build the combined main-contract frame using the default paths under
# working_dir and bind it to `all_data`; the %time line magic prints the CPU
# and wall time the call took (shown in the cell output).
%time all_data = get_all_main_contract_data()

CPU times: user 8.46 s, sys: 1.06 s, total: 9.51 s
Wall time: 9.52 s


In [7]:
# Write the combined frame (index included) to data/combined_data.csv under
# the notebook's working directory.
all_data.to_csv(path_or_buf=working_dir+"/data/combined_data.csv")

In [8]:
# Number of rows in the combined frame.
all_data.shape[0]

535536