## Модуль работы с данными: загрузка, предоставление следующего бара

In [1]:
import pandas as pd
import numpy as np
import logging
import os, os.path
import datetime
import pytz

from abc import ABC, abstractmethod
from pytz import timezone

from event import MarketEvent

In [2]:
# Configure the root logger: emit everything from DEBUG upwards; each record
# carries a timestamp, the level, the source line number and the name of the
# function that logged it (right-aligned to 30 characters).
logging.basicConfig(level=logging.DEBUG,
                    format='[%(asctime)s %(levelname)-8s line %(lineno)s] %(funcName)30s: %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S')

In [3]:
# Timeframe labels: pandas offset aliases that are passed to the data handler
# as its system timeframe and used there as the resampling rule.
# NOTE(review): recent pandas releases reportedly deprecate the 'T', 'H' and
# 'M' spellings - confirm against the pandas version in use.
min5 = '5T'
min15 = '15T'
hour = 'H' 
day = 'D'
month = 'M'

In [4]:
from queue import Queue, Empty
# Shared event queue; when handed to a data handler, update_bars() puts a
# MarketEvent on it for every step of the replay.
events = Queue()

In [5]:
class DataHandler(ABC):
    """
    Abstract interface shared by every data handler, live or historic.

    A concrete handler produces bars for each requested symbol and has
    to implement all three methods declared here; the class itself
    cannot be instantiated.
    """

    @abstractmethod
    def get_all_bars(self, symbol):
        """
        Return the complete dataframe of bars for the symbol.

        Parameters:
        symbol - Ticker name.
        """
        raise NotImplementedError("Should implement get_all_bars()")

    @abstractmethod
    def get_latest_bars(self, symbol, N=1):
        """
        Return the most recent N bars of the symbol together with the
        number of bars actually returned (fewer than N when not enough
        bars are available yet).

        Parameters:
        symbol - Ticker name.
        N - Number of bars requested, 1 by default.
        Return: (df, n)
        """
        raise NotImplementedError("Should implement get_latest_bars()")

    @abstractmethod
    def update_bars(self):
        """
        Advance the latest_idx structure for every symbol in the
        symbol list.
        """
        raise NotImplementedError("Should implement update_bars()")

In [34]:
class CSVDataHandler(DataHandler):
    """
    CSVDataHandler reads CSV files for each requested symbol from disk,
    resamples them to the system timeframe and provides the DataHandler
    interface on top of the resulting DataFrames.

    Bars are replayed in chronological order: every update_bars() call
    moves the system datetime to the earliest not-yet-seen bar among all
    symbols and advances the index of every symbol that has a bar at
    that datetime.
    """

    def __init__(self, csv_dir, system_tf, symbol_dict, events = None):
        """
        Initialises the data handler by requesting the location
        of the CSV files and a dictionary of symbols, then loads
        all the data.

        It will be assumed that all files are in the form 'symbol...csv',
        where symbol is from the dictionary symbol -> parameters.

        Parameters:
        csv_dir - Directory path to the CSV files ("data" folder).
        system_tf - Timeframe for system "heartbeat" and resampling.
        symbol_dict - A dictionary of symbols with parameters. Per symbol:
            'src'  - data source, 'av' or 'finam' (required);
            'tz'   - time zone name, read for every timeframe
                     except 'D' and 'M';
            'exn'  - optional; if true, load exactly the file 'file';
            'file' - file name inside csv_dir, required when 'exn' is true.
            The handler adds 'len' (number of bars) to each entry.
        events - The Event Queue. If None (as by default), update_bars()
            does not emit any MarketEvent.
        """
        self.csv_dir = csv_dir
        self.system_tf = system_tf
        self.symbol_dict = symbol_dict
        self.events = events

        self.symbol_data = {}
        # symbol -> position of the next bar to be released;
        # None once the symbol has run out of data.
        self.latest_idx = {}
        self.continue_backtest = True
        # Datetime of the bar released by the last update_bars() call.
        self.system_dt = None

        self._open_convert_csv_files()

    def _resample_symbol_data(self, df):
        """
        Resample dataframe according to the selected timeframe.
        Bins that contain no source rows are dropped, so asking for a
        timeframe finer than the data keeps the bars as they are.

        Parameters:
        df - Dataframe with open/high/low/close/volume columns.
        Return: new Dataframe, float columns downcast where possible.
        """
        def bins(column):
            # Every bar is labelled with, and includes, its right edge.
            return df[column].resample(self.system_tf, label='right', closed='right')

        df_new = pd.DataFrame({
            'open': bins('open').first().dropna(),
            'high': bins('high').max().dropna(),
            'low': bins('low').min().dropna(),
            'close': bins('close').last().dropna(),
            # min_count=1 turns empty bins into NaN (instead of 0) so
            # that they are dropped like for the price columns.
            'volume': bins('volume').sum(min_count=1).dropna().astype(float),
        })
        return df_new.apply(pd.to_numeric, downcast='float')

    def _read_csv(self, file, joint = None):
        """
        Read one CSV file, skipping its first row and naming the columns.

        Parameters:
        file - Path to the CSV file.
        joint - If true, date and time share one 'datetime' column;
            otherwise separate 'date' and 'time' columns are expected.
        Return: Dataframe
        """
        if joint:
            names = ['datetime', 'open', 'high', 'low', 'close', 'volume']
        else:
            names = ['date', 'time', 'open', 'high', 'low', 'close', 'volume']
        return pd.read_csv(file, header=None, skiprows=1, names=names)

    def _load_csv(self, symbol, joint, exact = False):
        """
        Load raw CSV data for the symbol.

        Parameters:
        symbol - Ticker name.
        joint - Column layout flag, see _read_csv().
        exact - If true, read exactly symbol_dict[symbol]['file'];
            otherwise combine every file under csv_dir whose name starts
            with the symbol (notebook checkpoints are skipped).
        Return: Dataframe
        Raises: KeyError if exact is true and no file name is given;
            ValueError if no matching file is found.
        """
        if exact:
            try:
                file_name = self.symbol_dict[symbol]['file']
            except KeyError:
                logging.error("No file name is given")
                raise
            return self._read_csv(os.path.join(self.csv_dir, file_name), joint=joint)

        # combine all files like 'symbol...csv'
        frames = []
        for root, dirs, files in os.walk(self.csv_dir):
            dirs.sort()  # walk sub-directories in a deterministic order
            for file in sorted(files):
                if file.startswith(symbol) and 'checkpoint' not in file:
                    frames.append(self._read_csv(os.path.join(root, file), joint=joint))
        if not frames:
            raise ValueError("No CSV files starting with '{}' found in '{}'".format(
                symbol, self.csv_dir))
        return pd.concat(frames)

    def _load_symbol_csv(self, symbol, joint):
        """
        Load raw CSV data honouring the optional 'exn' flag of the symbol.

        Only the lookup of 'exn' is guarded: a missing 'file' for an
        exact load is a configuration error and is allowed to propagate.
        """
        try:
            exact = self.symbol_dict[symbol]['exn']
        except KeyError:
            logging.debug("No exact name is given")
            exact = False
        return self._load_csv(symbol, joint, exact)

    def _convert_av_files(self, symbol):
        """
        Import files, downloaded from Alpha Vantage.

        Parameters:
        symbol - Ticker name.
        Return: Dataframe indexed by datetime, row order reversed.
        """
        df = self._load_symbol_csv(symbol, True)

        df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')
        df = df.set_index('datetime')
        # Reverse the row order; positional slicing also works when
        # timestamps repeat, which reindex() would reject.
        return df.iloc[::-1]

    def _convert_finam_files(self, symbol):
        """
        Import files, downloaded from Finam.

        Parameters:
        symbol - Ticker name.
        Return: Dataframe indexed by datetime, without date/time columns.
        """
        df = self._load_symbol_csv(symbol, False)

        # set date + time as index
        try:
            # The time column may have been parsed as an integer and lost
            # its leading zeros (e.g. 500 for 00:05:00): pad to HHMMSS.
            df['datetime'] = df['date'].astype(str) + df['time'].astype(str).str.zfill(6)
            df['datetime'] = pd.to_datetime(df['datetime'], format='%Y%m%d%H%M%S')
        except ValueError:
            logging.debug("No time is given")
            df['datetime'] = pd.to_datetime(df['date'], format='%Y%m%d')
        df = df.set_index('datetime')
        # drop columns
        return df.drop(['date', 'time'], axis=1)

    def _open_convert_csv_files(self):
        """
        Opens the CSV files from the data directory, converting
        them into pandas DataFrames under a symbol dictionary.

        Raises: ValueError for an unknown 'src' value.
        """
        for s, params in self.symbol_dict.items():
            src = params['src']
            # Compare by value: identity ('is') on strings only works by
            # accident of interning.
            if src == 'av':
                df = self._convert_av_files(s)
            elif src == 'finam':
                df = self._convert_finam_files(s)
            #elif ... another source
            else:
                raise ValueError("Unknown data source '{}' for symbol '{}'".format(src, s))

            df = self._resample_symbol_data(df)
            # Daily and monthly bars are kept timezone-naive.
            if self.system_tf not in ['D', 'M']:
                df = df.tz_localize(timezone(params['tz']))

            self.symbol_data[s] = df
            self.latest_idx[s] = 0
            params['len'] = len(df.index)

    def get_all_bars(self, symbol):
        """
        Returns the whole resampled dataframe for the symbol.
        """
        return self.symbol_data[symbol]

    def _next_datetime(self):
        """
        Return next minimum datetime from the data, or None when
        every symbol has run out of bars.
        """
        dt = None
        for s in self.symbol_dict:
            idx = self.latest_idx[s]
            if idx is None:
                continue
            idx_dt = self.symbol_data[s].index[idx]
            if dt is None or idx_dt < dt:
                dt = idx_dt
        return dt

    def get_latest_bars(self, symbol, N=1):
        """
        Returns the last N bars for the symbol or N-k if less available.

        Parameters:
        symbol - Ticker name.
        N - number of bars to return, 1 by default.
        Return: (df, n) - the bars and their count; None if the symbol
            is not in the data set (an error is logged).
        """
        try:
            bars_df = self.symbol_data[symbol]
        except KeyError:
            logging.error("That symbol is not available in the data set.")
            return None

        idx = self.latest_idx[symbol]
        if idx is None:
            # Data exhausted: every bar has already been released.
            idx = len(bars_df)
        # Clamp at 0: a negative start would count from the end of the
        # frame and yield an empty or wrong slice.
        start = max(idx - N, 0)
        df = bars_df.iloc[start:idx]
        return df, df.shape[0]

    def update_bars(self):
        """
        Increase bar index according to the new system_dt.
        Check whether is still enough data.

        Puts a MarketEvent with the list of updated symbols and the new
        system_dt on the event queue, if a queue was given, and sets
        continue_backtest to False once no symbol has data left.
        """
        # Retire symbols whose every bar has already been released.
        for s in self.symbol_dict:
            if self.latest_idx[s] == self.symbol_dict[s]['len']:
                self.latest_idx[s] = None

        self.system_dt = self._next_datetime()

        cont = False
        new_data = []
        for s in self.symbol_dict:
            if self.latest_idx[s] is not None:
                cont = True
                if self.system_dt in self.symbol_data[s].index:
                    self.latest_idx[s] += 1
                    new_data.append(s)

        if self.events is not None:
            self.events.put(MarketEvent(new_data, self.system_dt))
        self.continue_backtest = cont

In [35]:
# Daily bars for UVXY from one exact Alpha Vantage file in the 'data' folder.
# No event queue is passed, so update_bars() would not emit any MarketEvent.
bars = CSVDataHandler('data', day,
                      {'UVXY':
                       {'src': 'av', 'tz': 'US/Eastern', 'exn':True, 'file':'UVXY_daily.csv'}
                      })

In [36]:
# Fetch the full resampled UVXY frame from the handler and preview its first rows.
df = bars.get_all_bars('UVXY')
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-10-04,40.450001,40.799999,34.299999,34.299999,11420.0
2011-10-05,32.91,33.150002,30.23,30.23,3400.0
2011-10-06,30.190001,31.299999,29.0,29.0,34458.0
2011-10-07,28.43,30.959999,28.08,29.42,13601.0
2011-10-10,27.58,27.6,25.99,25.99,28700.0


In [37]:
# UVXY from the exact file 'UVXY.csv', resampled to hourly bars; the shared
# event queue is attached this time.
bars = CSVDataHandler('data', hour,
                      {'UVXY':
                       {'src': 'av', 'tz': 'US/Eastern', 'exn':True, 'file':'UVXY.csv'}
                      }, events)

In [38]:
# Preview the hourly UVXY bars; for this timeframe the index is localized to
# the symbol's 'tz'.
df = bars.get_all_bars('UVXY')
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-11-08 10:00:00-05:00,18.570299,18.790001,18.207399,18.719999,2626038.0
2019-11-08 11:00:00-05:00,18.700701,18.806299,18.26,18.389999,3566914.0
2019-11-08 12:00:00-05:00,18.389999,18.42,18.16,18.18,1866990.0
2019-11-08 13:00:00-05:00,18.184999,18.184999,18.0,18.040001,898833.0
2019-11-08 14:00:00-05:00,18.040001,18.1,17.99,18.08,756785.0


In [39]:
# Ask for a 5-minute timeframe on the daily file: the requested timeframe is
# finer than the data.
bars = CSVDataHandler('data', min5,
                      {'UVXY':
                       {'src': 'av', 'tz': 'US/Eastern', 'exn':True, 'file':'UVXY_daily.csv'}
                      }, events)

In [40]:
# Preview the result: the rows are still one per day, only the index became
# timezone-aware.
df = bars.get_all_bars('UVXY')
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-10-04 00:00:00-04:00,40.450001,40.799999,34.299999,34.299999,11420.0
2011-10-05 00:00:00-04:00,32.91,33.150002,30.23,30.23,3400.0
2011-10-06 00:00:00-04:00,30.190001,31.299999,29.0,29.0,34458.0
2011-10-07 00:00:00-04:00,28.43,30.959999,28.08,29.42,13601.0
2011-10-10 00:00:00-04:00,27.58,27.6,25.99,25.99,28700.0


In [41]:
# Daily IMOEX bars from Finam data. Neither 'exn' nor 'file' is given, so every
# file under 'data' whose name starts with 'IMOEX' is combined.
bars = CSVDataHandler('data', day,
                      {'IMOEX':
                       {'src': 'finam', 'tz': 'Europe/Moscow'}
                      })

[Sun, 24 Nov 2019 07:20:57 DEBUG    line 112]           _convert_finam_files: No exact name is given
[Sun, 24 Nov 2019 07:20:57 DEBUG    line 120]           _convert_finam_files: No time is given


In [42]:
# Preview the first daily IMOEX bars.
df = bars.get_all_bars('IMOEX')
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-05,173.0,173.0,173.0,173.0,0.0
2000-01-06,186.259995,186.259995,186.259995,186.259995,0.0
2000-01-10,200.809998,200.809998,200.809998,200.809998,0.0
2000-01-11,199.570007,199.570007,199.570007,199.570007,0.0
2000-01-12,196.880005,196.880005,196.880005,196.880005,0.0


In [43]:
# 5-minute SPFB bars from Finam data; 'exn' is False, so all files under 'data'
# whose name starts with 'SPFB' are combined.
bars = CSVDataHandler('data', min5,
                      {'SPFB':
                       {'src': 'finam', 'tz': 'Europe/Moscow', 'exn': False, }
                      })

In [44]:
# Preview the first 5-minute SPFB bars.
df = bars.get_all_bars('SPFB')
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-07-16 18:10:00+04:00,63.73,63.73,63.73,63.73,100.0
2009-07-16 18:45:00+04:00,64.089996,64.139999,64.089996,64.139999,4.0
2009-07-16 18:50:00+04:00,64.07,64.110001,64.07,64.110001,3.0
2009-07-16 18:55:00+04:00,64.190002,64.190002,64.190002,64.190002,2.0
2009-07-16 19:00:00+04:00,64.25,64.25,64.25,64.25,1.0


In [32]:
# Report column dtypes and the deep memory footprint of the frame.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 390596 entries, 2009-07-16 18:10:00+04:00 to 2019-08-30 23:50:00+03:00
Data columns (total 5 columns):
open      390596 non-null float32
high      390596 non-null float32
low       390596 non-null float32
close     390596 non-null float32
volume    390596 non-null float32
dtypes: float32(5)
memory usage: 10.4 MB


In [42]:
# Work on a copy in which only the 'volume' column is cast to float32.
dff = df.astype({'volume': 'float32'}, copy = True)

In [43]:
# Report dtypes and the deep memory footprint of the copy.
dff.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 67990 entries, 2018-01-03 10:05:00+03:00 to 2019-08-30 23:50:00+03:00
Data columns (total 5 columns):
open      67990 non-null float64
high      67990 non-null float64
low       67990 non-null float64
close     67990 non-null float64
volume    67990 non-null float32
dtypes: float32(1), float64(4)
memory usage: 2.9 MB


In [44]:
def mem_usage(pandas_obj):
    """
    Return the deep memory footprint of a pandas object as a string
    such as '1.82 MB'.

    Parameters:
    pandas_obj - A DataFrame; anything else is assumed to be a Series.
    """
    footprint = pandas_obj.memory_usage(deep=True)
    # A DataFrame reports one value per column (plus the index) that has to
    # be summed up; a Series already reports a single number of bytes.
    total_bytes = footprint.sum() if isinstance(pandas_obj, pd.DataFrame) else footprint
    # convert bytes to megabytes
    return "{:03.2f} MB".format(total_bytes / 1024 ** 2)

In [45]:
# Downcast every column to the smallest float type that can hold it.
converted_float = dff.apply(pd.to_numeric,downcast='float')

# Memory footprint before and after the downcast.
print(mem_usage(dff))
print(mem_usage(converted_float))

# Side-by-side count of column dtypes before and after the downcast.
compare_floats = pd.concat([dff.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

2.85 MB
1.82 MB


Unnamed: 0,before,after
float32,1,5.0
float64,4,


In [45]:
# Replay the whole data set: keep asking the handler for the next bar until
# it reports that no symbol has data left. The two log records bracket the
# run so that its duration can be read from their timestamps.
logging.info("Start!")
while bars.continue_backtest:
    bars.update_bars()
logging.info("Finish!")

[Sun, 24 Nov 2019 07:21:38 INFO     line 2]                       <module>: Start!
[Sun, 24 Nov 2019 07:21:47 INFO     line 9]                       <module>: Finish!
