In [2]:
# """
# Here are several functions that I wrote for the future upscaling of the project.
# In the end they were not needed for the Final Project, but they will be afterwards:
# for example, functions for fetching updated data, or for getting data for different
# equities not yet supported in the project due to time constraints.
# """

In [None]:
def get_data():
    """Fetch one batch of candlesticks for the first USDT-quoted trading pair.

    Queries the Binance ``exchangeInfo`` endpoint, keeps the symbols whose
    ``quoteAsset`` is ``'USDT'`` and passes the first of them to ``get_batch``
    using that function's default interval and start time.

    Returns:
        pandas.DataFrame: the dataframe produced by ``get_batch`` for that
        symbol, or an empty dataframe when no USDT-quoted symbol is listed.
    """
    API_BASE = 'https://api.binance.com/api/v3/'

    # a timeout keeps a stalled connection from blocking the notebook forever
    results = requests.get(f'{API_BASE}exchangeInfo', timeout=30).json()
    all_symbols = pd.DataFrame(results['symbols'])

    # all traded USDT pairs
    usdt_symbols = all_symbols[all_symbols['quoteAsset'] == 'USDT']
    if usdt_symbols.empty:
        # nothing to fetch; mirror get_batch's "empty dataframe on failure" convention
        return pd.DataFrame([])

    # only the first USDT pair is fetched for now
    # NOTE(review): the original note expected this to be bitcoin (BTCUSDT);
    # that relies on the ordering of the exchange's response -- confirm.
    first_pair = usdt_symbols.iloc[0]
    return get_batch(symbol=first_pair['baseAsset'] + first_pair['quoteAsset'])

In [None]:
def get_batch(symbol, interval='1m', start_time=0, limit=1000, timeout=30,
              api_base='https://api.binance.com/api/v3/'):
    """Use a GET request to retrieve a batch of candlesticks. Process the JSON into a pandas
    dataframe and return it. If not successful, return an empty dataframe.

    Args:
        symbol: trading pair symbol sent as the ``symbol`` query parameter.
        interval: candlestick interval sent as ``interval``.
        start_time: value sent as ``startTime``.
        limit: value sent as ``limit``.
        timeout: seconds passed to ``requests.get`` as its ``timeout``.
        api_base: base URL the ``klines`` path is appended to.

    Returns:
        pandas.DataFrame: one row per candlestick with the columns listed in
        ``LABELS`` when the response status is 200; otherwise an empty
        dataframe (the erroneous response is printed).

    Connection errors, timeouts and connection resets are retried
    indefinitely, sleeping five minutes between attempts.
    """

    LABELS = [
        'open_time',
        'open',
        'high',
        'low',
        'close',
        'volume',
        'close_time',
        'quote_asset_volume',
        'number_of_trades',
        'taker_buy_base_asset_volume',
        'taker_buy_quote_asset_volume',
        'ignore'
    ]
    RETRY_DELAY_SECONDS = 5 * 60

    params = {
        'symbol': symbol,
        'interval': interval,
        'startTime': start_time,
        'limit': limit
    }

    # retry in a loop instead of recursing, so a long outage cannot pile up
    # stack frames until the recursion limit is hit
    while True:
        try:
            response = requests.get(f'{api_base}klines', params, timeout=timeout)
            break
        except requests.exceptions.ConnectionError:
            print('Connection error, sleep for 5 mins...')
        except requests.exceptions.Timeout:
            print('Timeout, sleep for 5 min...')
        except ConnectionResetError:
            # this is the built-in exception: requests.exceptions defines no
            # ConnectionResetError, so looking it up there raised AttributeError
            print('Connection reset by peer, sleep for 5 min...')
        time.sleep(RETRY_DELAY_SECONDS)

    if response.status_code == 200:
        return pd.DataFrame(response.json(), columns=LABELS)
    print(f'Got erroneous response back: {response}')
    return pd.DataFrame([])


In [None]:
def all_candles_to_csv(base, quote, interval='1h'):
    """Collect a list of candlestick batches with all candlesticks of a trading pair,
    concat into a dataframe and write it to CSV.

    Data already saved in ``data/{base}-{quote}-{interval}.csv`` is loaded first
    and fetching resumes after its newest ``open_time``. Batches are requested
    through ``get_batch`` until an empty batch comes back, no newer candle is
    returned, or the newest candle is dated today or later.

    Args:
        base: base asset of the pair; ``base + quote`` is the requested symbol.
        quote: quote asset of the pair.
        interval: candlestick interval passed on to ``get_batch``.

    Returns:
        int: number of rows added to the CSV, or 0 when nothing new was fetched
        (in which case the file is left untouched).
    """
    import os  # local import: only needed to create the output directory

    csv_path = f'data/{base}-{quote}-{interval}.csv'
    time_columns = ('open_time', 'close_time')
    epoch = pd.Timestamp('1970-01-01')
    one_ms = pd.Timedelta(milliseconds=1)

    # see if there is any data saved on disk already
    try:
        stored = pd.read_csv(csv_path)
    except FileNotFoundError:
        stored = None

    if stored is not None:
        # The file is written with datetime-formatted time columns (see the end
        # of this function), so they come back as strings. Convert them back to
        # epoch milliseconds; numeric columns are assumed to already be in ms.
        for column in time_columns:
            if column in stored.columns and not pd.api.types.is_numeric_dtype(stored[column]):
                stored[column] = (pd.to_datetime(stored[column]) - epoch) // one_ms

    has_stored_rows = stored is not None and not stored.empty
    old_lines = len(stored.index) if stored is not None else 0
    last_timestamp = int(stored['open_time'].max()) if has_stored_rows else 0

    # gather all candlesticks available, starting from the last timestamp loaded from disk or 0
    # stop if the timestamp that comes back from the api is the same as the last one
    new_batches = []
    previous_timestamp = None

    while previous_timestamp != last_timestamp:
        # stop if we reached data from today
        if date.fromtimestamp(last_timestamp / 1000) >= date.today():
            break

        previous_timestamp = last_timestamp

        new_batch = get_batch(
            symbol=base+quote,
            interval=interval,
            start_time=last_timestamp+1
        )

        # requesting candles from the future returns empty
        # also stop in case response code was not 200
        if new_batch.empty:
            break

        last_timestamp = int(new_batch['open_time'].max())

        # sometimes no new trades took place yet on date.today();
        # in this case the batch is nothing new
        if previous_timestamp == last_timestamp:
            break

        new_batches.append(new_batch)
        last_datetime = datetime.fromtimestamp(last_timestamp / 1000)

        # trailing spaces overwrite leftovers of a longer previous progress line
        covering_spaces = 20 * ' '
        print(datetime.now(), base, quote, interval, str(last_datetime)+covering_spaces, end='\r', flush=True)

    # nothing new was gathered: leave the file on disk as it is
    if not new_batches:
        return 0

    frames = ([stored] if has_stored_rows else []) + new_batches
    df = pd.concat(frames, ignore_index=True)
    for column in time_columns:
        df[column] = pd.to_datetime(df[column], unit='ms')

    # make sure the target directory exists so the download is not lost on the first run
    os.makedirs('data', exist_ok=True)
    df.to_csv(csv_path, index=False)
    return len(df.index) - old_lines

In [None]:
def main():
    """Download candlesticks for the first USDT-quoted pair at several intervals.

    Looks up the symbols currently listed by the exchange, takes the first one
    quoted in USDT and calls ``all_candles_to_csv`` for it once per interval.
    Only that single pair is processed, and nothing is uploaded anywhere.

    Returns:
        dict: interval -> value returned by ``all_candles_to_csv`` for that
        interval; empty when the exchange lists no USDT-quoted pair.
    """
    # defined locally (same value get_data uses) so this function does not
    # depend on a module-level API_BASE being present
    api_base = 'https://api.binance.com/api/v3/'

    # get all pairs currently available
    exchange_info = requests.get(f'{api_base}exchangeInfo', timeout=30).json()
    all_symbols = pd.DataFrame(exchange_info['symbols'])

    usdt_symbols = all_symbols[all_symbols['quoteAsset'] == 'USDT']
    if usdt_symbols.empty:
        return {}

    # NOTE(review): the original code called this pair "btcusdt"; it is simply the
    # first USDT row of the exchange's response -- confirm the ordering is reliable.
    first_pair = usdt_symbols.iloc[0]
    base, quote = first_pair['baseAsset'], first_pair['quoteAsset']

    intervals = ['1m', '5m', '15m', '30m', '1h', '4h', '1d']
    return {
        interval: all_candles_to_csv(base=base, quote=quote, interval=interval)
        for interval in intervals
    }
