# Binance candlestick data download

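This notebook downloads the zipped CSV files provided by Binance into the `temp/` folder and combines them into a single CSV file. It also writes a second CSV for the reversed pair (inverted prices, swapped base/quote volumes).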

* Binance Data Listing: https://data.binance.vision/
* Documentation: https://github.com/binance/binance-public-data/
* Binance Data S3 Bucket: https://s3-ap-northeast-1.amazonaws.com/data.binance.vision?delimiter=/&prefix=data/

Edit the `config` dictionary in the first code cell, then run all cells.

In [1]:
# Download settings -- edit these, then run all cells.
#   symbol  : pair name, used as a path segment of the Binance listing prefix
#   dfreq   : candle interval, used as the next path segment (e.g. '15m')
#   pair_id : the pair_id foreign key in the database, leave as 0 if not needed
config = dict(
    symbol='ETHBTC',
    dfreq='15m',
    pair_id=0,
)

# Alternative presets -- uncomment one to use it instead of the settings above:
# config = dict(symbol='BTCUSDT', dfreq='15m', pair_id=1)
# config = dict(symbol='ETHUSDT', dfreq='15m', pair_id=2)

In [2]:
import requests
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd
import os
from zipfile import ZipFile
import io

In [3]:
def download_files(config):
    """Download all kline zip archives for the configured symbol and interval.

    The S3 bucket listing is queried for the ``daily`` and then the ``monthly``
    prefixes, every key ending in ``.zip`` is collected, and each archive that
    is not already present under ``temp/`` is fetched from data.binance.vision.

    Args:
        config: Mapping with the keys ``'symbol'`` and ``'dfreq'`` (used to
            build the listing prefix) and ``'pair_id'`` (used as a prefix of
            the local file names).

    Returns:
        List of local file paths, one per listed archive, daily archives
        first and monthly archives after them. Paths of archives that were
        already cached are included as well.

    Raises:
        requests.HTTPError: If a listing or download request returns an
            error status. Nothing is written for the failed download.
    """
    ns = '{http://s3.amazonaws.com/doc/2006-03-01/}'
    timeout = 60  # seconds; without it a stalled connection blocks forever
    file_list = []
    for ufreq in ['daily', 'monthly']:
        # aggTrades instead of klines is OK too, but need to update the columns next cell
        url = f"https://s3-ap-northeast-1.amazonaws.com/data.binance.vision?delimiter=/&prefix=data/spot/{ufreq}/klines/{config['symbol']}/{config['dfreq']}/"
        marker = None
        while True:
            # A listing response is capped in size; keep requesting pages
            # until the bucket reports that the listing is complete.
            req = requests.get(url, params={'marker': marker} if marker else None, timeout=timeout)
            req.raise_for_status()
            root = ET.fromstring(req.content)
            keys = [item.text for item in root.findall(f"{ns}Contents/{ns}Key") if item.text]
            file_list += [key for key in keys if key.endswith('.zip')]
            if root.findtext(f"{ns}IsTruncated") != 'true' or not keys:
                break
            # NextMarker is present when a delimiter is used; otherwise the
            # last key of the current page is the marker for the next one.
            marker = root.findtext(f"{ns}NextMarker") or keys[-1]

    # The cache folder does not exist on a fresh checkout.
    os.makedirs('temp', exist_ok=True)

    ret_list = []
    for f in tqdm(file_list):
        f_ = 'temp/' + str(config['pair_id']) + "_" + f.replace("/", "_")
        ret_list.append(f_)
        if not os.path.exists(f_):
            furl = f'https://data.binance.vision/{f}'
            r = requests.get(furl, allow_redirects=True, timeout=timeout)
            # Never cache an error page under a .zip name.
            r.raise_for_status()
            # Write under a temporary name and rename afterwards, so that an
            # interrupted write is not mistaken for a cached archive later.
            partial = f_ + '.part'
            with open(partial, 'wb') as fh:
                fh.write(r.content)
            os.replace(partial, f_)

    return ret_list

In [4]:
# Column names applied to the header-less kline CSVs. Whatever value a file
# supplies for the last named column is replaced by the configured pair_id below.
columns = ['open_time','open','high','low','close','volume','close_time','quote_asset_volume',
         'number_of_trades','taker_buy_base_asset_volume','taker_buy_quote_asset_volume','pair_id'] # for klines

# Name of the combined output file, built once and reused for writing and reporting.
csv_path = f"{config['pair_id']}_klines_{config['symbol']}_{config['dfreq']}.csv"

dfs = []
for f in download_files(config):
    # Only the first member of each archive is read; the context manager
    # closes the archive again right away.
    with ZipFile(f) as input_zip:
        data = input_zip.read(input_zip.namelist()[0])
    part = pd.read_csv(io.BytesIO(data), names=columns)
    part['pair_id'] = config['pair_id']
    # NOTE(review): timestamps are parsed as epoch milliseconds -- confirm this
    # still holds for every file in the listing.
    part['open_time'] = pd.to_datetime(part['open_time'], unit='ms')
    part['close_time'] = pd.to_datetime(part['close_time'], unit='ms')
    dfs.append(part.set_index('open_time'))

if not dfs:
    raise ValueError(
        f"No kline archives found for symbol {config['symbol']!r} and interval {config['dfreq']!r}"
    )

df = pd.concat(dfs)
# download_files lists daily archives before monthly ones, so keep='first'
# prefers the daily row when both contain the same open_time.
df = df[~df.index.duplicated(keep='first')].sort_index()
# Only the index (open_time) becomes timezone-aware; close_time stays naive.
df = df.tz_localize('UTC')

df.to_csv(csv_path)

print(f"CSV file generated: {csv_path}")
print(f"Number of rows: {len(df)}")

100%|██████████| 194/194 [00:00<00:00, 58839.76it/s]


CSV file generated: 0_klines_ETHBTC_15m.csv
Number of rows: 140805


In [5]:
# Reverse version
# Derive the inverted pair from the combined frame:
#   * every price becomes its reciprocal; the reciprocal of the old high is
#     the new low and the reciprocal of the old low is the new high
#   * the base and quote volume columns trade places (total and taker-buy)
#   * pair_id n maps to -n - 1, so pair 0 becomes -1
# All right-hand sides read from the untouched `df`.
reverse_df = df.assign(
    open=1 / df['open'],
    high=1 / df['low'],
    low=1 / df['high'],
    close=1 / df['close'],
    volume=df['quote_asset_volume'],
    quote_asset_volume=df['volume'],
    taker_buy_base_asset_volume=df['taker_buy_quote_asset_volume'],
    taker_buy_quote_asset_volume=df['taker_buy_base_asset_volume'],
    pair_id=-df['pair_id'] - 1,
)

In [6]:
# Write the reversed candles to a CSV named like the forward one plus a
# `_reverse` suffix, then show the frame as the cell output.
reverse_csv_path = f"{config['pair_id']}_klines_{config['symbol']}_{config['dfreq']}_reverse.csv"
reverse_df.to_csv(reverse_csv_path)
reverse_df

Unnamed: 0_level_0,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,pair_id
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-07-14 04:00:00+00:00,12.500000,12.500000,11.574074,11.574074,0.728274,2017-07-14 04:14:59.999,8.752,26,0.282355,3.268,-1
2017-07-14 04:15:00+00:00,11.724841,11.747016,11.627907,11.653518,5.217079,2017-07-14 04:29:59.999,61.042,33,0.235027,2.738,-1
2017-07-14 04:30:00+00:00,11.653518,11.653518,11.576754,11.585606,4.631740,2017-07-14 04:44:59.999,53.769,41,1.829643,21.225,-1
2017-07-14 04:45:00+00:00,11.585606,11.586277,11.576754,11.581178,3.697253,2017-07-14 04:59:59.999,42.818,61,0.686295,7.948,-1
2017-07-14 05:00:00+00:00,11.644968,11.819213,11.600255,11.809164,1.404228,2017-07-14 05:14:59.999,16.520,15,0.100914,1.176,-1
...,...,...,...,...,...,...,...,...,...,...,...
2021-07-24 22:45:00+00:00,15.691443,15.699326,15.661707,15.679142,40.473129,2021-07-24 22:59:59.999,634.668,1247,22.648398,355.137,-1
2021-07-24 23:00:00+00:00,15.676930,15.715363,15.668333,15.709932,40.896636,2021-07-24 23:14:59.999,641.816,1003,21.176606,332.342,-1
2021-07-24 23:15:00+00:00,15.708698,15.710672,15.680863,15.687013,46.945426,2021-07-24 23:29:59.999,736.799,923,19.589290,307.457,-1
2021-07-24 23:30:00+00:00,15.687505,15.689966,15.653127,15.659009,49.330548,2021-07-24 23:44:59.999,772.984,1200,25.446157,398.717,-1
