In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
import requests
import json
import zipfile
import os
from io import BytesIO
from zipfile import ZipFile
from xml.etree import ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
import gc
import time

In [2]:
# Directory holding the input CSVs. The trailing slash matters: file names are
# appended to this value by plain string concatenation.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
data_path = '/root/coursework/full_data/'

In [3]:
# Read the token list and normalise the addresses to lower case in a single
# chained expression, so later address comparisons are case-insensitive.
tokens_df = pd.read_csv(data_path + 'tokens.csv').assign(
    token_address=lambda frame: frame['token_address'].str.lower()
)

# Загрузка символов и адресов токенов

In [4]:
# Root of the KuCoin REST API; endpoint paths are appended to it.
base_url = 'https://api.kucoin.com'

# Fetch the currency catalogue. A timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() surfaces HTTP errors here
# instead of as a confusing KeyError/JSON error on the next line.
resp = requests.get(base_url + '/api/v3/currencies', timeout=30)
resp.raise_for_status()
currencies_data = resp.json()['data']

In [5]:
# Addresses of the tokens we track, as a set for O(1) membership checks.
token_addresses = set(tokens_df['token_address'])

# Collect matches as plain dicts and build the DataFrame once at the end;
# concatenating a one-row frame per match re-copies the whole frame each time.
token_records = []

for currency_data in currencies_data:
    # `chains` can be null for a currency; treat that the same as "no chains".
    for chain in currency_data.get('chains') or ():
        if chain['chainId'] != 'eth':
            continue

        # Skip entries without a contract address (empty string or null).
        contract_address = chain.get('contractAddress')
        if not contract_address:
            continue

        token_address = contract_address.lower()
        if token_address not in token_addresses:
            continue

        token_records.append({
            'token_symbol': currency_data['currency'],
            'token_address': token_address,
        })

# Passing `columns` keeps the expected schema even when nothing matched.
kucoin_tokens_df = pd.DataFrame(token_records, columns=['token_symbol', 'token_address'])

# Free the inputs, which are not used below. Note that this makes the cell
# non-re-runnable without first re-running the cell that loads `tokens_df`.
del tokens_df
del token_addresses

In [6]:
# Preview the first matched (symbol, address) rows as a sanity check.
kucoin_tokens_df.head()

Unnamed: 0,token_symbol,token_address
0,ANKR,0x8290333cef9e6d528dd5618fb97a76f268f3edd4
1,DX,0x973e52691176d36453868d9d86572788d27041a9
2,NPXS,0xa15c7ebe1f07caf6bff097d8a589fb8ac49ae5b3
3,RFOX,0xa1d6df714f91debf4e0802a542e13067f31b8262
4,CHR,0x8a2279d4a90b6fe1c4b30fa660cc9f926797baa2


# Загрузка символов пар

In [7]:
# Fetch the list of trading pairs. The timeout prevents an indefinite hang and
# raise_for_status() reports HTTP failures at the point of the request.
resp = requests.get(base_url + '/api/v2/symbols', timeout=30)
resp.raise_for_status()
pairs_data = resp.json()['data']

In [8]:
# Make sure the output directory exists so the cell also works on a fresh
# checkout; os.listdir() raises FileNotFoundError otherwise.
os.makedirs('./kucoin_data', exist_ok=True)

# Files already written by a previous run; used to skip finished pairs.
filenames = set(os.listdir('./kucoin_data'))

token_symbols = set(kucoin_tokens_df['token_symbol'])

# Collect rows as dicts and build the frame once instead of concatenating a
# one-row DataFrame per pair (which copies the whole frame on every step).
pair_records = []

for pair_data in pairs_data:
    token0_symbol = pair_data['baseCurrency']
    token1_symbol = pair_data['quoteCurrency']

    # Keep only pairs where both sides are tokens we track.
    if token0_symbol not in token_symbols or token1_symbol not in token_symbols:
        continue

    # Skip pairs whose CSV was already produced.
    if f'{token0_symbol}{token1_symbol}.csv' in filenames:
        continue

    pair_records.append({
        'pair_symbol': pair_data['symbol'],
        'token0_symbol': token0_symbol,
        'token1_symbol': token1_symbol,
    })

# Passing `columns` keeps the expected schema even when no pair is left.
kucoin_pairs_df = pd.DataFrame(
    pair_records,
    columns=['pair_symbol', 'token0_symbol', 'token1_symbol'],
)

In [9]:
# Preview the first pairs that still need to be downloaded.
kucoin_pairs_df.head()

Unnamed: 0,pair_symbol,token0_symbol,token1_symbol
0,SWFTC-USDT,SWFTC,USDT
1,CELR-USDT,CELR,USDT
2,AURORA-USDT,AURORA,USDT
3,KNC-USDT,KNC,USDT
4,OVR-USDT,OVR,USDT


# Загрузка csv сделок по дням

In [10]:
# Root of the bucket that serves the historical trade archives.
HISTORICAL_DATA_URL = 'https://historical-data.kucoin.com/'

# NOTE: this rebinds `base_url`, which earlier cells use for the REST API root.
# The assignment is kept for backward compatibility, but the function below no
# longer reads this mutable global.
base_url = HISTORICAL_DATA_URL

def urls_to_csv_zip(pair_symbol):
    """Return download URLs of all daily trade archives for one pair.

    Queries the bucket listing under ``data/spot/daily/trades/<pair_symbol>/``
    and returns one URL per listed key, skipping ``CHECKSUM`` files.

    Args:
        pair_symbol: Pair name as used in the bucket path (base and quote
            symbols concatenated, e.g. ``token0 + token1``).

    Returns:
        list[str]: Archive URLs in listing order; empty if nothing is listed.

    Raises:
        requests.HTTPError: If the listing request returns an error status.
    """
    namespace = {'ns': 'http://s3.amazonaws.com/doc/2006-03-01/'}
    params = {
        'delimiter': '/',
        'prefix': f'data/spot/daily/trades/{pair_symbol}/',
    }
    links = []

    while True:
        response = requests.get(HISTORICAL_DATA_URL, params=params, timeout=30)
        response.raise_for_status()
        root = ET.fromstring(response.content)

        keys = [key.text for key in root.findall('.//ns:Key', namespace)]
        links.extend(
            HISTORICAL_DATA_URL + key
            for key in keys
            if not key.endswith('CHECKSUM')
        )

        # The response uses the S3 listing schema, which returns results in
        # pages; a single request would silently drop everything after the
        # first page. Presumably the endpoint honours the standard `marker`
        # parameter - TODO confirm against a pair with a long history.
        truncated = root.findtext('.//ns:IsTruncated', default='false', namespaces=namespace)
        if truncated.strip().lower() != 'true' or not keys:
            break

        # Continue after NextMarker when provided, else after the last key.
        next_marker = root.findtext('.//ns:NextMarker', namespaces=namespace)
        params['marker'] = next_marker or keys[-1]

    return links

In [11]:
def download_and_process_csv(zip_url, max_retries=3, wait_seconds=5):
    """Download one zip archive and load its first CSV member.

    Args:
        zip_url: URL of the zip archive to download.
        max_retries: Maximum number of download attempts.
        wait_seconds: Pause between failed attempts, in seconds.

    Returns:
        pandas.DataFrame: Contents of the first ``.csv`` member. An empty
        DataFrame if every attempt failed or the archive holds no CSV file.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(zip_url, timeout=60)
            # Treat HTTP errors as failed attempts. Without this a non-200
            # response raised nothing and the function returned an empty
            # frame immediately, never retrying.
            response.raise_for_status()

            with zipfile.ZipFile(BytesIO(response.content)) as archive:
                for member in archive.infolist():
                    if member.filename.endswith('.csv'):
                        with archive.open(member) as csv_file:
                            return pd.read_csv(csv_file)

            # A valid archive without a CSV will not change on retry.
            return pd.DataFrame()
        except Exception as e:
            if attempt < max_retries:
                print(f"Attempt {attempt} failed for {zip_url}: {e}. Retrying in {wait_seconds} seconds...")
                time.sleep(wait_seconds)  # Back off before the next attempt
            else:
                # No point sleeping after the final attempt.
                print(f"Attempt {attempt} failed for {zip_url}: {e}. Giving up.")

    return pd.DataFrame()  # All attempts failed (or max_retries <= 0)

def build_pair_csv(urls):
    """Download all archives for a pair in parallel and stack them.

    Args:
        urls: Iterable of archive URLs, each passed to
            ``download_and_process_csv``.

    Returns:
        pandas.DataFrame: Non-empty per-archive frames concatenated with a
        fresh index, in the order of ``urls``. An empty DataFrame when there
        are no URLs or every download came back empty.
    """
    urls = list(urls)
    if not urls:
        return pd.DataFrame()

    with ThreadPoolExecutor(max_workers=5) as executor:
        # map() yields results in input order, so the row order is
        # reproducible rather than dependent on which download finishes first.
        frames = [
            frame
            for frame in executor.map(download_and_process_csv, urls)
            if not frame.empty
        ]

    # pd.concat raises ValueError on an empty list, so handle that explicitly.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [12]:
# Download every remaining pair and write one CSV per pair.
for idx, pair in kucoin_pairs_df.iterrows():
    pair_symbol = pair['token0_symbol'] + pair['token1_symbol']

    urls = urls_to_csv_zip(pair_symbol)

    if not urls:
        # No archives listed for this pair. Skip it rather than letting the
        # concatenation of zero frames abort the whole run.
        print(f'No historical data listed for {pair_symbol}, skipping')
    else:
        df = build_pair_csv(urls)

        if df.empty:
            # Do not write an empty file: the pair filter treats an existing
            # CSV as "already downloaded" and would never retry this pair.
            print(f'No trades downloaded for {pair_symbol}, skipping')
        else:
            df['token0_symbol'] = pair['token0_symbol']
            df['token1_symbol'] = pair['token1_symbol']
            df.to_csv(f'./kucoin_data/{pair_symbol}.csv', index=False)

        # Release the (potentially large) frame before the next pair.
        del df

    # Countdown of pairs left, including the one just processed.
    print(kucoin_pairs_df.shape[0] - idx)

87
86
85
84
83
82
81
80
79
78
77
76
75
74
73
72
71
70
69
68
67
66
65
64
63
62
61
60
59
58
57
56
55
54
53
52
51
50
49
48
47
46
45
44
43
42
41
40
39
38
37
36
35
34
33
32
31
30
29
28
27
26
25
24
23
22
21
20
19
18
17
16
15
14
13
12
11
10
9
8
7
6
5
4
3
2
1
