In [1]:
import pandas as pd
import os
import requests
import json

In [2]:
# Filesystem locations used throughout the notebook. Paths are combined with
# file names by plain string concatenation, so each one must keep its
# trailing slash.

# Directory holding shared inputs (blocks.csv is read from here).
data_path = '/root/coursework/full_data/'
# Directory the per-pair candle CSVs are written to.
output_path = '/root/coursework/full_data/kucoin/'
# Directory containing the raw per-pair trade CSVs that are turned into candles.
raw_data_path = '/root/coursework/model/kucoin_data/'

In [4]:
# Load one raw trade file to inspect the column layout before processing
# the whole directory.
sample_trades_file = raw_data_path + 'ZRXUSDT.csv'
df = pd.read_csv(sample_trades_file)
df.head()

Unnamed: 0,trade_id,trade_time,price,size,side,token0_symbol,token1_symbol
0,4107591343671297,1700870527073,0.4356,260.0,SELL,ZRX,USDT
1,4107594187540487,1700870535747,0.4356,45.9136,SELL,ZRX,USDT
2,4107594187540481,1700870535747,0.4356,18.49,SELL,ZRX,USDT
3,4107594187540484,1700870535747,0.4356,45.9136,SELL,ZRX,USDT
4,4107597988628481,1700870548783,0.4352,344.6691,BUY,ZRX,USDT


In [3]:
# Read the block table and convert its epoch-second 'timestamp' column to
# datetimes in a single chained expression (the column keeps its position).
blocks_df = (
    pd.read_csv(data_path + 'blocks.csv')
    .assign(timestamp=lambda blocks: pd.to_datetime(blocks['timestamp'], unit='s'))
)
blocks_df.head()

Unnamed: 0,block_number,timestamp,gas_price,gas_used
0,9998634,2020-05-04 08:28:30,0.0,9991797
1,9998635,2020-05-04 08:28:57,0.0,9981989
2,9998636,2020-05-04 08:29:28,0.0,9975106
3,9998637,2020-05-04 08:29:59,0.0,9668279
4,9998638,2020-05-04 08:30:12,0.0,9980782


In [4]:
base_url = 'https://api.kucoin.com'

# Bound the request time and fail loudly on HTTP errors instead of hanging
# forever or trying to parse an error page as the currency list.
resp = requests.get(base_url + '/api/v3/currencies', timeout=30)
resp.raise_for_status()
currencies_data = json.loads(resp.text)['data']

# Maps a currency code (the 'currency' field) to the lower-cased contract
# address of its entry on the chain whose 'chainId' is 'eth'.
symbol_to_address = {}

for currency_data in currencies_data:
    # 'chains' may be null for a currency; such entries carry no address.
    if currency_data['chains'] is None:
        continue

    for chain in currency_data['chains']:
        if chain['chainId'] != 'eth':
            continue

        # Skip missing addresses: an empty string as before, and also a null
        # value, which would otherwise crash on .lower().
        if not chain['contractAddress']:
            continue

        token_address = chain['contractAddress'].lower()
        symbol_to_address[currency_data['currency']] = token_address

In [5]:
def build_candles(filename, candle_blocks=100, excluded_block=19398714):
    """Aggregate one raw trade CSV into fixed-width block candles.

    The file ``raw_data_path + filename`` is read, every trade is assigned
    the most recent block of the module-level ``blocks_df`` whose timestamp
    is not later than the trade time, and trades are grouped into candles
    spanning ``candle_blocks`` consecutive block numbers.

    Parameters
    ----------
    filename : str
        Name of the trade CSV inside ``raw_data_path``. The columns
        ``trade_id``, ``trade_time`` (epoch milliseconds), ``price``,
        ``size`` and ``side`` are used.
    candle_blocks : int, default 100
        Width of one candle in blocks.
    excluded_block : int, default 19398714
        Trades mapped to this block number are discarded.
        NOTE(review): presumably this is the last block present in
        ``blocks_df``, which would absorb every trade newer than the block
        data - confirm against blocks.csv.

    Returns
    -------
    pandas.DataFrame
        One row per candle that contains at least one trade, ordered by
        ``open_block_number``, with columns ``open_block_number``, ``open``,
        ``high``, ``low``, ``close``, ``volume`` (sum of size * price),
        ``buys_usd`` (the same sum over BUY trades only), ``buys_count``,
        ``sells_count``, ``close_block_number``, ``price_change_candle``
        (open-to-close change in percent) and ``target_price`` (close of the
        next row; NaN on the last row).
    """
    trades_df = pd.read_csv(raw_data_path + filename)
    trades_df = trades_df.drop('trade_id', axis=1)
    trades_df['trade_time'] = pd.to_datetime(trades_df['trade_time'], unit='ms')
    # Stable sort: trades sharing the same millisecond keep their file order,
    # so the 'first'/'last' prices of a candle are deterministic.
    trades_df = trades_df.sort_values('trade_time', kind='stable')

    # Backward as-of join: each trade gets the latest block at or before it.
    trades_df = pd.merge_asof(
        trades_df,
        blocks_df[['block_number', 'timestamp']],
        left_on='trade_time',
        right_on='timestamp',
    )
    trades_df = trades_df.drop(['trade_time', 'timestamp'], axis=1)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view (avoids SettingWithCopyWarning).
    trades_df = trades_df[trades_df['block_number'] != excluded_block].copy()

    # Align candle boundaries to multiples of candle_blocks.
    block_offset = trades_df['block_number'].min() // candle_blocks * candle_blocks
    trades_df['candle_id'] = (trades_df['block_number'] - block_offset) // candle_blocks
    trades_df['volume'] = trades_df['size'] * trades_df['price']

    # Vectorised instead of a row-wise apply: much faster, and it does not
    # depend on apply() returning a Series (it need not for an empty frame).
    is_buy = trades_df['side'] == 'BUY'
    trades_df['buys_usd'] = trades_df['volume'].where(is_buy, 0)
    trades_df['buys_count'] = is_buy.astype(int)
    trades_df['sells_count'] = 1 - trades_df['buys_count']

    candles = trades_df.groupby('candle_id').agg(
        open_block_number=('block_number', 'first'),
        open=('price', 'first'),
        high=('price', 'max'),
        low=('price', 'min'),
        close=('price', 'last'),
        volume=('volume', 'sum'),
        buys_usd=('buys_usd', 'sum'),
        buys_count=('buys_count', 'sum'),
        sells_count=('sells_count', 'sum')
    ).reset_index()

    candles = candles.sort_values(by='open_block_number', ascending=True)

    candles = candles.drop('candle_id', axis=1)
    # The aggregated value is the block of the candle's first trade; snap it
    # down to the candle's nominal first block.
    candles['open_block_number'] = candles['open_block_number'] - candles['open_block_number'] % candle_blocks
    candles['close_block_number'] = candles['open_block_number'] + (candle_blocks - 1)
    candles['price_change_candle'] = (candles['close'] - candles['open']) / candles['open'] * 100
    candles['target_price'] = candles['close'].shift(-1)

    return candles

In [6]:
def add_window(candles, window_len, lbl):
    """Append trailing-window aggregate columns to ``candles``.

    For every candle the window consists of all rows whose
    ``open_block_number`` lies in ``[open_block_number - window_len,
    open_block_number]`` (both ends inclusive, so a candle is always part of
    its own window).

    Parameters
    ----------
    candles : pandas.DataFrame
        Needs the columns ``open_block_number``, ``open``, ``high``, ``low``,
        ``volume``, ``buys_usd``, ``buys_count``, ``sells_count`` and
        ``price_change_candle``. The frame is modified in place.
    window_len : int
        Window length in blocks.
    lbl : str
        Suffix of the new column names.

    Returns
    -------
    pandas.DataFrame
        The same ``candles`` object with eight new columns, in this order:
        ``volume_<lbl>``, ``low_price_<lbl>``, ``high_price_<lbl>``,
        ``std_price_changes_<lbl>`` (0 when the window holds fewer than two
        candles), ``open_price_<lbl>`` (``open`` of the first row in the
        window), ``buys_usd_<lbl>``, ``buys_count_<lbl>`` and
        ``sells_count_<lbl>``.
    """
    open_blocks = candles['open_block_number']
    block_values = open_blocks.to_numpy()

    if open_blocks.is_monotonic_increasing:
        # Sorted input (what build_candles returns): every window is one
        # contiguous positional slice, found by binary search instead of
        # re-scanning the whole frame for each row.
        window_starts = open_blocks.searchsorted(block_values - window_len, side='left')
        window_stops = open_blocks.searchsorted(block_values, side='right')

        def window_at(position):
            return candles.iloc[window_starts[position]:window_stops[position]]
    else:
        # Unsorted input: fall back to the boolean mask, which keeps the
        # matching rows in frame order.
        def window_at(position):
            end_block = block_values[position]
            start_block = end_block - window_len
            return candles[(open_blocks >= start_block) & (open_blocks <= end_block)]

    # Key order defines the order in which the columns are appended.
    features = {
        'volume': [],
        'low_price': [],
        'high_price': [],
        'std_price_changes': [],
        'open_price': [],
        'buys_usd': [],
        'buys_count': [],
        'sells_count': [],
    }

    for position in range(len(candles)):
        window = window_at(position)

        features['volume'].append(window['volume'].sum())
        features['low_price'].append(window['low'].min())
        features['high_price'].append(window['high'].max())
        features['open_price'].append(window['open'].iloc[0])
        features['buys_usd'].append(window['buys_usd'].sum())
        features['buys_count'].append(window['buys_count'].sum())
        features['sells_count'].append(window['sells_count'].sum())

        # A sample standard deviation needs at least two candles.
        if len(window) < 2:
            features['std_price_changes'].append(0)
        else:
            features['std_price_changes'].append(window['price_change_candle'].std())

    for name, values in features.items():
        candles[f'{name}_{lbl}'] = values

    return candles

In [7]:
def get_token_symbol(filename):
    """Return the base-token symbol encoded in a trade file name.

    The last four characters (the extension) are dropped, then the quote
    suffix is removed: three characters when the remaining name ends with
    'DAI', four characters otherwise. Any leading directory components
    (separated by '/') are discarded.
    """
    stem = filename[:-4]
    quote_len = 3 if stem.endswith('DAI') else 4
    return stem[:-quote_len].split('/')[-1]


# Regular files directly inside the raw data directory, sorted so that every
# run processes the pairs in the same order.
filenames = sorted(
    file for file in os.listdir(raw_data_path)
    if os.path.isfile(os.path.join(raw_data_path, file))
)

# Quote currencies a pair must end with; pairs that also start with one of
# them are skipped.
stables = ['DAI', 'USDC', 'USDT']

# Make sure the destination exists before any expensive work is done.
os.makedirs(output_path, exist_ok=True)

counter = 0

for filename in filenames:
    pair_name = filename[:-4]

    if not any(pair_name.endswith(stable) for stable in stables):
        continue

    if any(pair_name.startswith(stable) for stable in stables):
        continue

    # Resolve the contract address up front: a symbol without an entry in
    # symbol_to_address is reported and skipped instead of raising KeyError
    # after the costly candle/window computation and aborting the batch.
    token_symbol = get_token_symbol(filename)
    token_address = symbol_to_address.get(token_symbol)
    if token_address is None:
        print(f'skipping {filename}: no contract address known for {token_symbol}')
        continue

    candles = build_candles(filename)

    # Window lengths are in blocks; 1800 blocks is labelled '6h' and the
    # longer windows are multiples of it.
    candles = add_window(candles, 1800, '6h')
    candles = add_window(candles, 1800 * 4, '1d')
    candles = add_window(candles, 1800 * 12, '3d')
    candles = add_window(candles, 1800 * 28, 'week')

    candles['token_address'] = token_address

    candles.to_csv(output_path + filename)
    del candles

    # Countdown: directory entries minus the pairs written so far.
    counter += 1
    print(len(filenames) - counter)

308
307
306
305
304
303
302
301
300
299
298
297
296
295
294
293
292
291
290
289
288
287
286
285
284
283
282
281
280
279
278
277
276
275
274
273
272
271
270
269
268
267
266
265
264
263
262
261
260
259
258
257
256
255
254
253
252
251
250
249
248
247
246
245
244
243
242
241
240
239
238
237
236
235
234
233
232
231
230
229
228
227
226
225
224
223
222
221
220
219
218
217
216
215
214
213
212
211
210
209
208
207
206
205
204
203
202
201
200
199
198
197
196
195
194
193
192
191
190
189
188
187
186
185
184
183
182
181
180
179
178
177
176
175
174
173
172
171
170
169
168
167
166
165
164
163
162
161
160
159
158
157
156
155
154
153
152
151
150
149
148
147
146
145
144
143
142
141
140
139
138
137
136
135
134
133
132
131
130
129
128
127
126
125
124
123
122
121
120
119
118
117
116
115
114
113
112
111
110
109
108
107
106
105
104
103
102
101
100
99
98
97
96
95
94
93
92
91
90
89
88
87
86
85
84
83
82
81
80
79
78
77
76
75
74
73
72
71
70
69
68
67
66
65
64
63
62
61
60
59
58
57
56
55
54
53
52
51
50
49
48
47
46
45