In [149]:
import pandas as pd
import numpy as np

In [2]:
# Load the indicator dataset; the path is resolved relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='Indicator_based_RL/indicator_dataset.csv')

In [75]:
# Price column that the labelling functions in this notebook scan.
prices = dataset.loc[:, 'price']

In [166]:
def peak_valley_labels(prices, coefficient):
    """Label every time step by how the price moves after it.

    Starting from each index, the series is scanned forward until one of the
    following happens:

    * ``1``  -- the price reaches at least ``(100 + coefficient)`` percent of
      the starting price and has never been below the starting price;
    * ``-1`` -- the price reaches at most ``(100 - coefficient)`` percent of
      the starting price and has never been above the starting price;
    * ``0``  -- the price has been both below and above the starting price
      before either boundary was hit, or the end of the series was reached.

    Progress is printed roughly every tenth of the series.

    Args:
        prices: sequence supporting ``len()`` and integer indexing
            (``prices[i]``), e.g. a list or a Series with a default index.
        coefficient: size of the move, in percent of the starting price.

    Returns:
        A list with one ``[label, time_step]`` row per element of ``prices``;
        ``time_step`` is the index at which the label was decided.

    Raises:
        ValueError: if the number of produced rows differs from
            ``len(prices)``.
    """
    labels = []
    total = len(prices)
    print('Starting peak_valley label creation with coefficient {}'.format(coefficient))
    # int(total / 10) is 0 for fewer than 10 prices, which used to make the
    # modulo below raise ZeroDivisionError; clamp the step to at least 1.
    progress_step = max(1, int(total / 10))
    for index in range(total):
        label = None
        price = prices[index]
        if index % progress_step == 0:
            print('{:.2f}% Done'.format(100*index/total))
        # Both candidates stay alive until the price moves against them.
        peak = True
        valley = True
        for time_step in range(index, total):
            new_price = prices[time_step]

            # Determine the type of label we are looking for
            if new_price < price:
                peak = False
            if new_price > price:
                valley = False

            # Check if the boundary is reached
            if new_price >= (100+coefficient)*price/100 and peak:
                label = 1
            elif new_price <= (100-coefficient)*price/100 and valley:
                label = -1

            # Check for change of direction
            elif not peak and not valley:
                label = 0

            # Check for the end of the dataset
            elif time_step == total-1:
                label = 0

            # If there is a label already stop searching
            if label is not None:
                labels.append([label, time_step])
                break
    if len(labels) != total:
        raise ValueError('Labels length ({}) and dataset length ({}) missmatch'.format(len(labels), total))
    return labels

In [167]:
def separate(mixed_labels):
    """Split ``[label, timestamp]`` rows into three parallel lists.

    Args:
        mixed_labels: iterable of two-element rows ``(label, timestamp)``.

    Returns:
        ``(peaks, valleys, timestamps)`` where, for each row with label
        ``1`` / ``-1`` / ``0``, ``peaks`` receives ``1`` / ``0`` / ``0`` and
        ``valleys`` receives ``0`` / ``1`` / ``0``. Every row's timestamp is
        appended to ``timestamps``; a row with any other label contributes
        only its timestamp.
    """
    peaks, valleys, timestamps = [], [], []
    for kind, moment in mixed_labels:
        timestamps.append(moment)
        if kind == -1:
            peak_flag, valley_flag = 0, 1
        elif kind == 0:
            peak_flag, valley_flag = 0, 0
        elif kind == 1:
            peak_flag, valley_flag = 1, 0
        else:
            # Unknown label: nothing is recorded in peaks / valleys.
            continue
        peaks.append(peak_flag)
        valleys.append(valley_flag)
    return peaks, valleys, timestamps

In [102]:
# Percentage thresholds for which label columns are generated.
coeffs = [0.25, 0.5, 1, 2]
# `labels` is not created in any cell visible in this notebook, so on a fresh
# kernel the column assignments below raised NameError. Start from an empty
# frame; the first assigned list gives it its row index.
labels = pd.DataFrame()
for coeff in coeffs:
    # separate() returns (peaks, valleys, timestamps); the timestamps are not kept here.
    labels['peaks_{}'.format(coeff)], labels['valleys_{}'.format(coeff)], _ = separate(peak_valley_labels(prices, coeff))

Starting label creation with coefficient 0.25
0.00% Done
10.00% Done
20.00% Done
30.00% Done
40.00% Done
50.00% Done
60.00% Done
70.00% Done
80.00% Done
90.00% Done
100.00% Done
Starting label creation with coefficient 0.5
0.00% Done
10.00% Done
20.00% Done
30.00% Done
40.00% Done
50.00% Done
60.00% Done
70.00% Done
80.00% Done
90.00% Done
100.00% Done
Starting label creation with coefficient 1
0.00% Done
10.00% Done
20.00% Done
30.00% Done
40.00% Done
50.00% Done
60.00% Done
70.00% Done
80.00% Done
90.00% Done
100.00% Done
Starting label creation with coefficient 2
0.00% Done
10.00% Done
20.00% Done
30.00% Done
40.00% Done
50.00% Done
60.00% Done
70.00% Done
80.00% Done
90.00% Done
100.00% Done


In [117]:
def box_labels(prices, coeff, verbose=False):
    """Label every time step by which price boundary is crossed first.

    For each starting index a band of ``coeff`` percent around the starting
    price is built and the series is scanned forward from that index:

    * ``1``  -- a price at or above ``(100 + coeff)`` percent of the starting
      price is met first;
    * ``-1`` -- a price at or below ``(100 - coeff)`` percent of the starting
      price is met first;
    * ``0``  -- the end of the series is reached without crossing either
      bound.

    Progress is printed roughly every hundredth of the series.

    Args:
        prices: sequence supporting ``len()`` and integer indexing
            (``prices[i]``), e.g. a list or a Series with a default index.
        coeff: half-width of the band, in percent of the starting price.
        verbose: when true, additionally print one diagnostic line per
            labelled row (label, price delta and crossing index). Off by
            default because it emits one line per element of ``prices``.

    Returns:
        A list with one ``[label, index]`` row per element of ``prices``;
        ``index`` is the position at which the label was decided.

    Raises:
        ValueError: if the number of produced rows differs from
            ``len(prices)``.
    """
    labels = []
    total = len(prices)
    print('Starting box label creation with coefficient {}'.format(coeff))
    # int(total / 100) is 0 for fewer than 100 prices, which used to make the
    # modulo below raise ZeroDivisionError; clamp the step to at least 1.
    progress_step = max(1, int(total / 100))
    for index in range(total):
        label = None
        if index % progress_step == 0:
            print('{:.2f}% Done'.format(100*index/total))
        price = prices[index]
        upper_bound = (100 + coeff)*price/100
        low_bound = (100 - coeff)*price/100
        for new_ind in range(index, total):
            new_price = prices[new_ind]

            # Check if we have crossed a boundary:
            if new_price >= upper_bound:
                label = 1
            elif new_price <= low_bound:
                label = -1

            # Check if we have reached the end of the file
            elif new_ind == total - 1:
                label = 0

            if label is not None:
                labels.append([label, new_ind])
                if verbose:
                    print('Label: {}   Delta: {}   Achieved at: {}'.format(label, new_price-price, new_ind))
                break

    if len(labels) != total:
        raise ValueError('Labels length ({}) and dataset length ({}) missmatch'.format(len(labels), total))
    return labels

In [143]:
for coeff in coeffs:
    # separate() turns the [label, index] rows from box_labels into three parallel lists:
    # 1-flags for label 1, 1-flags for label -1, and the index at which each label was decided.
    up_hits, down_hits, hit_indices = separate(box_labels(prices, coeff))
    labels['label_{}'.format(coeff)] = up_hits
    labels['reversed_label_{}'.format(coeff)] = down_hits
    labels['timestamps_{}'.format(coeff)] = hit_indices

Starting label creation with coefficient 0.25
0.00% Done
1.00% Done
2.00% Done
3.00% Done
4.00% Done
5.00% Done
6.00% Done
7.00% Done
8.00% Done
9.00% Done
10.00% Done
11.00% Done
12.00% Done
13.00% Done
14.00% Done
15.00% Done
16.00% Done
17.00% Done
18.00% Done
19.00% Done
20.00% Done
21.00% Done
22.00% Done
23.00% Done
24.00% Done
25.00% Done
26.00% Done
27.00% Done
28.00% Done
29.00% Done
30.00% Done
31.00% Done
32.00% Done
33.00% Done
34.00% Done
35.00% Done
36.00% Done
37.00% Done
38.00% Done
39.00% Done
40.00% Done
41.00% Done
42.00% Done
43.00% Done
44.00% Done
45.00% Done
46.00% Done
47.00% Done
48.00% Done
48.99% Done
49.99% Done
50.99% Done
51.99% Done
52.99% Done
53.99% Done
54.99% Done
55.99% Done
56.99% Done
57.99% Done
58.99% Done
59.99% Done
60.99% Done
61.99% Done
62.99% Done
63.99% Done
64.99% Done
65.99% Done
66.99% Done
67.99% Done
68.99% Done
69.99% Done
70.99% Done
71.99% Done
72.99% Done
73.99% Done
74.99% Done
75.99% Done
76.99% Done
77.99% Done
78.99% Done
79.9

In [178]:
def reg_labels(prices, labels, timestamps):
    """Compute a price-delta regression target for every time step.

    For index ``i`` the target is ``prices[j] - prices[i]`` where ``j`` starts
    as ``timestamps[i]``. When ``labels[i] == 1`` and the label at ``j`` is
    also ``1``, ``j`` is advanced to ``timestamps[j]`` repeatedly until a
    label other than ``1`` is met, so consecutive upward moves are chained
    into a single delta.

    Progress is printed roughly every tenth of the series.

    Args:
        prices: sequence of prices supporting ``len()`` and integer indexing.
        labels: per-index labels, indexable by the values in ``timestamps``.
        timestamps: per-index position at which the label of that index was
            decided.

    Returns:
        numpy array of floats with ``len(prices)`` entries.
    """
    print('Generating regression labels')
    total = len(prices)
    reg_targets = np.zeros(total)
    # int(total / 10) is 0 for fewer than 10 prices, which used to make the
    # modulo below raise ZeroDivisionError; clamp the step to at least 1.
    progress_step = max(1, int(total / 10))
    for i in range(total):
        if i % progress_step == 0:
            print('{:.2f}% Done'.format(100*i/total))
        current_price = prices[i]
        first_label = labels[i]
        next_ind = timestamps[i]
        next_label = labels[next_ind]
        while next_label == 1 and first_label == 1:
            following = timestamps[next_ind]
            # A timestamp that does not move forward would make this loop
            # spin forever; stop chaining at the current index instead.
            if following <= next_ind:
                break
            next_ind = following
            next_label = labels[next_ind]
        price_at_cross = prices[next_ind]
        reg_targets[i] = price_at_cross - current_price
    return reg_targets

In [179]:
# Reload the label table from disk.
# NOTE(review): no visible cell writes this file before this point — confirm it exists on a fresh run.
new_labels = pd.read_csv(filepath_or_buffer='new_labels.csv')

In [180]:
for coeff in coeffs:
    # Column names share the coefficient as a suffix, e.g. 'label_0.25'.
    suffix = '{}'.format(coeff)
    line_labels = new_labels['label_' + suffix]
    timestamps = new_labels['timestamps_' + suffix]
    new_labels['reg_labels_' + suffix] = reg_labels(prices, line_labels, timestamps)

Generating regression labels
0.00% Done
10.00% Done
20.00% Done
30.00% Done
40.00% Done
50.00% Done
60.00% Done
70.00% Done
80.00% Done
90.00% Done
100.00% Done
Generating regression labels
0.00% Done
10.00% Done
20.00% Done
30.00% Done
40.00% Done
50.00% Done
60.00% Done
70.00% Done
80.00% Done
90.00% Done
100.00% Done
Generating regression labels
0.00% Done
10.00% Done
20.00% Done
30.00% Done
40.00% Done
50.00% Done
60.00% Done
70.00% Done
80.00% Done
90.00% Done
100.00% Done
Generating regression labels
0.00% Done
10.00% Done
20.00% Done
30.00% Done
40.00% Done
50.00% Done
60.00% Done
70.00% Done
80.00% Done
90.00% Done
100.00% Done


In [183]:
# Preview the first five rows of the earlier label table.
# NOTE(review): `old_labels` is not defined in any cell visible here — confirm where it is loaded.
old_labels.head(n=5)

Unnamed: 0.1,Unnamed: 0,prices,regression_targets_1,regression_targets_2,regression_targets_5,reversed_label_1,reversed_label_2,reversed_label_5,labels_1,labels_2,labels_5,peak_1,peak_2,peak_5,valley_1,valley_2,valley_5
0,0,135.38,0.91,2.13,3.46,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,135.27,1.02,2.24,3.57,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
2,2,135.49,0.8,2.02,3.35,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
3,3,136.29,1.22,1.47,3.1,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
4,4,137.51,-1.09,1.4,4.41,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [184]:
# Preview the first five rows of the regenerated label table.
new_labels.head(n=5)

Unnamed: 0,peaks_0.5,valleys_0.5,peaks_0.25,valleys_0.25,peaks_1,valleys_1,peaks_2,valleys_2,label_0.25,reversed_label_0.25,...,label_1,reversed_label_1,timestamps_1,label_2,reversed_label_2,timestamps_2,reg_labels_0.25,reg_labels_0.5,reg_labels_1,reg_labels_2
0,0,0,0,0,0,0,0,0,1,0,...,1,0,4,1,0,13,2.13,2.13,3.53,16.56
1,1,0,1,0,1,0,1,0,1,0,...,1,0,4,1,0,13,2.24,2.24,3.64,16.67
2,1,0,1,0,1,0,1,0,1,0,...,1,0,4,1,0,13,2.02,2.02,3.42,16.45
3,1,0,1,0,1,0,1,0,1,0,...,1,0,12,1,0,18,1.22,1.22,3.1,3.1
4,0,1,0,1,0,0,0,0,0,1,...,1,0,17,1,0,668,-0.64,-1.09,1.4,14.43


In [186]:
# Persist the label table; index=False keeps the row index out of the file.
new_labels.to_csv(path_or_buf='new_labels.csv', index=False)