In [1]:
# TensorFlow e tf.keras
import tensorflow as tf
from tensorflow import keras

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical 

import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

2.4.1


In [2]:
import json

### Reading the dataset previously classified by hand (Labelled)

In [4]:
classifier_data = None
with open(f'../classifier/static/classify_data/data.json', 'r') as f:
    classifier_data = json.load(f)
data = classifier_data['data']

In [7]:
for item in data:
    features = item['features']
    features['id'] = item['id']
    features['label'] = item['label']
    features['name'] = item['name']

features_data = [item['features'] for item in data if len(item['label']) > 0]

In [8]:
df = pd.DataFrame(features_data)
df.head()

Unnamed: 0,high,low,close,open,id,label,name
0,36.52,35.53,36.52,35.8,5045d02c-7578-11eb-abae-90324bbf5cee,inverted_hammer,20200101210000.svg
1,36.73,35.97,36.14,36.02,5046cd06-7578-11eb-9344-90324bbf5cee,doji,20200102210000.svg
2,36.09,35.45,35.6,36.06,504785de-7578-11eb-96dd-90324bbf5cee,spinning top,20200105210000.svg
3,35.77,34.78,34.78,35.6,50481602-7578-11eb-a671-90324bbf5cee,hammer,20200106210000.svg
4,35.35,34.21,34.21,35.01,5048c9a8-7578-11eb-8254-90324bbf5cee,hammer,20200107210000.svg


### Adding new features

In [9]:
def create_new_features(df: pd.DataFrame): 
    # the perc can cause inf values. this bias resolves this problem
    bias = 0.00001
    df['diff_hi_low'] = df.high/df.low -1 + bias
    df['diff_close_open'] = df.close/df.open -1 + bias
    df['odds_head_tail_and_body'] = abs(df.diff_hi_low/df.diff_close_open) + bias
    df['diff_high_close_open'] = np.abs(df.high/(np.maximum(df.close, df.open))-1) + bias
    df['diff_low_close_open'] = np.abs(df.low/(np.minimum(df.close, df.open))-1) + bias
    df['odds_head_tail'] = (df.diff_high_close_open/df.diff_low_close_open) + bias
    df['close_grather_open'] = (df.diff_close_open >= 0) * 1
    return df


In [10]:
df = create_new_features(df)
df

Unnamed: 0,high,low,close,open,id,label,name,diff_hi_low,diff_close_open,odds_head_tail_and_body,diff_high_close_open,diff_low_close_open,odds_head_tail,close_grather_open
0,36.52,35.53,36.52,35.80,5045d02c-7578-11eb-abae-90324bbf5cee,inverted_hammer,20200101210000.svg,0.027874,0.020122,1.385267,0.000010,0.007552,0.001334,1
1,36.73,35.97,36.14,36.02,5046cd06-7578-11eb-9344-90324bbf5cee,doji,20200102210000.svg,0.021139,0.003341,6.326160,0.016335,0.001398,11.683863,1
2,36.09,35.45,35.60,36.06,504785de-7578-11eb-96dd-90324bbf5cee,spinning top,20200105210000.svg,0.018064,-0.012747,1.417150,0.000842,0.004223,0.199359,0
3,35.77,34.78,34.78,35.60,50481602-7578-11eb-a671-90324bbf5cee,hammer,20200106210000.svg,0.028475,-0.023024,1.236763,0.004785,0.000010,478.528100,0
4,35.35,34.21,34.21,35.01,5048c9a8-7578-11eb-8254-90324bbf5cee,hammer,20200107210000.svg,0.033334,-0.022841,1.459410,0.009722,0.000010,972.151110,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,21.90,20.87,21.33,21.30,5076d206-7578-11eb-a407-90324bbf5cee,doji,20200426210000.svg,0.049363,0.001418,34.800753,0.026733,0.020198,1.323567,1
58,22.43,21.37,21.39,22.28,507be168-7578-11eb-997f-90324bbf5cee,marubozu,20200511210000.svg,0.049612,-0.039936,1.242299,0.006742,0.000945,7.134802,0
59,24.15,22.48,23.94,22.60,50bae8cc-7578-11eb-a716-90324bbf5cee,marubozu,20201007210000.svg,0.074298,0.059302,1.252889,0.008782,0.005320,1.650831,1
60,26.03,24.56,24.76,25.96,50c1dcb6-7578-11eb-b593-90324bbf5cee,marubozu,20201026210000.svg,0.059863,-0.046215,1.295336,0.002706,0.008088,0.334655,0


In [11]:
X = df[['diff_hi_low', 'diff_close_open', 'odds_head_tail_and_body', 'diff_high_close_open', 'diff_low_close_open', 'odds_head_tail', 'close_grather_open']]
print(X.shape)
labels = df[['label']]['label'].astype("category")
encoder = LabelBinarizer()
y = encoder.fit_transform(labels)
print(y.shape)

(62, 7)
(62, 6)


In [15]:
# 75% train 25% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape)
print(X_test.shape)

(46, 7)
(16, 7)


In [16]:
model = Sequential()
model.add(Dense(500, activation='relu', input_dim=X.shape[1]))
model.add(Dense(100, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [17]:
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x13715c0e240>

In [18]:
train_loss, train_acc = model.evaluate(X_train,  y_train, verbose=2)

2/2 - 0s - loss: 0.3609 - accuracy: 0.8261


In [19]:
test_loss, test_acc = model.evaluate(X_test,  y_test, verbose=2)

1/1 - 0s - loss: 1.3802 - accuracy: 0.5000


### Testing with no labelled data

In [20]:
test_data = [item['features'] for item in data if len(item['label']) == 0]
test_df = pd.DataFrame(test_data)
test_df.head()

Unnamed: 0,high,low,close,open,id,label,name
0,33.05,32.35,32.49,32.93,5057b32e-7578-11eb-be72-90324bbf5cee,,20200213210000.svg
1,32.66,32.06,32.19,32.66,505865e4-7578-11eb-8b82-90324bbf5cee,,20200216210000.svg
2,32.42,31.68,32.32,32.07,5059170c-7578-11eb-b396-90324bbf5cee,,20200217210000.svg
3,32.93,32.51,32.67,32.89,505a885c-7578-11eb-af82-90324bbf5cee,,20200219210000.svg
4,31.69,30.43,31.61,30.72,505d4fd4-7578-11eb-8bbe-90324bbf5cee,,20200227210000.svg


In [21]:
test_df = create_new_features(test_df)
test_df

Unnamed: 0,high,low,close,open,id,label,name,diff_hi_low,diff_close_open,odds_head_tail_and_body,diff_high_close_open,diff_low_close_open,odds_head_tail,close_grather_open
0,33.05,32.35,32.49,32.93,5057b32e-7578-11eb-be72-90324bbf5cee,,20200213210000.svg,0.021648,-0.013352,1.621404,0.003654,0.004319,0.846057,0
1,32.66,32.06,32.19,32.66,505865e4-7578-11eb-8b82-90324bbf5cee,,20200216210000.svg,0.018725,-0.014381,1.302097,0.000010,0.004049,0.002480,0
2,32.42,31.68,32.32,32.07,5059170c-7578-11eb-b396-90324bbf5cee,,20200217210000.svg,0.023369,0.007805,2.993892,0.003104,0.012171,0.255049,1
3,32.93,32.51,32.67,32.89,505a885c-7578-11eb-af82-90324bbf5cee,,20200219210000.svg,0.012929,-0.006679,1.935805,0.001226,0.004907,0.249869,0
4,31.69,30.43,31.61,30.72,505d4fd4-7578-11eb-8bbe-90324bbf5cee,,20200227210000.svg,0.041417,0.028981,1.429084,0.002541,0.009450,0.268879,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,29.65,28.76,28.82,28.91,50cf2f54-7578-11eb-8ecc-90324bbf5cee,,20201126210000.svg,0.030956,-0.003103,9.975732,0.025607,0.002092,12.240954,0
167,29.06,28.43,28.43,28.83,50cfe39c-7578-11eb-902c-90324bbf5cee,,20201129210000.svg,0.022170,-0.013864,1.599043,0.007988,0.000010,798.780100,0
168,30.34,28.82,29.63,28.93,50d0858c-7578-11eb-8b71-90324bbf5cee,,20201130210000.svg,0.052751,0.024206,2.179239,0.023972,0.003812,6.288161,1
169,30.11,29.32,29.85,29.79,50d14636-7578-11eb-a630-90324bbf5cee,,20201201210000.svg,0.026954,0.002024,13.316587,0.008720,0.015787,0.552373,1


In [22]:
X = test_df[['diff_hi_low', 'diff_close_open', 'odds_head_tail_and_body', 'diff_high_close_open', 'diff_low_close_open', 'odds_head_tail', 'close_grather_open']]
X.head()

Unnamed: 0,diff_hi_low,diff_close_open,odds_head_tail_and_body,diff_high_close_open,diff_low_close_open,odds_head_tail,close_grather_open
0,0.021648,-0.013352,1.621404,0.003654,0.004319,0.846057,0
1,0.018725,-0.014381,1.302097,1e-05,0.004049,0.00248,0
2,0.023369,0.007805,2.993892,0.003104,0.012171,0.255049,1
3,0.012929,-0.006679,1.935805,0.001226,0.004907,0.249869,0
4,0.041417,0.028981,1.429084,0.002541,0.00945,0.268879,1


In [23]:
predictions = model.predict(X)
predictions

array([[3.1481083e-02, 3.1529278e-02, 6.9711413e-03, 2.4228373e-01,
        2.4528287e-01, 4.4245186e-01],
       [4.0320102e-02, 3.7693892e-02, 6.0834453e-02, 3.6747751e-01,
        3.8212687e-01, 1.1154714e-01],
       [5.1587403e-01, 1.0395654e-03, 5.0206427e-03, 2.3672396e-02,
        9.9547049e-03, 4.4443870e-01],
       ...,
       [5.1901340e-03, 8.7505502e-01, 4.6184000e-06, 2.2220197e-03,
        9.4338357e-02, 2.3189887e-02],
       [9.8610967e-01, 2.4530791e-14, 1.1235914e-02, 3.0371993e-08,
        1.3082431e-15, 2.6544235e-03],
       [8.0748624e-01, 2.4461636e-02, 1.0908263e-06, 7.3573238e-04,
        8.9449680e-04, 1.6642083e-01]], dtype=float32)

In [24]:
predictions.shape

(171, 6)

In [25]:
classes = encoder.classes_

for idx, pred in enumerate(predictions):
    test_df.iloc[idx, test_df.columns.get_loc('label')] = classes[np.argmax(pred)]

test_df.head()

Unnamed: 0,high,low,close,open,id,label,name,diff_hi_low,diff_close_open,odds_head_tail_and_body,diff_high_close_open,diff_low_close_open,odds_head_tail,close_grather_open
0,33.05,32.35,32.49,32.93,5057b32e-7578-11eb-be72-90324bbf5cee,spinning top,20200213210000.svg,0.021648,-0.013352,1.621404,0.003654,0.004319,0.846057,0
1,32.66,32.06,32.19,32.66,505865e4-7578-11eb-8b82-90324bbf5cee,marubozu,20200216210000.svg,0.018725,-0.014381,1.302097,1e-05,0.004049,0.00248,0
2,32.42,31.68,32.32,32.07,5059170c-7578-11eb-b396-90324bbf5cee,doji,20200217210000.svg,0.023369,0.007805,2.993892,0.003104,0.012171,0.255049,1
3,32.93,32.51,32.67,32.89,505a885c-7578-11eb-af82-90324bbf5cee,spinning top,20200219210000.svg,0.012929,-0.006679,1.935805,0.001226,0.004907,0.249869,0
4,31.69,30.43,31.61,30.72,505d4fd4-7578-11eb-8bbe-90324bbf5cee,marubozu,20200227210000.svg,0.041417,0.028981,1.429084,0.002541,0.00945,0.268879,1


### Show results

In [26]:
from IPython.display import SVG, display
def show_svg(filename):
    display(SVG(filename=f'../classifier/static/classify_data/{filename}'))

In [29]:
for i in range(10):
    name = test_df.iloc[i].name
    label = test_df.iloc[i].label
    show_svg(f'{name}.svg')

FileNotFoundError: [Errno 2] No such file or directory: '../classifier/static/classify_data/0.svg'