In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from trading_tools import data_frame
from tqdm import tqdm
import time




In [2]:
# Asset whose candle history is analysed; swap the active line to change instrument.
ASSET = "AUDCAD_otc" 
# ASSET = "EURUSD_otc" 
# Load the candle CSV for the chosen asset (file name suggests 5-second bars -- TODO confirm).
df = data_frame.load_csv(f"actives/ACTIVO-{ASSET}-0005s.csv")
# df = data_frame.load_csv(f"actives/ACTIVO-{ASSET}-0001s.csv")

In [None]:
# Quick visual sanity check of the loaded frame.
df.plot()

## STAGE 1
##### Identifying valid sequences

In [None]:
def check_conditions(df, window_size):
    """
    Label each row with whether a strictly monotonic candle run starts there.

    Loop-based reference implementation (``check_conditions_vectorized`` is
    its vectorized counterpart).

    Row ``i`` is labelled 1 when, over the ``window_size`` candles starting at
    ``i``, both ``high`` and ``low`` move strictly in one direction from each
    candle to the next AND the close ``window_size`` rows ahead confirms that
    direction (lower than the close at ``i`` for a falling run, higher for a
    rising run). Every other row that has a full look-ahead is labelled 0.

    Args:
        df: DataFrame with ``high``, ``low`` and ``close`` columns.
        window_size: Number of consecutive candles that must be monotonic;
            also the look-ahead distance of the confirming close.

    Returns:
        A list with exactly one entry per row of ``df``: 1 or 0 for rows with
        a full look-ahead, followed by ``None`` for the trailing rows without
        one.
    """
    # Pull the columns out once: per-element .iloc lookups inside the loop
    # dominate the runtime on large frames.
    highs = df['high'].to_numpy()
    lows = df['low'].to_numpy()
    closes = df['close'].to_numpy()

    n_rows = len(df)
    # Clamp at zero so a frame shorter than the window yields no labelled rows
    # (and the padding below cannot exceed the frame length).
    count = max(n_rows - window_size, 0)

    labels = []
    for i in tqdm(range(count), desc="Processing"):
        win_high = highs[i:i + window_size]
        win_low = lows[i:i + window_size]
        close0 = closes[i]
        close1 = closes[i + window_size]

        falling = (
            np.all(win_high[1:] < win_high[:-1])
            and np.all(win_low[1:] < win_low[:-1])
            and close1 < close0
        )
        rising = (
            np.all(win_high[1:] > win_high[:-1])
            and np.all(win_low[1:] > win_low[:-1])
            and close1 > close0
        )
        labels.append(1 if (falling or rising) else 0)

    # Pad with None so the result always has one entry per row of df.
    labels.extend([None] * (n_rows - count))
    return labels

In [3]:
def check_conditions_vectorized(df, window_size):
    """Vectorized labelling of rows that start a strictly monotonic candle run.

    A row is flagged when, across the ``window_size`` candles beginning at it,
    ``high`` and ``low`` both strictly fall (or both strictly rise) from one
    candle to the next and the close ``window_size`` rows later moved the same
    way relative to the row's own close.

    Args:
        df: DataFrame with ``high``, ``low`` and ``close`` columns.
        window_size: Length of the run and look-ahead distance for the close.

    Returns:
        Float array of ``len(df)`` entries: 1.0 / 0.0 for the leading rows and
        NaN for the last ``window_size`` rows.
    """
    def _lagged(column):
        # Row k of the result is the column shifted k steps towards the past,
        # so entry [k, i] is the value k candles after row i.
        return np.array([df[column].shift(-k).values for k in range(window_size)])

    highs = _lagged('high')
    lows = _lagged('low')
    close_now = df['close'].values
    close_future = df['close'].shift(-window_size).values

    # Strictly falling highs and lows, confirmed by a lower future close.
    falling = (
        np.all(highs[1:] < highs[:-1], axis=0)
        & np.all(lows[1:] < lows[:-1], axis=0)
        & (close_future < close_now)
    )
    # Strictly rising highs and lows, confirmed by a higher future close.
    rising = (
        np.all(highs[1:] > highs[:-1], axis=0)
        & np.all(lows[1:] > lows[:-1], axis=0)
        & (close_future > close_now)
    )

    # Only rows with a complete look-ahead receive a 0/1 label.
    is_valid = falling[:-window_size] | rising[:-window_size]
    n_labelled = len(is_valid)

    labels = np.zeros(len(df))
    labels[:n_labelled] = is_valid.astype(int)
    labels[n_labelled:] = np.nan
    return labels


In [4]:
# Length of the monotonic candle run (and look-ahead distance) used for labelling.
window_size = 5
# NOTE(review): QUALITY_LABEL holds the same string as SEQ_LABEL below and is
# not referenced again in the visible cells -- confirm whether it is still needed.
QUALITY_LABEL = f'seq_quality_{window_size}'

# Measure execution time for the vectorized method
start_time = time.time()
SEQ_LABEL = f'seq_quality_{window_size}'
df[SEQ_LABEL] = check_conditions_vectorized(df, window_size)
time_check_conditions_vectorized = time.time() - start_time
print(f"Time for check_conditions_vectorized: {time_check_conditions_vectorized:.2f} seconds")

# Drops the trailing rows whose label is NaN (no full look-ahead), plus any
# other row containing a NaN. This mutates df in place, so re-running earlier
# cells is required to get the unfiltered frame back.
df.dropna(inplace=True)
count_values = df[SEQ_LABEL].value_counts()

print(f"Conteo de valores en la columna {SEQ_LABEL}: {count_values}")


Time for check_conditions_vectorized: 0.12 seconds
Conteo de valores en la columna seq_quality_5: seq_quality_5
0.0    853723
1.0     67151
Name: count, dtype: int64


In [5]:
column_indices = {name: i for i, name in enumerate(df.columns)}
num_features = df.shape[1]

from sklearn.model_selection import train_test_split

features_columns = df.drop(columns=[SEQ_LABEL]).columns.copy()

# Full feature matrix and label vector, before any balancing or splitting.
# (X_temp / y_temp are re-bound to the held-out split further down.)
X_temp, y_temp = df.drop(columns=[SEQ_LABEL]).values, df[SEQ_LABEL].values

# Row positions of each class (vectorized; avoids a Python loop over every row).
zero_indices = np.flatnonzero(y_temp == 0)
one_indices = np.flatnonzero(y_temp == 1)

# Determine the smaller class size
min_class_size = min(len(zero_indices), len(one_indices))

# Undersample both classes to the minority size so they contribute equally.
np.random.seed(42)
zero_sample = np.random.choice(zero_indices, min_class_size, replace=False)
one_sample = np.random.choice(one_indices, min_class_size, replace=False)

# Combine the sampled indices
balanced_indices = np.concatenate([zero_sample, one_sample])

# NOTE(review): rows are sampled and then shuffled individually, so consecutive
# rows of the resulting train/val/test frames are no longer consecutive in
# time. Stage1WindowGenerator later slices these frames into windows of
# adjacent rows -- confirm that windows of unrelated candles are intended.
X_balanced = X_temp[balanced_indices]
y_balanced = y_temp[balanced_indices]
print(len(balanced_indices))

# 70 % training, 30 % held out for validation + test.
X_train, X_temp, y_train, y_temp = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, shuffle=True)

# Split the held-out part: roughly 2/3 validation, 1/3 test.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.333, random_state=16, shuffle=True)

# Verify the sizes and the class balance
print(f"Total zeros: {len(zero_indices)}, Total ones: {len(one_indices)}")
print(f"Training data size: {len(X_train)}, Training target size: {len(y_train)}")
print(f"Validation data size: {len(X_val)}, Validation target size: {len(y_val)}")
print(f"Testing data size: {len(X_test)}, Testing target size: {len(y_test)}")
print(f"Class balance in training set: {np.sum(np.array(y_train) == 0)} zeros and {np.sum(np.array(y_train) == 1)} ones")

def create_df(features, targets, feature_labels, target_label):
    """Assemble a DataFrame from a feature matrix and its target vector.

    Defined as a plain module-level function: ``@staticmethod`` has no meaning
    outside a class body and makes the name uncallable on Python < 3.10.

    Args:
        features: 2-D array-like, one row per sample.
        targets: 1-D array-like with one target per sample.
        feature_labels: Column names for ``features``.
        target_label: Name of the column that receives ``targets``.

    Returns:
        DataFrame with the feature columns followed by the target column.
    """
    out_df = pd.DataFrame(data=features, columns=feature_labels)
    out_df[target_label] = targets
    return out_df
    
# Rebuild one DataFrame per split (feature columns + SEQ_LABEL); these frames
# become the default data of Stage1WindowGenerator defined below.
train_df = create_df(X_train, y_train, features_columns, SEQ_LABEL)
val_df = create_df(X_val, y_val, features_columns, SEQ_LABEL)
test_df = create_df(X_test, y_test, features_columns, SEQ_LABEL)


134302
Total zeros: 853723, Total ones: 67151
Training data size: 94011, Training target size: 94011
Validation data size: 26874, Validation target size: 26874
Testing data size: 13417, Testing target size: 13417
Class balance in training set: 46905 zeros and 47106 ones


In [6]:
class Stage1WindowGenerator():
    """Builds windowed datasets of (inputs, labels) for the Stage 1 model.

    Each window is ``input_width`` adjacent rows of a split DataFrame. Inputs
    are the ``features_columns`` of every row in the window; labels are the
    ``label_columns`` of the window's last row.

    NOTE(review): ``normalize_inputs`` is stored but never read in this class
    (the call in ``split_window`` is commented out), so ``normalize`` is
    currently unused here.
    """
    def __init__(self, input_width, label_columns, features_columns,
                 train_df=train_df, val_df=val_df, test_df=test_df,
                  batch_size=32):
        """Store the data splits and the window geometry.

        Args:
            input_width: Number of adjacent rows per window.
            label_columns: Names of the target column(s).
            features_columns: Names of the input column(s).
            train_df, val_df, test_df: Data splits. The defaults are the
                notebook-level frames as they exist when this class statement
                runs (default values are evaluated once, at definition time).
            batch_size: Number of windows per batch.
        """
        # Store the raw data
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df

        # Column name -> positional index, taken from the training frame.
        self.column_indices = {name: i for i, name in enumerate(train_df.columns)}
        
        self.label_columns = label_columns     
           
        self.features_columns = features_columns
        

        # Compute the window parameters
        self.input_width = input_width

        # The window holds inputs only; there is no separate label offset.
        self.total_window_size = input_width 

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.batch_size = batch_size
        self.normalize_inputs = False

    def __repr__(self):
        """Summarise the window size, input index range and column names."""
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {min(self.input_indices)} - {max(self.input_indices)}',
            f'Label column name(s): {self.label_columns}',
            f'Feature column name(s): {self.features_columns}'
        ])
    
    @tf.function
    def normalize(self, inputs):
        """Rescale every sequence in ``inputs`` relative to its own mean.

        For each element along the first axis, the mean over that element's
        first axis is computed and the element becomes ``(x - mean) / mean``.
        There is no guard against a zero mean.
        """
        def normalize_sequence(sequence):
            # Despite the name, this is the mean over the whole sequence.
            rolling_mean = tf.reduce_mean(sequence, axis=0, keepdims=True)
            return (sequence - rolling_mean) / rolling_mean

        return tf.map_fn(normalize_sequence, inputs, fn_output_signature=tf.float32)

    def split_window(self, features):
        """Split a batch of windows into ``(inputs, labels)``.

        ``features`` is indexed as a rank-3 tensor (batch, row-in-window,
        column). Inputs keep every row of the window; labels are read from the
        last row only.
        """
        data = features[:, self.input_slice, :]
        inputs = tf.stack([data[:, :, self.column_indices[name]] for name in self.features_columns], axis=-1)
        labels = tf.stack([data[:, -1, self.column_indices[name]] for name in self.label_columns], axis=-1)

        # Normalisation is disabled for Stage 1:
        # if self.normalize_inputs:
        #     inputs = self.normalize(tf.cast(inputs, tf.float32))  # cast inputs to float32

        # Slicing loses static shape information; restore it explicitly.
        inputs.set_shape([None, self.input_width, len(self.features_columns)])
        labels.set_shape([None, len(self.label_columns)])

        return inputs, labels

    def make_dataset(self, data):
        """Turn a DataFrame into a batched dataset of ``(inputs, labels)``.

        Windows are built with stride 1 and without shuffling, then mapped
        through ``split_window``.
        """
        data = data.to_numpy()
        ds = keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=False,
            batch_size=self.batch_size
        )
        ds = ds.map(self.split_window)
        return ds

    @property
    def train(self):
        """Dataset built from ``train_df`` (rebuilt on every access)."""
        return self.make_dataset(self.train_df)

    @property
    def val(self):
        """Dataset built from ``val_df`` (rebuilt on every access)."""
        return self.make_dataset(self.val_df)

    @property
    def test(self):
        """Dataset built from ``test_df`` (rebuilt on every access)."""
        return self.make_dataset(self.test_df)

    @property
    def example(self):
        """First batch of the training dataset, cached after the first access."""
        result = getattr(self, '_example', None)
        if result is None:
            # No example batch cached yet, so take one from the training dataset
            result = next(iter(self.train))
            # And cache it for next time
            self._example = result
        return result


In [7]:
# Stage 1 target is the sequence-quality flag; every other column of df is an input.
label_columns=[SEQ_LABEL]
# features_columns=['open', "close"]
features_columns= [col for col in df.columns if col not in label_columns]
num_labels = len(label_columns)
num_features = len(features_columns)
# Number of rows per input window.
input_width = 30

# train/val/test frames are not passed, so the class defaults are used.
window = Stage1WindowGenerator( input_width=input_width, label_columns= label_columns , features_columns=features_columns, batch_size=2048)

# Bare expression: displays the generator's __repr__.
window

Total window size: 30
Input indices: 0 - 29
Label column name(s): ['seq_quality_5']
Feature column name(s): ['open', 'high', 'low', 'close']

In [10]:
# Define the model: a 1-D convolution feeding two stacked LSTMs, then a small
# dense head ending in a single sigmoid unit (one probability per window).
model = keras.models.Sequential([
    keras.layers.Conv1D(32, 3, activation='relu'),
    keras.layers.LSTM(64, return_sequences=True),
    # Second LSTM returns only its final state, collapsing the time axis.
    keras.layers.LSTM(32, return_sequences=False),
    # keras.layers.Dense(128, activation='relu'),
    # keras.layers.Dense(128, activation='relu'),
    # keras.layers.Dropout(0.2),
    # keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.2),
    # keras.layers.TimeDistributed( keras.layers.Dense(1, activation='sigmoid'))
    keras.layers.Dense(1, activation='sigmoid')
])

def custom_binary_loss(y_true, y_pred):
    """Cross-entropy on hard (rounded) predictions plus a snap-to-0/1 penalty.

    The loss is the mean of two terms:
      * binary cross-entropy between ``y_true`` and ``round(y_pred)``;
      * the squared distance between ``y_pred`` and its rounded value.

    NOTE(review): the cross-entropy term only sees the rounded predictions;
    confirm that it still provides a useful gradient before enabling this
    loss (it is only referenced from a commented-out ``compile`` call).
    """
    backend = keras.backend

    # Hard 0/1 decision derived from the soft prediction.
    hard_pred = backend.round(y_pred)

    # Pull the soft prediction towards whichever hard value it rounds to.
    snap_penalty = backend.square(y_pred - hard_pred)

    # Cross-entropy evaluated on the hard decision.
    hard_bce = backend.binary_crossentropy(y_true, hard_pred)

    return backend.mean(hard_bce + snap_penalty)


# Standard binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
# Alternative using the experimental loss defined above:
# model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=custom_binary_loss, metrics=['accuracy'])


In [11]:
# Create directories for saving model checkpoints and logs if they don't exist
# Example of training with the datasets
import os
import utils
# Make sure the checkpoint and log directories exist, then start from empty logs.
os.makedirs("models", exist_ok=True)
os.makedirs('.logs', exist_ok=True)
utils.clear_directory('.logs')

# NOTE(review): ReduceLROnPlateau waits 3 stagnant epochs but EarlyStopping
# stops after 2, both on val_loss -- the learning-rate reduction will rarely,
# if ever, trigger. Confirm the intended patience values.
history = model.fit(window.train, epochs=10,
                        validation_data=window.val,
                        callbacks=[
                            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, mode='min'),
                            tf.keras.callbacks.ModelCheckpoint(filepath=f"models/{ASSET}.st1.keras", monitor='val_loss', save_best_only=True),
                            tf.keras.callbacks.TensorBoard(log_dir='.logs'),
                            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)
                        ])

Epoch 1/10


Epoch 2/10
Epoch 3/10


In [None]:
# Stage 1 predictions on the test windows (one sigmoid output per window).
result = model.predict(window.test, verbose=1)
result

## STAGE 2
##### Predicting behaviour

In [None]:
def agregar_labels(df, window_size=6):
    """Add a binary look-ahead label column to ``df`` (modifies ``df`` in place).

    The new column ``label_<window_size>`` is 1 when the close ``window_size``
    rows ahead is strictly higher than the current close, and 0 otherwise.

    Rows that have no close ``window_size`` rows ahead (the tail of the frame)
    have no defined label and are removed. Without the explicit mask below the
    comparison against the missing value evaluates to False, so those rows
    would be kept and silently labelled 0.

    Args:
        df: DataFrame with a ``close`` column.
        window_size: Look-ahead distance in rows.

    Returns:
        The same DataFrame object, with the integer label column added and
        every row containing a NaN (in any column) dropped.
    """
    label_name = f'label_{window_size}'
    future_close = df['close'].shift(-window_size)

    # Keep the label NaN where the future close is unknown so that dropna()
    # actually removes those rows.
    df[label_name] = (future_close > df['close']).astype(float).where(future_close.notna())
    df.dropna(inplace=True)
    df[label_name] = df[label_name].astype(int)
    return df
# Look-ahead distance (rows) for the Stage 2 up/down label.
FORWARD_WINDOW = 6
LABEL = f"label_{FORWARD_WINDOW}"
# Add the label column to the DataFrame
df = agregar_labels(df, FORWARD_WINDOW)

from feature_processor import FeatureProcessor

# Enrich df with the project's engineered feature groups; what each helper
# adds is defined in feature_processor (not visible here).
df = FeatureProcessor.add_ta_features(df)
df = FeatureProcessor.add_bar_features(df)
df = FeatureProcessor.add_adj_features(df)
df = FeatureProcessor.add_mv_avg_features(df)
df = FeatureProcessor.add_time_features(df)
# Remove rows left with NaN after feature generation (in place).
df.dropna(inplace=True)

In [None]:
# Inspect the first rows of the enriched frame.
df.head()

In [None]:
# from sklearn.utils import class_weight

# class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# class_weights_dict = dict(enumerate(class_weights))


In [None]:
# Assumes the DataFrame is named df.
# Replace 'label_6' with the name of the column you want to analyse.

# label_column = LABEL
label_column = SEQ_LABEL
# Class distribution of the selected column.
count_values = df[label_column].value_counts()

print(f"Conteo de valores en la columna {label_column}:")
print(count_values)


In [None]:
column_indices = {name: i for i, name in enumerate(df.columns)}

# Positional split that keeps row order: first 70 % train, next 20 %
# validation, final 10 % test. This re-binds the Stage 1 train/val/test frames.
n = len(df)
train_df = df[0         :int(n*0.7)]
val_df   = df[int(n*0.7):int(n*0.9)]
test_df  = df[int(n*0.9):          ]

# Counts every column of df, label columns included.
num_features = df.shape[1]
print(f"Features count: {num_features}")
print(f"Train count: {len(train_df)}")
print(f"Validation count: {len(val_df)}")
print(f"Test count: {len(test_df)}")

In [None]:
class WindowGenerator():
    """Builds windowed datasets of (inputs, labels) for the Stage 2 model.

    Each window is ``input_width`` adjacent rows of a split DataFrame. Inputs
    are the ``features_columns`` of every row in the window; labels are the
    ``label_columns`` of the window's last row. Set ``normalize_inputs`` to
    True to rescale every window relative to its own per-column mean.
    """
    def __init__(self, input_width, label_columns, features_columns,
                 train_df=train_df, val_df=val_df, test_df=test_df,
                  batch_size=32):
        """Store the data splits and the window geometry.

        Args:
            input_width: Number of adjacent rows per window.
            label_columns: Names of the target column(s).
            features_columns: Names of the input column(s).
            train_df, val_df, test_df: Data splits. The defaults are the
                notebook-level frames as they exist when this class statement
                runs (default values are evaluated once, at definition time).
            batch_size: Number of windows per batch.
        """
        # Store the raw data
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df

        # Column name -> positional index, taken from the training frame.
        self.column_indices = {name: i for i, name in enumerate(train_df.columns)}

        self.label_columns = label_columns
        self.features_columns = features_columns

        # Compute the window parameters. The window holds inputs only; there
        # is no separate label offset.
        self.input_width = input_width
        self.total_window_size = input_width

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.batch_size = batch_size
        self.normalize_inputs = False

    def __repr__(self):
        """Summarise the window size, input index range and column names."""
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {min(self.input_indices)} - {max(self.input_indices)}',
            f'Label column name(s): {self.label_columns}',
            f'Feature column name(s): {self.features_columns}'
        ])

    @tf.function
    def normalize(self, inputs):
        """Rescale every window relative to its own per-column mean.

        Args:
            inputs: Float tensor indexed as (batch, row-in-window, column).

        Returns:
            ``(inputs - mean) / mean`` where ``mean`` is taken over the rows
            of each window. Columns whose window mean is exactly zero yield 0
            instead of NaN/inf, so a constant-zero feature cannot poison the
            batch.
        """
        # One batched reduction over the row axis replaces a per-window map.
        window_mean = tf.reduce_mean(inputs, axis=1, keepdims=True)
        return tf.math.divide_no_nan(inputs - window_mean, window_mean)

    def split_window(self, features):
        """Split a batch of windows into ``(inputs, labels)``.

        ``features`` is indexed as a rank-3 tensor (batch, row-in-window,
        column). Inputs keep every row of the window; labels are read from the
        last row only.
        """
        data = features[:, self.input_slice, :]
        inputs = tf.stack([data[:, :, self.column_indices[name]] for name in self.features_columns], axis=-1)
        labels = tf.stack([data[:, -1, self.column_indices[name]] for name in self.label_columns], axis=-1)

        if self.normalize_inputs:
            # normalize works in float32, so cast first.
            inputs = self.normalize(tf.cast(inputs, tf.float32))

        # Slicing loses static shape information; restore it explicitly.
        inputs.set_shape([None, self.input_width, len(self.features_columns)])
        labels.set_shape([None, len(self.label_columns)])

        return inputs, labels

    def make_dataset(self, data):
        """Turn a DataFrame into a batched dataset of ``(inputs, labels)``.

        Windows are built with stride 1 and without shuffling, then mapped
        through ``split_window``.
        """
        data = data.to_numpy()
        ds = keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=False,
            batch_size=self.batch_size
        )
        ds = ds.map(self.split_window)
        return ds

    @property
    def train(self):
        """Dataset built from ``train_df`` (rebuilt on every access)."""
        return self.make_dataset(self.train_df)

    @property
    def val(self):
        """Dataset built from ``val_df`` (rebuilt on every access)."""
        return self.make_dataset(self.val_df)

    @property
    def test(self):
        """Dataset built from ``test_df`` (rebuilt on every access)."""
        return self.make_dataset(self.test_df)

    @property
    def example(self):
        """First batch of the training dataset, cached after the first access."""
        result = getattr(self, '_example', None)
        if result is None:
            # No example batch cached yet, so take one from the training dataset
            result = next(iter(self.train))
            # And cache it for next time
            self._example = result
        return result


In [None]:
label_columns=[LABEL]
# features_columns=['open', "close"]

# SEQ_LABEL must not be an input: it is computed from the highs/lows of the
# following candles and from a later close (see check_conditions_vectorized),
# so feeding it to a model that predicts a later close leaks the future.
excluded_columns = set(label_columns) | {SEQ_LABEL}
features_columns = [col for col in df.columns if col not in excluded_columns]
num_labels = len(label_columns)
num_features = len(features_columns)
# Number of rows per input window.
input_width = 120

# train/val/test frames are not passed, so the class defaults are used.
window = WindowGenerator( input_width=input_width, label_columns= label_columns , features_columns=features_columns, batch_size=2048)

# Bare expression: displays the generator's __repr__.
window

In [None]:
# Spot check: label (element [1] of the example batch) of the window at position 9.
window.example[1][9]

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

def create_lstm_model(input_shape, num_labels):
    """Build an LSTM classifier with an attention gate between the LSTMs.

    Args:
        input_shape: ``(timesteps, features)`` of one input window.
        num_labels: Number of independent binary targets.

    Returns:
        An uncompiled Keras functional model whose output has ``num_labels``
        sigmoid units, i.e. one probability per target.
    """
    inputs = tf.keras.Input(shape=input_shape)

    x = layers.LSTM(64, return_sequences=True)(inputs)
    x = attention_gate(x)
    x = layers.LSTM(32, return_sequences=False)(x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    # NOTE(review): a softmax layer as wide as the number of timesteps is an
    # unusual hidden layer -- kept as is, confirm it is intended.
    x = layers.Dense(input_shape[0], activation='softmax')(x)
    # Sigmoid, not softmax: softmax normalises across the output units, so
    # with a single unit it always emits exactly 1.0 and nothing can be learnt.
    outputs = layers.Dense(num_labels, activation='sigmoid')(x)

    model = models.Model(inputs=inputs, outputs=outputs)
    return model

def attention_gate(inputs):
    """Re-weight each timestep of ``inputs`` by a learnt attention score.

    A single-unit dense layer scores every timestep, the scores are
    softmax-normalised across timesteps, broadcast over the feature axis and
    multiplied element-wise with ``inputs``.

    Args:
        inputs: Keras tensor indexed as (batch, timesteps, features).

    Returns:
        Tensor of the same shape as ``inputs``.
    """
    attention = layers.Dense(1, activation='tanh')(inputs)
    attention = layers.Flatten()(attention)
    attention = layers.Activation('softmax')(attention)
    # Repeat the per-timestep weights once per feature, then swap the axes
    # back to (timesteps, features) so they line up with ``inputs``.
    attention = layers.RepeatVector(inputs.shape[-1])(attention)
    attention = layers.Permute([2, 1])(attention)
    attended_output = layers.Multiply()([inputs, attention])
    return attended_output

# Define input shape and number of labels
input_shape = (input_width, num_features)  # Adjust based on your data

# Create the model
model = create_lstm_model(input_shape, num_labels)

# Compile the model. The targets are 0/1 columns, so binary cross-entropy is
# the matching loss for the sigmoid outputs (categorical cross-entropy on a
# single column is degenerate).
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Display the model summary
model.summary()


In [None]:
# Render the model graph (with tensor shapes and layer names) to model_plot.png.
keras.utils.plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Plain stacked-LSTM variant; replaces the attention model built above.
model = keras.models.Sequential([
    # keras.layers.Conv1D(32, 3, activation='relu'),
    # keras.layers.BatchNormalization(),
    keras.layers.LSTM(64, return_sequences=True),
    # keras.layers.BatchNormalization(),
    keras.layers.LSTM(32, return_sequences=False),
    # keras.layers.BatchNormalization(),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.2),
    # NOTE(review): a softmax layer as wide as the window length is an unusual
    # hidden layer -- kept as is, confirm it is intended.
    keras.layers.Dense(input_width, activation='softmax'),
    # Sigmoid, not softmax: softmax normalises across the output units, so
    # with a single unit it always emits exactly 1.0 and nothing can be learnt.
    keras.layers.Dense(num_labels, activation='sigmoid')
])

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
# model.compile(optimizer=keras.optimizers.SGD(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])




In [None]:
# Toggle per-window mean normalisation of the inputs (read by WindowGenerator.split_window).
window.normalize_inputs = False

In [None]:
# Create directories for saving model checkpoints and logs if they don't exist
# Example of training with the datasets
import os
import utils
# Make sure the checkpoint and log directories exist, then start from empty logs.
os.makedirs("models", exist_ok=True)
os.makedirs('.logs', exist_ok=True)
utils.clear_directory('.logs')

# NOTE(review): ReduceLROnPlateau waits 3 stagnant epochs but EarlyStopping
# stops after 2, both on val_loss -- the learning-rate reduction will rarely,
# if ever, trigger. Confirm the intended patience values.
history = model.fit(window.train, epochs=10,
                        validation_data=window.val,
                        callbacks=[
                            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, mode='min'),
                            tf.keras.callbacks.ModelCheckpoint(filepath=f"models/{ASSET}.lstm.keras", monitor='val_loss', save_best_only=True),
                            tf.keras.callbacks.TensorBoard(log_dir='.logs'),
                            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)
                        ])

In [None]:
# Evaluate the model on the held-out test split (window.test), and label the
# printed numbers accordingly -- these are not validation metrics.
test_loss, test_accuracy = model.evaluate(window.test, verbose=1)

# Convert accuracy to percentage
test_accuracy_percentage = test_accuracy * 100

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Accuracy %: {test_accuracy_percentage:.2f}%")

### Stage 2

In [None]:
# Predictions of the current model on the training windows.
result = model.predict(window.train, verbose=1)

In [None]:
# Display the raw prediction array.
result

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


In [None]:
def agregar_labels(df, window_size=6):
    """Add a binary look-ahead label column to ``df`` (modifies ``df`` in place).

    The new column ``label_<window_size>`` is 1 when the close ``window_size``
    rows ahead is strictly higher than the current close, and 0 otherwise.

    Rows that have no close ``window_size`` rows ahead (the tail of the frame)
    have no defined label and are removed. Without the explicit mask below the
    comparison against the missing value evaluates to False, so those rows
    would be kept and silently labelled 0.

    Args:
        df: DataFrame with a ``close`` column.
        window_size: Look-ahead distance in rows.

    Returns:
        The same DataFrame object, with the integer label column added and
        every row containing a NaN (in any column) dropped.
    """
    label_name = f'label_{window_size}'
    future_close = df['close'].shift(-window_size)

    # Keep the label NaN where the future close is unknown so that dropna()
    # actually removes those rows.
    df[label_name] = (future_close > df['close']).astype(float).where(future_close.notna())
    df.dropna(inplace=True)
    df[label_name] = df[label_name].astype(int)
    return df
# Look-ahead distance (rows) for the up/down label.
FORWARD_WINDOW = 6
LABEL = f"label_{FORWARD_WINDOW}"
# Add the label column to the DataFrame
df = agregar_labels(df, FORWARD_WINDOW)

In [None]:
# Normalize the data
# scaler = StandardScaler()
# scaler = Normalizer()
scaler = MinMaxScaler()
# df2 = df[LABEL]
# NOTE(review): the scaler is fitted on every column of df, label columns
# included -- confirm that the labels are meant to be part of the
# clustering input.
normalized_data = scaler.fit_transform(df)
# Scaled copy of df with the same column names.
df2 = pd.DataFrame(data=normalized_data, columns=df.columns)
# normalized_data = df.values



In [None]:
# Natural log of every column of df.
# NOTE(review): df contains 0/1 label columns, for which np.log(0) is -inf;
# confirm whether only the price columns should be transformed.
df_log = df.apply(np.log)

In [None]:
# Display the log-transformed frame.
df_log

In [None]:
# Define the number of clusters
num_clusters = 3  # Adjust based on your specific needs

# Apply K-means clustering to the min-max scaled data and store the cluster
# id of every row as a new 'cluster' column of df.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(normalized_data)



In [None]:

# Scatter two columns of df (positions 3 and 4), coloured by cluster id.
# The axis labels are taken from the actual column names so the figure always
# says what is plotted.
plt.scatter(df.iloc[:, 3], df.iloc[:, 4], c=df['cluster'], cmap='viridis')
plt.xlabel(str(df.columns[3]))
plt.ylabel(str(df.columns[4]))
plt.title('K-means Clustering of OHLC Data with Additional Features')
plt.show()