# Header

In [2]:
import pandas as pd
import time
import numpy as np

# Metrics
from sklearn.metrics import accuracy_score, f1_score

In [3]:
from utils import train_val_split
from utils import train_datapath, test_datapath

# Ground-truth targets for the test rows; the first CSV column becomes the index
targets_for_test_df = pd.read_csv(
    filepath_or_buffer='data/targets_for_test.csv',
    index_col=0,
)

In [4]:
# Define function to create sequences
def create_sequences(data, num_timesteps):
    """Build overlapping sliding windows along the first axis of ``data``.

    Window ``i`` holds rows ``i .. i + num_timesteps - 1`` of ``data``, so an
    input with ``n`` rows yields ``n - num_timesteps + 1`` windows.

    Args:
        data: Array-like whose first axis is the row (time) axis.
        num_timesteps: Number of consecutive rows per window; must be >= 1.

    Returns:
        A new array of shape ``(n - num_timesteps + 1, num_timesteps, ...)``
        with the same dtype as ``data``. When ``data`` has fewer than
        ``num_timesteps`` rows the result has zero windows but keeps the
        trailing dimensions, i.e. shape ``(0, num_timesteps, ...)``.

    Raises:
        ValueError: If ``num_timesteps`` is smaller than 1.
    """
    if num_timesteps < 1:
        raise ValueError(f"num_timesteps must be >= 1, got {num_timesteps}")

    data = np.asarray(data)
    num_windows = len(data) - num_timesteps + 1
    if num_windows <= 0:
        # Keep the rank stable so callers can still read .shape[1] / .shape[2].
        return np.empty((0, num_timesteps) + data.shape[1:], dtype=data.dtype)

    # sliding_window_view appends the window axis last: (windows, ..., timesteps).
    windows = np.lib.stride_tricks.sliding_window_view(data, num_timesteps, axis=0)
    # Move the window axis next to the sample axis and materialise a writable,
    # independent copy (the view itself is read-only and aliases ``data``).
    return np.moveaxis(windows, -1, 1).copy()


# Base RNN

## Crude

In [6]:
# Raw ("crude") training and test tables; both paths are exported by utils
train_df = pd.read_csv(filepath_or_buffer=train_datapath)
test_df = pd.read_csv(filepath_or_buffer=test_datapath)

In [7]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from sklearn.preprocessing import StandardScaler
import tensorflow_addons as tfa


# Start a wall-clock timer; it is stopped right after validation inference
start_time = time.time()

# Window length used for every sequence built in this section
num_timesteps = 15

# Split the crude training table (train_val_split comes from utils; presumably
# it returns train/validation features and targets in this order -- confirm there)
X_train, y_train, X_val, y_val = train_val_split(train_df)

# Standardise features: fit on the training split only, then reuse the same
# statistics for the validation split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
num_features = X_train.shape[1]  # column count of the scaled training matrix

# Drop the first num_timesteps - 1 targets so that target i lines up with the
# last row of window i
y_train_seq = y_train[num_timesteps - 1:]
y_val_seq = y_val[num_timesteps - 1:]

# Overlapping windows shaped (samples, time steps, features)
X_train_seq = create_sequences(X_train, num_timesteps)
X_val_seq = create_sequences(X_val, num_timesteps)

# One SimpleRNN layer feeding a single sigmoid unit (binary classifier)
model = Sequential([
    SimpleRNN(50, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tfa.metrics.F1Score(num_classes=1, threshold=0.5)],
)

# Fit on the training windows, then score the validation windows
model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32)
y_pred_prob = model.predict(X_val_seq)

# Probabilities strictly above 0.5 become class 1, everything else class 0
y_pred = (y_pred_prob > 0.5).astype(int)

# Stop the timer and report the elapsed time as whole minutes plus seconds
end_time = time.time()
minutes, seconds = divmod(end_time - start_time, 60)
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')

# Validation accuracy on the window-aligned targets
accuracy = accuracy_score(y_val_seq, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Macro-averaged F1 on the same targets
f1_macro = f1_score(y_val_seq, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Time elapsed: 32m 31.6s
--------------------------------------
Validation Accuracy: 0.53143
Validation F1 Macro Score: 0.36083


In [10]:
# Preprocess the test features like the training features: drop the identifier
# column, then apply the scaler that was fitted on the training split
X_test = test_df.drop(columns=['row_id'])
X_test = scaler.transform(X_test)

# Overlapping windows shaped (samples, time steps, features)
X_test_seq = create_sequences(X_test, num_timesteps)

# One predicted probability per window
test_predictions_prob = model.predict(X_test_seq)

# Threshold at 0.5. Integer labels (not float32) keep the padded list below
# homogeneous and match how the other sections of this notebook encode predictions.
test_predictions = (test_predictions_prob > 0.5).astype(int)

# The first num_timesteps - 1 rows have no complete window and therefore no
# prediction; fill them with class 0 at the front so the list lines up
# row-for-row with targets_for_test_df
filled_test_predictions = (len(targets_for_test_df) - len(test_predictions)) * [0]  + test_predictions.flatten().tolist()

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, filled_test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, filled_test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.57593
Test F1 Macro Score: 0.43709


In [None]:
# Pair every test row id with its (front-padded) predicted class
submission_df = pd.DataFrame(
    dict(
        row_id=test_df['row_id'],
        target=filled_test_predictions,
    )
)

# Write the submission without the DataFrame index column
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")



ValueError: in user code:

    File "d:\01_GitHub\Directional-Forecasting-in-Cryptocurrencies\env_rnn\Lib\site-packages\keras\engine\training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File "d:\01_GitHub\Directional-Forecasting-in-Cryptocurrencies\env_rnn\Lib\site-packages\keras\engine\training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\01_GitHub\Directional-Forecasting-in-Cryptocurrencies\env_rnn\Lib\site-packages\keras\engine\training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File "d:\01_GitHub\Directional-Forecasting-in-Cryptocurrencies\env_rnn\Lib\site-packages\keras\engine\training.py", line 2111, in predict_step
        return self(x, training=False)
    File "d:\01_GitHub\Directional-Forecasting-in-Cryptocurrencies\env_rnn\Lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "d:\01_GitHub\Directional-Forecasting-in-Cryptocurrencies\env_rnn\Lib\site-packages\keras\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 15, 10), found shape=(None, 10)


In [None]:
# Release everything created by the crude-feature experiment, in the same
# order as before: raw tables and the train/validation split ...
del train_df, test_df
del X_train, y_train, X_val, y_val
# ... windowed inputs and their aligned targets ...
del X_train_seq, X_val_seq, y_train_seq, y_val_seq
# ... the fitted network and its validation outputs ...
del model, y_pred_prob, y_pred
# ... and the test-set arrays and predictions
del X_test, X_test_seq, test_predictions_prob, test_predictions

## SVD Features

In [None]:
# SVD feature tables for training and testing
svd_train = pd.read_csv(filepath_or_buffer='data/svd_train.csv')
svd_test = pd.read_csv(filepath_or_buffer='data/svd_test.csv')

In [None]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from sklearn.preprocessing import StandardScaler
import tensorflow_addons as tfa


# Start a wall-clock timer; it is stopped right after validation inference
start_time = time.time()

# Window length used for every sequence built in this section
num_timesteps = 15

# Split the SVD training table (train_val_split comes from utils; presumably
# it returns train/validation features and targets in this order -- confirm there)
X_train, y_train, X_val, y_val = train_val_split(svd_train)

# Standardise features: fit on the training split only, then reuse the same
# statistics for the validation split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
num_features = X_train.shape[1]  # column count of the scaled training matrix

# Drop the first num_timesteps - 1 targets so that target i lines up with the
# last row of window i
y_train_seq = y_train[num_timesteps - 1:]
y_val_seq = y_val[num_timesteps - 1:]

# Overlapping windows shaped (samples, time steps, features)
X_train_seq = create_sequences(X_train, num_timesteps)
X_val_seq = create_sequences(X_val, num_timesteps)

# One SimpleRNN layer feeding a single sigmoid unit (binary classifier)
model = Sequential([
    SimpleRNN(50, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tfa.metrics.F1Score(num_classes=1, threshold=0.5)],
)

# Fit on the training windows, then score the validation windows
model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32)
y_pred_prob = model.predict(X_val_seq)

# Probabilities strictly above 0.5 become class 1, everything else class 0
y_pred = (y_pred_prob > 0.5).astype(int)

# Stop the timer and report the elapsed time as whole minutes plus seconds
end_time = time.time()
minutes, seconds = divmod(end_time - start_time, 60)
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')

# Validation accuracy on the window-aligned targets
accuracy = accuracy_score(y_val_seq, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Macro-averaged F1 on the same targets
f1_macro = f1_score(y_val_seq, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Time elapsed: 29m 58.5s
--------------------------------------
Validation Accuracy: 0.51924
Validation F1 Macro Score: 0.50666


In [None]:
# Drop the identifier column and apply the scaler fitted on the SVD training
# split, so test features go through the same preprocessing
X_SVD_test = scaler.transform(svd_test.drop(columns=['row_id']))

# Overlapping windows shaped (samples, time steps, features)
X_SVD_test_seq = create_sequences(X_SVD_test, num_timesteps)

# One predicted probability per window, thresholded at 0.5 into 0/1 labels
test_predictions_prob = model.predict(X_SVD_test_seq)
test_predictions = (test_predictions_prob > 0.5).astype(int)

# Rows without a complete window get class 0 at the front so the list has one
# entry per row of targets_for_test_df
filled_test_predictions = (
    [0] * (len(targets_for_test_df) - len(test_predictions))
    + test_predictions.flatten().tolist()
)

# Test accuracy against the held-out targets
accuracy = accuracy_score(targets_for_test_df, filled_test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Macro-averaged F1 against the same targets
f1_macro = f1_score(targets_for_test_df, filled_test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.46368
Test F1 Macro Score: 0.45448


In [None]:

# Create a new DataFrame for the submission.
# The row ids come from svd_test, the table this section's predictions were made
# on; test_df has already been deleted by the crude section's cleanup cell, so
# referencing it here raised NameError.
submission_df = pd.DataFrame({
    'row_id': svd_test['row_id'],
    'target': filled_test_predictions
})

# Save the submission file.
# NOTE: this is the same path the crude section writes to, so an earlier
# submission.csv is overwritten.
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")



In [None]:
# Release everything created by the SVD experiment before the next section.
# The test arrays in this section are named X_SVD_test / X_SVD_test_seq; the
# previous list named X_test / X_test_seq, which no longer exist here, so the
# statement raised NameError and left the remaining arrays in memory.
del svd_train, svd_test
del X_train, y_train, X_val, y_val
del X_train_seq, X_val_seq, y_train_seq, y_val_seq
del model, y_pred_prob, y_pred
del X_SVD_test, X_SVD_test_seq, test_predictions_prob, test_predictions

## New Features

In [None]:
# "Treated" feature tables for training and testing
treated_train_df = pd.read_csv(filepath_or_buffer='data/treated_train.csv')
treated_test_df = pd.read_csv(filepath_or_buffer='data/treated_test.csv')

In [None]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from sklearn.preprocessing import StandardScaler
import tensorflow_addons as tfa


# Start a wall-clock timer; it is stopped right after validation inference
start_time = time.time()

# Window length used for every sequence built in this section
num_timesteps = 15

# Split the treated training table (train_val_split comes from utils; presumably
# it returns train/validation features and targets in this order -- confirm there)
X_train, y_train, X_val, y_val = train_val_split(treated_train_df)

# Standardise features: fit on the training split only, then reuse the same
# statistics for the validation split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
num_features = X_train.shape[1]  # column count of the scaled training matrix

# Drop the first num_timesteps - 1 targets so that target i lines up with the
# last row of window i
y_train_seq = y_train[num_timesteps - 1:]
y_val_seq = y_val[num_timesteps - 1:]

# Overlapping windows shaped (samples, time steps, features)
X_train_seq = create_sequences(X_train, num_timesteps)
X_val_seq = create_sequences(X_val, num_timesteps)

# One SimpleRNN layer feeding a single sigmoid unit (binary classifier)
treated_model = Sequential([
    SimpleRNN(50, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dense(1, activation='sigmoid'),
])
treated_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tfa.metrics.F1Score(num_classes=1, threshold=0.5)],
)

# Fit on the training windows, then score the validation windows
treated_model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32)
y_pred_prob = treated_model.predict(X_val_seq)

# Probabilities strictly above 0.5 become class 1, everything else class 0
y_pred = (y_pred_prob > 0.5).astype(int)

# Stop the timer and report the elapsed time as whole minutes plus seconds
end_time = time.time()
minutes, seconds = divmod(end_time - start_time, 60)
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')

# Validation accuracy on the window-aligned targets
accuracy = accuracy_score(y_val_seq, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Macro-averaged F1 on the same targets
f1_macro = f1_score(y_val_seq, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')


MemoryError: Unable to allocate 284. MiB for an array with shape (1693399, 22) and data type float64

In [None]:
# Preprocess the test features like the training features: drop the identifier
# column, then apply the scaler that was fitted on the treated training split
X_test = treated_test_df.drop(columns=['row_id'])
X_test = scaler.transform(X_test)

# Overlapping windows shaped (samples, time steps, features)
X_test_seq = create_sequences(X_test, num_timesteps)

# Score with treated_model, the network trained on the treated features in this
# section. The previous code called `model`, which belongs to an earlier
# section and has been deleted by the time this cell runs.
test_predictions_prob = treated_model.predict(X_test_seq)

# Probabilities strictly above 0.5 become class 1, everything else class 0
test_predictions = (test_predictions_prob > 0.5).astype(int)

# The first num_timesteps - 1 rows have no complete window and therefore no
# prediction; fill them with class 0 at the front so the list lines up
# row-for-row with targets_for_test_df
filled_test_predictions = (len(targets_for_test_df) - len(test_predictions)) * [0]  + test_predictions.flatten().tolist()

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, filled_test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, filled_test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

In [None]:
# Release everything created by the treated-feature experiment, in the same
# order as before: raw tables and the train/validation split ...
del treated_train_df, treated_test_df
del X_train, y_train, X_val, y_val
# ... windowed inputs and their aligned targets ...
del X_train_seq, X_val_seq, y_train_seq, y_val_seq
# ... the fitted network and its validation outputs ...
del treated_model, y_pred_prob, y_pred
# ... and the test-set arrays and predictions
del X_test, X_test_seq, test_predictions_prob, test_predictions

## Only new features

In [None]:
# Tables restricted to the newly engineered features, for training and testing
new_features_train_df = pd.read_csv(filepath_or_buffer='data/new_features_train.csv')
new_features_test_df = pd.read_csv(filepath_or_buffer='data/new_features_test.csv')

In [None]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from sklearn.preprocessing import StandardScaler
import tensorflow_addons as tfa


# Start a wall-clock timer; it is stopped right after validation inference
start_time = time.time()

# Window length used for every sequence built in this section
num_timesteps = 15

# Split the new-features training table (train_val_split comes from utils; presumably
# it returns train/validation features and targets in this order -- confirm there)
X_train, y_train, X_val, y_val = train_val_split(new_features_train_df)

# Standardise features: fit on the training split only, then reuse the same
# statistics for the validation split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
num_features = X_train.shape[1]  # column count of the scaled training matrix

# Drop the first num_timesteps - 1 targets so that target i lines up with the
# last row of window i
y_train_seq = y_train[num_timesteps - 1:]
y_val_seq = y_val[num_timesteps - 1:]

# Overlapping windows shaped (samples, time steps, features)
X_train_seq = create_sequences(X_train, num_timesteps)
X_val_seq = create_sequences(X_val, num_timesteps)

# One SimpleRNN layer feeding a single sigmoid unit (binary classifier)
only_model = Sequential([
    SimpleRNN(50, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dense(1, activation='sigmoid'),
])
only_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tfa.metrics.F1Score(num_classes=1, threshold=0.5)],
)

# Fit on the training windows, then score the validation windows
only_model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32)
y_pred_prob = only_model.predict(X_val_seq)

# Probabilities strictly above 0.5 become class 1, everything else class 0
y_pred = (y_pred_prob > 0.5).astype(int)

# Stop the timer and report the elapsed time as whole minutes plus seconds
end_time = time.time()
minutes, seconds = divmod(end_time - start_time, 60)
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')

# Validation accuracy on the window-aligned targets
accuracy = accuracy_score(y_val_seq, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Macro-averaged F1 on the same targets
f1_macro = f1_score(y_val_seq, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')


array([[1],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [None]:
# Preprocess the test features like the training features: drop the identifier
# column, then apply the scaler that was fitted on the new-features training split
X_test = new_features_test_df.drop(columns=['row_id'])
X_test = scaler.transform(X_test)

# Overlapping windows shaped (samples, time steps, features)
X_test_seq = create_sequences(X_test, num_timesteps)

# Score with only_model, the network trained on the new features in this
# section. The previous code called `model`, which belongs to an earlier
# section and has been deleted by the time this cell runs.
test_predictions_prob = only_model.predict(X_test_seq)

# Probabilities strictly above 0.5 become class 1, everything else class 0
test_predictions = (test_predictions_prob > 0.5).astype(int)

# The first num_timesteps - 1 rows have no complete window and therefore no
# prediction; fill them with class 0 at the front so the list lines up
# row-for-row with targets_for_test_df
filled_test_predictions = (len(targets_for_test_df) - len(test_predictions)) * [0]  + test_predictions.flatten().tolist()

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, filled_test_predictions)
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, filled_test_predictions, average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')