# Header

In [1]:
import pandas as pd
import time
import numpy as np



from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from keras.layers import LSTM
from sklearn.preprocessing import StandardScaler
import tensorflow_addons as tfa
import time

# Metrics
from sklearn.metrics import accuracy_score, f1_score



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [2]:
from utils import train_val_split
from utils import evaluate_model
from utils import train_datapath, test_datapath

targets_for_test_df = pd.read_csv('data/targets_for_test.csv', index_col=0)

ImportError: cannot import name 'evaluate_model' from 'utils' (d:\01_GitHub\Directional-Forecasting-in-Cryptocurrencies\utils.py)

In [None]:
# Define function to create sequences
def create_sequences(data, num_timesteps):
    sequences = []
    for i in range(len(data) - num_timesteps + 1):
        sequences.append(data[i:i+num_timesteps])
    return np.array(sequences)


def evaluate_model_performance(model, X_val_seq, y_val_seq):
    # Predict probabilities
    y_pred_prob = model.predict(X_val_seq)

    # Convert probabilities to binary predictions
    y_pred = (y_pred_prob > 0.5).astype(int)
    
    y_val = y_val_seq.copy()
    filled_y_pred = y_pred.copy()
    if len(y_val_seq) == (y_pred):
        pass
    elif len(y_val_seq) > len(y_pred):
        difference = y_val_seq - y_pred
        filled_y_pred = np.concatenate([np.zeros(difference), filled_y_pred])
    else: 
        y_val = np.concatenate([0], y_val)
        
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val_seq, filled_y_pred)
    print(f'Validation Accuracy: {accuracy:.5f}')

    # Calculate F1 macro score
    f1_macro = f1_score(y_val_seq, filled_y_pred, average='macro')
    print(f'Validation F1 Macro Score: {f1_macro:.5f}')

    return y_pred, y_pred_prob


def save_submission(test_df, filled_test_predictions, filename='submission.csv'):
    # Create a new DataFrame for the submission
    submission_df = pd.DataFrame({
        'row_id': test_df['row_id'],
        'target': [0] + filled_test_predictions
    })

    # Save the submission file
    submission_df.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")





# Base RNN

In [None]:

def train_rnn_model(train_df, num_timesteps=15, epochs=10, batch_size=32):
    start_time = time.time()

    # Extract features and target
    X_train, y_train, X_val, y_val = train_val_split(train_df)

    # Scale the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Reshape data to fit RNN input requirements (samples, time steps, features)
    num_features = X_train.shape[1]  # Assuming X_train has 10 features

    # Generate sequences
    X_train_seq = create_sequences(X_train, num_timesteps)
    X_val_seq = create_sequences(X_val, num_timesteps)

    # Adjust y_train and y_val accordingly
    y_train_seq = y_train[num_timesteps - 1:]
    y_val_seq = y_val[num_timesteps - 1:]

    # Define the RNN model
    model = Sequential()
    model.add(SimpleRNN(50, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tfa.metrics.F1Score(num_classes=1, threshold=0.5)])

    # Train the model
    model.fit(X_train_seq, y_train_seq, epochs=epochs, batch_size=batch_size)
    
    end_time = time.time()
    minutes = (end_time - start_time) // 60
    seconds = (end_time - start_time) % 60

    print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
    print('--------------------------------------')

    evaluate_model_performance(model, X_val_seq, y_val_seq)
    return model, scaler

## Crude

In [None]:
# Load the training data and train the model
train_df = pd.read_csv(train_datapath)
model, scaler = train_rnn_model(train_df)
del train_df

# Load the test data
test_df = pd.read_csv(test_datapath)
y_pred, ypred_prob = evaluate_model_performance(model, scaler, test_df)

# Save the submission
save_submission(test_df, y_pred, 'crude_rnn_submission.csv')

# Save predictions and probabilities
predictions_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': y_pred,
    'probability': ypred_prob
})
predictions_df.to_csv('crude_rnn_prob_predictions.csv', index=False)

del model, scaler, test_df

MemoryError: Unable to allocate 1.90 GiB for an array with shape (1697936, 15, 10) and data type float64

## SVD Features

In [None]:
# Load the training data and train the model
svd_train = pd.read_csv('data/svd_train.csv')
model, scaler = train_rnn_model(svd_train)
del svd_train

# Load the test data
svd_test = pd.read_csv('data/svd_test.csv')
y_pred, ypred_prob = evaluate_model_performance(model, scaler, svd_test)

# Save the submission
save_submission(test_df, y_pred, 'svd_rnn_submission.csv')


# Save predictions and probabilities
predictions_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': y_pred,
    'probability': ypred_prob
})
predictions_df.to_csv('crude_rnn_prob_predictions.csv', index=False)

del model, scaler, svd_test


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Time elapsed: 33m 41.0s
--------------------------------------
Validation Accuracy: 0.53228
Validation F1 Macro Score: 0.41841


## Only new features

In [None]:
# Load the training data and train the model
new_features_train_df = pd.read_csv('data/new_features_train.csv')

model, scaler = train_rnn_model(new_features_train_df)
del new_features_train_df

# Load the test data
new_features_test_df = pd.read_csv('data/new_features_test.csv')
y_pred, ypred_prob = evaluate_model_performance(model, scaler, new_features_test_df)

# Save the submission
save_submission(test_df, y_pred, 'only_new_rnn_submission.csv')


# Save predictions and probabilities
predictions_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': y_pred,
    'probability': ypred_prob
})
predictions_df.to_csv('crude_rnn_prob_predictions.csv', index=False)

del model, scaler, new_features_train_df

## New Features

In [None]:
# Load the training data and train the model
treated_train_df = pd.read_csv('data/treated_train.csv')

model, scaler = train_rnn_model(new_features_train_df)
del svd_train

# Load the test data
treated_test_df = pd.read_csv('data/treated_test.csv')
y_pred, ypred_prob = evaluate_model_performance(model, scaler, new_features_test_df)

# Save the submission
save_submission(test_df, y_pred, 'new_feat_rnn_submission.csv')


# Save predictions and probabilities
predictions_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': y_pred,
    'probability': ypred_prob
})
predictions_df.to_csv('crude_rnn_prob_predictions.csv', index=False)

del model, scaler, svd_test

# LSTM

In [None]:
def train_lstm_model(train_df, num_timesteps=15, epochs=10, batch_size=32):
    start_time = time.time()

    # Extract features and target
    X_train, y_train, X_val, y_val = train_val_split(train_df)

    # Scale the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Reshape data to fit LSTM input requirements (samples, time steps, features)
    num_features = X_train.shape[1]  # Assuming X_train has 10 features

    # Generate sequences
    X_train_seq = create_sequences(X_train, num_timesteps)
    X_val_seq = create_sequences(X_val, num_timesteps)

    # Adjust y_train and y_val accordingly
    y_train_seq = y_train[num_timesteps - 1:]
    y_val_seq = y_val[num_timesteps - 1:]

    # Define the LSTM model
    lstm_model = Sequential()
    lstm_model.add(LSTM(50, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])))
    lstm_model.add(Dense(1, activation='sigmoid'))
    lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tfa.metrics.F1Score(num_classes=1, threshold=0.5)])

    # Train the model
    lstm_model.fit(X_train_seq, y_train_seq, epochs=epochs, batch_size=batch_size)

    end_time = time.time()
    minutes = (end_time - start_time) // 60
    seconds = (end_time - start_time) % 60
    print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
    print('--------------------------------------')

    evaluate_model_performance(model, X_val_seq, y_val_seq)
    
    return lstm_model, scaler


## Crude

In [None]:
# Load the training data and train the model
train_df = pd.read_csv(train_datapath)
model, scaler = train_lstm_model(train_df)
del train_df

# Load the test data
test_df = pd.read_csv(test_datapath)
y_pred, ypred_prob = evaluate_model_performance(model, scaler, test_df)

# Save the submission
save_submission(test_df, y_pred, 'crude_rnn_submission.csv')


# Save predictions and probabilities
predictions_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': y_pred,
    'probability': ypred_prob
})
predictions_df.to_csv('crude_rnn_predictions.csv', index=False)

del model, scaler, test_df

MemoryError: Unable to allocate 1.90 GiB for an array with shape (1697936, 15, 10) and data type float64

## SVD Features


In [None]:
# Load the training data and train the model
svd_train = pd.read_csv('data/svd_train.csv')
model, scaler = train_lstm_model(svd_train)
del svd_train

# Load the test data
svd_test = pd.read_csv('data/svd_test.csv')
y_pred, ypred_prob = evaluate_model_performance(model, scaler, svd_test)

# Save the submission
save_submission(svd_test, y_pred, 'crude_rnn_submission.csv')


# Save predictions and probabilities
predictions_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': y_pred,
    'probability': ypred_prob
})
predictions_df.to_csv('crude_rnn_predictions.csv', index=False)

del model, scaler, svd_test

# End