In [None]:
# add sliding window to make true RNN
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Load the combined dataset and put it in chronological order
df = pd.read_csv('combined_data_with_y_ta.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.sort_values(by=['timestamp', 'ticker'], inplace=True)

# Define features and target
X = df.drop(columns='y')
y = df['y']

# Every column except the two identifier columns is treated as a numeric feature
numeric_columns = X.drop(columns=['timestamp', 'ticker']).columns

# Binarize the target: 1 when y reaches the threshold, 0 otherwise
split_date = '2020-01-01'
binary_threshold = 0.1
y_binary = (y >= binary_threshold).astype(int)

# Split the rows before split_date into train (60%), validation (20%) and
# test (20%). shuffle=False keeps the chronological order established by the
# sort above: the sliding windows built later need consecutive rows, and a
# shuffled split would also interleave validation/test rows with training
# rows in time.
# NOTE(review): rows on or after split_date are not used by any of these
# splits -- confirm whether they were meant to be the held-out test set.
train_mask = df['timestamp'] < split_date
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X[train_mask],
    y_binary[train_mask],
    test_size=0.2,
    shuffle=False,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    shuffle=False,
)

# Normalize numeric features with statistics taken from the training rows
# only, so nothing about the validation/test distribution leaks into training.
feature_mean = X_train[numeric_columns].mean()
# A constant column has zero standard deviation; divide by 1 instead so it
# becomes all zeros rather than NaN.
feature_std = X_train[numeric_columns].std().replace(0, 1.0)


def _standardize(frame):
    """Return a copy of *frame* with its numeric columns z-scored."""
    scaled = frame.copy()
    scaled[numeric_columns] = (frame[numeric_columns] - feature_mean) / feature_std
    return scaled


X = _standardize(X)
X_train_full = _standardize(X_train_full)
X_train = _standardize(X_train)
X_val = _standardize(X_val)
X_test = _standardize(X_test)

# Define the sliding window function
def create_sliding_window(data, labels, window_size=5):
    """Slice *data* into overlapping windows of consecutive rows.

    Window ``i`` covers rows ``i`` through ``i + window_size - 1`` of *data*
    and is paired with ``labels[i + window_size - 1]``, i.e. the label of the
    last row in that window.

    Args:
        data: Array-like whose first axis is the row axis.
        labels: Array-like addressed by position; it needs an entry for every
            row that ends a window.
        window_size: Number of consecutive rows per window; must be >= 1.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_windows, window_size, *data.shape[1:])`` and ``y`` has
        ``n_windows`` entries, where
        ``n_windows = max(len(data) - window_size + 1, 0)``. When *data* is
        shorter than one window, ``X`` is empty but keeps its trailing
        dimensions so callers can still read ``X.shape[1:]``.

    Raises:
        ValueError: If *window_size* is smaller than 1.
        IndexError: If *labels* is too short for the windows produced.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    data = np.asarray(data)
    labels = np.asarray(labels)
    n_windows = max(len(data) - window_size + 1, 0)

    # Row i of this index matrix lists the data rows that make up window i.
    starts = np.arange(n_windows)
    row_index = starts[:, None] + np.arange(window_size)[None, :]
    # Each window is labelled by its final row.
    last_rows = starts + (window_size - 1)
    return data[row_index], labels[last_rows]

# Apply sliding window on the training, validation, and test sets
window_size = 5


def _windows_per_ticker(features, targets, window_size):
    """Build sliding windows separately for each ticker and stack them.

    The frame is ordered by (timestamp, ticker), so consecutive rows belong
    to different tickers; windowing the raw row order would mix instruments
    inside one sequence. Only ``numeric_columns`` are fed to the model: the
    ``timestamp`` and ``ticker`` columns are not numeric and would turn the
    array into an object array that Keras cannot convert to a tensor.

    Args:
        features: DataFrame holding ``timestamp``, ``ticker`` and the
            numeric feature columns.
        targets: Series of labels aligned row-for-row (by position) with
            *features*.
        window_size: Number of consecutive rows per window.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a float32 array of shape
        ``(n_windows, window_size, n_features)`` and ``y`` holds the label of
        the last row of each window.

    Raises:
        ValueError: If no ticker has at least *window_size* rows.
    """
    feature_values = features[numeric_columns].to_numpy(dtype='float32')
    label_values = targets.to_numpy()
    timestamps = features['timestamp'].to_numpy()

    window_parts, label_parts = [], []
    # GroupBy.indices maps each ticker to the positional row numbers it owns.
    for positions in features.groupby('ticker', sort=False).indices.values():
        # Order each ticker's rows by time, whatever order the split left them in.
        positions = positions[np.argsort(timestamps[positions], kind='stable')]
        ticker_windows, ticker_labels = create_sliding_window(
            feature_values[positions], label_values[positions], window_size
        )
        # Tickers with fewer rows than one window contribute nothing.
        if len(ticker_windows):
            window_parts.append(ticker_windows)
            label_parts.append(ticker_labels)

    if not window_parts:
        raise ValueError(
            f"no ticker has at least {window_size} rows to build a window from"
        )
    return np.concatenate(window_parts), np.concatenate(label_parts)


X_train_sliding, y_train_sliding = _windows_per_ticker(X_train, y_train, window_size)
X_val_sliding, y_val_sliding = _windows_per_ticker(X_val, y_val, window_size)
X_test_sliding, y_test_sliding = _windows_per_ticker(X_test, y_test, window_size)

# Define the RNN model: two stacked LSTM layers with dropout, followed by a
# single sigmoid unit for the binary target. The input shape is
# (window length, number of features), read from the training windows.
rnn_model = Sequential([
    LSTM(64, input_shape=(X_train_sliding.shape[1], X_train_sliding.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
rnn_model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# Train the model; stop once validation loss has not improved for 3 epochs
# and restore the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = rnn_model.fit(
    X_train_sliding, y_train_sliding,
    validation_data=(X_val_sliding, y_val_sliding),
    epochs=50, batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model. predict() returns one column per output unit; flatten
# it so predictions have the same 1-D shape as the labels.
y_pred_proba = rnn_model.predict(X_test_sliding).ravel()
y_pred = (y_pred_proba >= 0.5).astype(int)

accuracy = accuracy_score(y_test_sliding, y_pred)
f1 = f1_score(y_test_sliding, y_pred)
roc_auc = roc_auc_score(y_test_sliding, y_pred_proba)
conf_matrix = confusion_matrix(y_test_sliding, y_pred)

# Print evaluation metrics
print(f"RNN Accuracy: {accuracy:.4f}")
print(f"RNN F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report(y_test_sliding, y_pred))

# Save the model
rnn_model.save('rnn_model.h5')
