### Unweighted Training of RNN

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load and preprocess the data
file_path = r"G:\My Drive\Columbia Business School\Applied Machine Learning\Combined Code\Chinese-Stock-Market-Quantitative-Model\combined_data_with_y_ta.csv"
df = pd.read_csv(file_path)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.sort_values(by=['timestamp', 'ticker'], inplace=True)
df = df.dropna()

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. Using CPU.")

# Define features and target
X = df.drop(columns='y')
y = df['y']

# Normalize numeric features
numeric_columns = X.drop(columns=['timestamp', 'ticker']).columns
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Split the dataset into train, validation, and test sets
split_date = '2020-01-01'
binary_threshold = 0.1
y_binary = (y >= binary_threshold).astype(int)

train_mask = df['timestamp'] < split_date

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X[train_mask], 
    y_binary[train_mask], 
    test_size=0.2, 
    random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, 
    y_train_full, 
    test_size=0.25, 
    random_state=42
)

X_train = X_train.drop(columns=['timestamp'])
X_val = X_val.drop(columns=['timestamp'])
X_test = X_test.drop(columns=['timestamp'])

# Define the sliding window function
def create_sliding_window(data, labels, window_size=5):
    X, y = [], []
    for i in range(len(data) - window_size + 1):
        X.append(data[i:i + window_size])
        y.append(labels[i + window_size - 1])  # Target is the last value in the window
    return np.array(X), np.array(y)

# Apply sliding window on the training, validation, and test sets
window_size = 5
X_train_sliding, y_train_sliding = create_sliding_window(X_train.values, y_train.values, window_size)
X_val_sliding, y_val_sliding = create_sliding_window(X_val.values, y_val.values, window_size)
X_test_sliding, y_test_sliding = create_sliding_window(X_test.values, y_test.values, window_size)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_sliding, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_sliding, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_sliding, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_sliding, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_sliding, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_sliding, dtype=torch.float32)

# Create PyTorch datasets and loaders
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TimeSeriesDataset(X_train_tensor, y_train_tensor)
val_dataset = TimeSeriesDataset(X_val_tensor, y_val_tensor)
test_dataset = TimeSeriesDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)


y_train_tensor = (y_train_tensor >= 0.5).float()
y_val_tensor = (y_val_tensor >= 0.5).float()
y_test_tensor = (y_test_tensor >= 0.5).float()

# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size):
        super(RNNModel, self).__init__()
        self.lstm1 = nn.LSTM(input_size, 64, batch_first=True)
        self.lstm2 = nn.LSTM(64, 32, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.lstm1(x)
        out, _ = self.lstm2(out)
        out = out[:, -1, :]  # Use the output of the last time step
        out = self.dropout(out)
        out = self.fc(out)
        return self.sigmoid(out)

input_size = X_train_sliding.shape[2]
model = RNNModel(input_size).to(device)


from tqdm import tqdm

# Define optimizer, loss function, and device
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

# Train the model
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50):
    best_val_loss = float('inf')
    patience = 3
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for X_batch, y_batch in tqdm(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0.0
        model.eval()
        with torch.no_grad():
            for X_batch, y_batch in tqdm(val_loader):
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                y_pred = model(X_batch)
                loss = criterion(y_pred.squeeze(), y_batch)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_rnn_model.pth')
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer)

print("MODEL TRAINED!!")

# Evaluate the model
model.load_state_dict(torch.load('best_rnn_model.pth'))
model.eval()

y_pred_list = []
y_true_list = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)
        y_pred_list.append(y_pred.cpu().numpy())
        y_true_list.append(y_batch.cpu().numpy())

y_pred_list = np.concatenate(y_pred_list).squeeze()
y_true_list = np.concatenate(y_true_list).squeeze()
y_pred_binary = (y_pred_list >= 0.5).astype(int)

# Metrics
accuracy = accuracy_score(y_true_list, y_pred_binary)
f1 = f1_score(y_true_list, y_pred_binary)
roc_auc = roc_auc_score(y_true_list, y_pred_list)
conf_matrix = confusion_matrix(y_true_list, y_pred_binary)

print(f"RNN Accuracy: {accuracy:.4f}")
print(f"RNN F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report(y_true_list, y_pred_binary))



CUDA is available. Using GPU: NVIDIA GeForce RTX 3080


100%|██████████| 44855/44855 [02:11<00:00, 340.82it/s]
100%|██████████| 14952/14952 [00:18<00:00, 793.81it/s]


Epoch 1/50 - Train Loss: 0.1316, Val Loss: 0.1293


100%|██████████| 44855/44855 [02:11<00:00, 340.37it/s]
100%|██████████| 14952/14952 [00:18<00:00, 796.46it/s]


Epoch 2/50 - Train Loss: 0.1288, Val Loss: 0.1249


100%|██████████| 44855/44855 [02:10<00:00, 343.76it/s]
100%|██████████| 14952/14952 [00:18<00:00, 805.10it/s]


Epoch 3/50 - Train Loss: 0.1270, Val Loss: 0.1283


100%|██████████| 44855/44855 [02:09<00:00, 346.54it/s]
100%|██████████| 14952/14952 [00:18<00:00, 806.12it/s]


Epoch 4/50 - Train Loss: 0.1271, Val Loss: 0.1257


100%|██████████| 44855/44855 [02:23<00:00, 313.08it/s]
100%|██████████| 14952/14952 [00:19<00:00, 769.96it/s]


Epoch 5/50 - Train Loss: 0.1273, Val Loss: 0.1210


100%|██████████| 44855/44855 [02:31<00:00, 295.93it/s]
100%|██████████| 14952/14952 [00:19<00:00, 753.12it/s]


Epoch 6/50 - Train Loss: 0.1249, Val Loss: 0.1270


100%|██████████| 44855/44855 [02:24<00:00, 309.94it/s]
100%|██████████| 14952/14952 [00:18<00:00, 797.03it/s]


Epoch 7/50 - Train Loss: 0.1261, Val Loss: 0.1266


100%|██████████| 44855/44855 [02:22<00:00, 315.19it/s]
100%|██████████| 14952/14952 [00:19<00:00, 778.39it/s]
  model.load_state_dict(torch.load('best_rnn_model.pth'))


Epoch 8/50 - Train Loss: 0.1255, Val Loss: 0.1255
Early stopping triggered.
MODEL TRAINED!!
RNN Accuracy: 0.9710
RNN F1 Score: 0.0097
ROC AUC Score: 0.6756
Confusion Matrix:
 [[1858073     140]
 [  55307     272]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.99   1858213
         1.0       0.66      0.00      0.01     55579

    accuracy                           0.97   1913792
   macro avg       0.82      0.50      0.50   1913792
weighted avg       0.96      0.97      0.96   1913792



### Weighted Loss training for RNNs

In [None]:
# Assuming y_train_tensor contains binary labels (0 or 1)
num_zeros = (y_train_tensor == 0).sum().item()  # Count the number of '0' labels
num_ones = (y_train_tensor == 1).sum().item()   # Count the number of '1' labels

# Calculate class weight as inverse of class frequency
weight_for_class_1 = num_zeros / num_ones  # Higher weight for minority class
pos_weight = torch.tensor([weight_for_class_1]).to(device)  # Weight for positive class '1'

5572389

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Load and preprocess the data
file_path = r"G:\My Drive\Columbia Business School\Applied Machine Learning\Combined Code\Chinese-Stock-Market-Quantitative-Model\combined_data_with_y_ta.csv"
df = pd.read_csv(file_path)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.sort_values(by=['timestamp', 'ticker'], inplace=True)
df = df.dropna()

# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. Using CPU.")

# Define features and target
X = df.drop(columns='y')
y = df['y']

# Normalize numeric features
numeric_columns = X.drop(columns=['timestamp', 'ticker']).columns
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Split the dataset into train, validation, and test sets
split_date = '2020-01-01'
binary_threshold = 0.1
y_binary = (y >= binary_threshold).astype(int)

train_mask = df['timestamp'] < split_date

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X[train_mask], 
    y_binary[train_mask], 
    test_size=0.2, 
    random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, 
    y_train_full, 
    test_size=0.25, 
    random_state=42
)

X_train = X_train.drop(columns=['timestamp'])
X_val = X_val.drop(columns=['timestamp'])
X_test = X_test.drop(columns=['timestamp'])

# Define the sliding window function
def create_sliding_window(data, labels, window_size=5):
    X, y = [], []
    for i in range(len(data) - window_size + 1):
        X.append(data[i:i + window_size])
        y.append(labels[i + window_size - 1])  # Target is the last value in the window
    return np.array(X), np.array(y)

# Apply sliding window on the training, validation, and test sets
window_size = 5
X_train_sliding, y_train_sliding = create_sliding_window(X_train.values, y_train.values, window_size)
X_val_sliding, y_val_sliding = create_sliding_window(X_val.values, y_val.values, window_size)
X_test_sliding, y_test_sliding = create_sliding_window(X_test.values, y_test.values, window_size)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_sliding, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_sliding, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_sliding, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_sliding, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_sliding, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_sliding, dtype=torch.float32)

# Create PyTorch datasets and loaders
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TimeSeriesDataset(X_train_tensor, y_train_tensor)
val_dataset = TimeSeriesDataset(X_val_tensor, y_val_tensor)
test_dataset = TimeSeriesDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Binarize labels for classification
y_train_tensor = (y_train_tensor >= 0.5).float()
y_val_tensor = (y_val_tensor >= 0.5).float()
y_test_tensor = (y_test_tensor >= 0.5).float()

# Calculate class weights
num_zeros = (y_train_tensor == 0).sum().item()
num_ones = (y_train_tensor == 1).sum().item()
weight_for_class_1 = num_zeros / num_ones
pos_weight = torch.tensor([weight_for_class_1]).to(device)

# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size):
        super(RNNModel, self).__init__()
        self.lstm1 = nn.LSTM(input_size, 64, batch_first=True)
        self.lstm2 = nn.LSTM(64, 32, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        out, _ = self.lstm1(x)
        out, _ = self.lstm2(out)
        out = out[:, -1, :]  # Use the output of the last time step
        out = self.dropout(out)
        out = self.fc(out)
        return out

input_size = X_train_sliding.shape[2]
model = RNNModel(input_size).to(device)

from tqdm import tqdm

# Define optimizer, loss function, and device
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Train the model
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50):
    best_val_loss = float('inf')
    patience = 3
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for X_batch, y_batch in tqdm(train_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0.0
        model.eval()
        with torch.no_grad():
            for X_batch, y_batch in tqdm(val_loader):
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                y_pred = model(X_batch)
                loss = criterion(y_pred.squeeze(), y_batch)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_rnn_model.pth')
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer)

print("MODEL TRAINED!!")

# Evaluate the model
model.load_state_dict(torch.load('best_rnn_model.pth'))
model.eval()

y_pred_list = []
y_true_list = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)
        y_pred_list.append(y_pred.cpu().numpy())
        y_true_list.append(y_batch.cpu().numpy())

y_pred_list = np.concatenate(y_pred_list).squeeze()
y_true_list = np.concatenate(y_true_list).squeeze()
y_pred_binary = (y_pred_list >= 0.5).astype(int)

# Metrics
accuracy = accuracy_score(y_true_list, y_pred_binary)
f1 = f1_score(y_true_list, y_pred_binary)
roc_auc = roc_auc_score(y_true_list, y_pred_list)
conf_matrix = confusion_matrix(y_true_list, y_pred_binary)

print(f"RNN Accuracy: {accuracy:.4f}")
print(f"RNN F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report(y_true_list, y_pred_binary))


CUDA is available. Using GPU: NVIDIA GeForce RTX 3080


100%|██████████| 44855/44855 [02:25<00:00, 307.76it/s]
100%|██████████| 14952/14952 [00:19<00:00, 749.81it/s]


Epoch 1/50 - Train Loss: 1.3143, Val Loss: 1.2841


100%|██████████| 44855/44855 [02:30<00:00, 298.35it/s]
100%|██████████| 14952/14952 [00:20<00:00, 726.67it/s]


Epoch 2/50 - Train Loss: 1.2640, Val Loss: 1.2409


100%|██████████| 44855/44855 [02:26<00:00, 306.61it/s]
100%|██████████| 14952/14952 [00:20<00:00, 738.18it/s]


Epoch 3/50 - Train Loss: 1.2589, Val Loss: 1.2578


100%|██████████| 44855/44855 [02:24<00:00, 310.22it/s]
100%|██████████| 14952/14952 [00:20<00:00, 734.40it/s]


Epoch 4/50 - Train Loss: 1.2629, Val Loss: 1.3173


100%|██████████| 44855/44855 [02:26<00:00, 305.48it/s]
100%|██████████| 14952/14952 [00:20<00:00, 743.18it/s]
  model.load_state_dict(torch.load('best_rnn_model.pth'))


Epoch 5/50 - Train Loss: 1.2448, Val Loss: 1.2800
Early stopping triggered.
MODEL TRAINED!!
RNN Accuracy: 0.9079
RNN F1 Score: 0.1596
ROC AUC Score: 0.6224
Confusion Matrix:
 [[1720838  137375]
 [  38851   16728]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      0.93      0.95   1858213
         1.0       0.11      0.30      0.16     55579

    accuracy                           0.91   1913792
   macro avg       0.54      0.61      0.56   1913792
weighted avg       0.95      0.91      0.93   1913792



: 