# Project 266

## Setup

## Import libraries

In [1]:
!pip install loralib

Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl (10 kB)
Installing collected packages: loralib
Successfully installed loralib-0.1.2


In [2]:
# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW, Adam, SGD
from torch.nn.utils.rnn import pad_sequence

# Gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score, precision_recall_curve, classification_report

# Bert
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

# Admin
import os
import time
from tqdm.auto import tqdm
import re

# Data
import pandas as pd
import numpy as np
import random
import pickle

# loRA
import loralib as lora

# Gradients
import csv

# Optuna
!pip install optuna
import optuna
from optuna.pruners import MedianPruner

# Visualizations
import matplotlib.pyplot as plt

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## Functions

In [4]:
# Combined Cleaning and Preprocessing Function
def clean_and_preprocess_tweets(df):
    def clean_tweet(tweet):
        # Remove URLs
        tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
        # Remove user mentions
        tweet = re.sub(r'@\w+', '', tweet)
        # Remove excessive whitespace
        tweet = re.sub(r'\s+', ' ', tweet).strip()
        return tweet

    # Apply cleaning and simple preprocessing
    return df['text'].apply(lambda x: simple_preprocess(clean_tweet(x)))

# Streamlined Training and Sequence Preparation
def prepare_word2vec_sequences(df, max_len=None):
    # Step 1: Clean and preprocess tweets
    tweets_preprocessed = clean_and_preprocess_tweets(df)

    # Step 2: Train Word2Vec
    word2vec_model = Word2Vec(sentences=tweets_preprocessed, vector_size=768, window=5, min_count=1, workers=4)

    # Step 3: Create word to index mapping
    word_index = {word: i for i, word in enumerate(word2vec_model.wv.index_to_key)}

    # Step 4: Convert tweets to sequences of indices
    sequences = [[word_index.get(word, 0) for word in tweet] for tweet in tweets_preprocessed]

    # Convert sequences to PyTorch tensors before padding
    sequences_tensors = [torch.tensor(seq) for seq in sequences]

    # Step 5: Pad sequences
    max_len = max(len(seq) for seq in sequences_tensors)

    # Step 6: Make padded sequence
    padded_sequences = pad_sequence(sequences_tensors, batch_first=True, padding_value=0)

    # Dimensions
    word2vec_dim = 300
    bert_dim = 768

    # Linear transformation layer
    projection_layer = nn.Linear(in_features=word2vec_dim, out_features=bert_dim)
    torch.nn.init.xavier_uniform_(projection_layer.weight)

    # Example Word2Vec embeddings tensor ([batch_size, sequence_length, word2vec_dim])
    word2vec_embeddings = torch.randn(len(padded_sequences), max_len, word2vec_dim)

    # Project embeddings
    projected_embeddings = projection_layer(word2vec_embeddings)

    return projected_embeddings, word2vec_model

In [5]:
# Import df
df = pd.read_csv('/content/drive/My Drive/266_project/csv files for train_val_test and embeddings/text_for_embeddings.csv', encoding='utf-8')
print('df size:', df.shape)

# Print pivot table
pd.pivot_table(df, index=['source'], columns='label', values=['text'], aggfunc='count')

# Instantiate tweets
tweets = df[df['source'].isin(['t3','t5'])].copy()

df size: (43102, 4)


# Prepare embeddings

In [6]:
# # Get padded sequences and word2vec model
# word2vec_embeddings, word2vec_model = prepare_word2vec_sequences(df)
# #Save
# word2vec_model.save("/content/drive/My Drive/266 - Project/word2vec_model.model")
# torch.save(word2vec_embeddings, '/content/drive/My Drive/266 - Project/word2vec_embeddings.pt')
#load
word2vec_model = Word2Vec.load("/content/drive/My Drive/266_project/project/support/word2vec_model.model")
# word2vec_embeddings = torch.load('/content/drive/My Drive/266 - Project/project/support/word2vec_embeddings.pt')

# print(f"word2vec_embeddings.shape: {word2vec_embeddings.shape}")

In [7]:

vocab_size = len(word2vec_model.wv.index_to_key)
print(f"vocab_size: {vocab_size}")

vocab_size: 39709


# Create BERT embeddings

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_embeddings = []
for text in tweets['text']:
    tokens = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length = 512,
            padding="max_length",
            truncation=True,
            return_tensors='pt'
        )
    bert_embeddings.append(tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
bert_model = AutoModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
output = bert_model(**tokens)

In [11]:
bert_embeddings = output.last_hidden_state

# Import training

In [12]:
# Import data sets
train_df = pd.read_csv(f'/content/drive/My Drive/266_project/csv files for train_val_test and embeddings/train_df.csv')
val_df = pd.read_csv(f'/content/drive/My Drive/266_project/csv files for train_val_test and embeddings/val_df.csv')
test_df = pd.read_csv(f'/content/drive/My Drive/266_project/csv files for train_val_test and embeddings/test_df.csv')

In [13]:
tf = train_df.copy()
tf['len'] = tf['text'].apply(lambda x: len(x))
tf.len.describe()


count    17990.000000
mean        96.605003
std        108.156943
min          7.000000
25%         52.000000
50%         67.000000
75%         86.000000
max       1167.000000
Name: len, dtype: float64

In [14]:
n_rows = 2500
train_df = pd.concat([train_df[train_df['label']==0].sample(n=n_rows, random_state=42),train_df[train_df['label']==1].sample(n=n_rows, random_state=42)])
n_rows = 600
val_df = pd.concat([val_df[val_df['label']==0].sample(n=n_rows, random_state=42),val_df[val_df['label']==1].sample(n=n_rows, random_state=42)])
n_rows = 1500
test_df = pd.concat([test_df[test_df['label']==0].sample(n=n_rows, random_state=42),test_df[test_df['label']==1].sample(n=n_rows, random_state=42)])

In [15]:
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [16]:
print(train_df['label'].value_counts(normalize=True) * 100)
print(train_df['label'].value_counts())
print(val_df['label'].value_counts(normalize=True) * 100)
print(val_df['label'].value_counts())
print(test_df['label'].value_counts(normalize=True) * 100)
print(test_df['label'].value_counts())

print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

label
0    50.0
1    50.0
Name: proportion, dtype: float64
label
0    2500
1    2500
Name: count, dtype: int64
label
0    50.0
1    50.0
Name: proportion, dtype: float64
label
0    600
1    600
Name: count, dtype: int64
label
0    50.0
1    50.0
Name: proportion, dtype: float64
label
0    1500
1    1500
Name: count, dtype: int64
(5000, 3)
(1200, 3)
(3000, 3)


# CNN with Global Average Pooling for Word2Vec Embeddings

In [17]:
class Attention(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super(Attention, self).__init__()
        self.attention = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(True),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, embeddings):
        # embeddings shape: [batch_size, max_seq_length, embedding_dim]
        attention_weights = self.attention(embeddings)  # Compute attention weights
        attention_weights = torch.softmax(attention_weights, dim=1)  # Softmax over max_seq_length dimension
        attended_embeddings = embeddings * attention_weights  # Apply weights
        attended_embeddings = torch.sum(attended_embeddings, dim=1)  # Sum over the sequence
        return attended_embeddings

In [18]:
class CNNForWord2VecBERTFT(nn.Module):
    def __init__(self, word2vec_weights, vocab_size, embedding_dim, num_filters, filter_sizes, dropout_rate, hidden_dim, freeze=True):
        super(CNNForWord2VecBERTFT, self).__init__()

        # WORD2VEC Embedding layer
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(word2vec_weights))

        # Convolutional layers: Adjusted for embedding dimensions
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(k, embedding_dim), padding=(k - 1, 0)) for k in filter_sizes
        ])

        # Batch normalization layers: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d
        self.conv_bn = nn.ModuleList([
            nn.BatchNorm2d(num_filters) for _ in filter_sizes
        ])

        # Global Average Pooling layer for CNN features
        self.cnn_global_avg_pool = nn.AdaptiveAvgPool2d((1, num_filters))

        # BERT Embedding Layer
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.bert_attention = Attention(embedding_dim, hidden_dim)  # Initialize attention for BERT embeddings

        # Freeze BERT layer
        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False

        # global average pooling layer for BERT embeddings
        self.bert_global_avg_pool = nn.AdaptiveAvgPool1d(1)

        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)

        # Fully connected layer: Adjust according to your task
        self.fc = nn.Linear(num_filters * len(filter_sizes) + embedding_dim, 1) # The "* 2" accounts for concatenation of avg and max pooling features

    def forward(self, input_ids, attention_mask, word_indices, prediction = False):
        # x shape: [batch_size, max_sequence_length, embedding_dim]

        # Word2Vec Embeddings

        # Convert ids to embeddings
        x = self.embedding(word_indices)

        # Add a channel dimension: [batch_size, 1, max_sequence_length, embedding_dim]
        x = x.unsqueeze(1)

        # Apply convolutions and ReLU
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]

        # Apply global average pooling
        x = [self.cnn_global_avg_pool(xi).squeeze(2) for xi in x]

        # Concatenate along the filter dimension
        x = torch.cat(x, 1)

        # Flatten
        x = x.view(x.size(0), -1)

        # Process BERT embeddings
        bert_embeddings = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x_bert = bert_embeddings.last_hidden_state

        # Apply attention in BERT embeddings
        x_bert = self.bert_attention(x_bert)

        # Add a dimension to match the expected input shape
        x_bert = x_bert.unsqueeze(-1)

        # BERT Global average pooling
        x_bert = self.bert_global_avg_pool(x_bert).squeeze(-1)

        # Concatenate Word2Vec and BERT embeddings
        x_combined = torch.cat((x, x_bert), 1)

        # Apply dropout
        x_combined = self.dropout(x_combined)

        # Apply fully connected layer
        x = self.fc(x_combined)

        if prediction:
          return x, x_combined
        else:
          return x


In [19]:
# Create a class for the dataset
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, word2vec_model, max_len):
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.tokenizer = tokenizer
        self.word2vec_model = word2vec_model
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        words = simple_preprocess(text)  # Use gensim's simple_preprocess for consistency
        targets = self.targets[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,  # Ensure text is truncated to max_length
            padding='max_length',  # Ensure padding to max_length
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Prepare Word2Vec embeddings
        word_indices = [self.word2vec_model.wv.key_to_index.get(word, 0) for word in words]
        # Ensure word_indices does not exceed max_len
        word_indices = word_indices[:self.max_len]
        # Pad word_indices to ensure it has length of max_len
        word_indices = np.pad(word_indices, (0, self.max_len - len(word_indices)), mode='constant', constant_values=0)
        word_indices = torch.tensor(word_indices, dtype=torch.long)

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'word_indices': word_indices,
            'targets': torch.tensor(targets, dtype=torch.float),
        }

# Function to calculate metrics
def calculate_metrics(targets, outputs):
    accuracy = accuracy_score(targets, outputs)
    precision = precision_score(targets, outputs)
    recall = recall_score(targets, outputs)
    f1 = f1_score(targets, outputs)
    precision_vals, recall_vals, _ = precision_recall_curve(targets, outputs)
    pr_auc = auc(recall_vals, precision_vals)
    roc_auc = roc_auc_score(targets, outputs)
    return accuracy, precision, recall, f1, pr_auc, roc_auc

def objective(trial):

  parameters = {
      'batch_size': trial.suggest_int('batch_size', 2, 4),
      'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
      'epochs': trial.suggest_int('epochs', 3, 5),
      'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
      'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5, log=True),
      'optimizer': trial.suggest_categorical('optimizer', ['AdamW', 'Adam','SGD']),
      'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True),
      'max_len': trial.suggest_int('max_len', 50, 100),
      'unfreeze_epoch': trial.suggest_int('unfreeze_epoch', 0, 5),
      'freeze': trial.suggest_categorical('freeze', [True, False]),
      'hidden_dim': trial.suggest_int('hidden_dim', 64, 256),
      'num_filters': trial.suggest_int('num_filters', 32, 128),
      'hidden_dim': trial.suggest_int('hidden_dim', 64, 256),
      'accumulation_steps': trial.suggest_int('accumulation_steps', 1, 8)

    }

  # Set the parameters
  batch_size = parameters['batch_size']
  learning_rate = parameters['learning_rate']
  epochs = parameters['epochs']
  dropout_rate = parameters['dropout_rate']
  optimizer = parameters['optimizer']
  weight_decay = parameters['weight_decay']
  max_len = parameters['max_len']
  unfreeze_epoch = parameters['unfreeze_epoch']
  freeze = parameters['freeze']
  hidden_dim = parameters['hidden_dim']
  num_filters = parameters['num_filters']
  hidden_dim = parameters['hidden_dim']
  accumulation_steps = parameters['accumulation_steps']

  # Define the parameters
  train_params = {'batch_size': batch_size,'shuffle': True}
  filter_sizes = [3, 4, 5] # We should add it to the Parameters
  vocab_size = len(word2vec_model.wv.index_to_key)
  word2vec_weights = word2vec_model.wv.vectors
  embedding_dim = word2vec_weights.shape[1]


  # Instantiate the dataset with the BERT tokenizer and embeddings
  bert_model = AutoModel.from_pretrained('bert-base-uncased')
  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

  # Ensure bert_model is in eval mode and move to GPU if available
  bert_model.eval()
  if torch.cuda.is_available():
      bert_model = bert_model.to('cuda')

  # Pass train and test to dataloader
  training_set = CustomDataset(train_df, tokenizer, word2vec_model, max_len)
  val_set = CustomDataset(val_df, tokenizer, word2vec_model, max_len)

  # Create the dataloaders
  training_loader = DataLoader(training_set, **train_params)
  val_loader = DataLoader(val_set, **train_params)

  # Instantiate model
  model = CNNForWord2VecBERTFT(word2vec_weights, vocab_size, embedding_dim, num_filters, filter_sizes, dropout_rate, hidden_dim, freeze=freeze)

  # Move the model to the GPU
  if torch.cuda.is_available():
      model = model.to('cuda')

  # Create the optimizer
  if optimizer == 'AdamW':
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  elif optimizer == 'Adam':
    optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  elif optimizer == 'SGD':
    optimizer = SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  else:
    raise ValueError("Invalid optimizer")

  # Create the loss function
  loss_function = nn.BCEWithLogitsLoss()

  # Instantiate pruner
  pruner = MedianPruner()

  # Initialize lists to store metrics
  metrics = {
      'train': {'loss': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []},
      'val': {'loss':[], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}
  }

  # Define threshold
  threshold = 0.5

  # Training loop with metrics calculation
  for epoch in range(epochs):
      model.train()
      train_targets = []
      train_outputs = []

      # Freeze bert embeddings -> unfreeze_epoch is the epoch to unfreeze BERT
      if epoch == unfreeze_epoch:
          for param in model.bert.parameters():
              param.requires_grad = True

      # Unfreeze bert embeddings -> unfreeze_epoch is the epoch to unfreeze BERT
      if epoch > unfreeze_epoch:
          for param in model.bert.parameters():
              param.requires_grad = False

      # Training phase
      total_train_iterations = len(training_loader)
      total_loss = 0
      for i, data in tqdm(enumerate(training_loader,0),total=total_train_iterations, desc="Training"):
          word_indices = data['word_indices'].to(bert_model.device)
          input_ids = data['input_ids'].to(bert_model.device)
          attention_mask = data['attention_mask'].to(bert_model.device)
          targets = data['targets'].to(bert_model.device)

          # Forward pass
          outputs = model(input_ids, attention_mask, word_indices, prediction = False)
          loss = loss_function(outputs, targets.unsqueeze(1))
          loss.backward()
          if (i + 1) % accumulation_steps == 0:  # Wait for several backward steps
              optimizer.step()  # Now we can do an optimizer step
              optimizer.zero_grad()  # Reset gradients tensors

          # Store targets and outputs for evaluation
          train_targets.extend(targets.cpu().detach().numpy().tolist())
          train_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      # Calculate and store training metrics
      train_outputs_bin = np.array(train_outputs) >= threshold
      train_acc, train_prec, train_rec, train_f1, train_pr_auc, train_roc_auc = calculate_metrics(np.array(train_targets), train_outputs_bin)
      total_loss += loss.item()
      train_loss = total_loss / len(training_loader)
      metrics['train']['loss'].append(round(train_loss,4))
      metrics['train']['accuracy'].append(round(train_acc,4))
      metrics['train']['precision'].append(round(train_prec,4))
      metrics['train']['recall'].append(round(train_rec,4))
      metrics['train']['f1'].append(round(train_f1,4))
      metrics['train']['pr_auc'].append(round(train_pr_auc,4))
      metrics['train']['roc_auc'].append(round(train_roc_auc,4))

     # Validation phase
      model.eval()
      val_targets = []
      val_outputs = []
      val_loss_accumulated = 0.0  # To accumulate loss over all validation batches

      # Freeze bert embeddings -> unfreeze_epoch is the epoch to unfreeze BERT
      if epoch == unfreeze_epoch:
          for param in model.bert.parameters():
              param.requires_grad = True

      # Unfreeze bert embeddings -> unfreeze_epoch is the epoch to unfreeze BERT
      if epoch > unfreeze_epoch:
          for param in model.bert.parameters():
              param.requires_grad = False

      with torch.no_grad():
          total_val_iterations = len(val_loader)
          for data in tqdm(val_loader, total=total_val_iterations, desc="Validation"):
              word_indices = data['word_indices'].to(bert_model.device)
              input_ids = data['input_ids'].to(bert_model.device)
              attention_mask = data['attention_mask'].to(bert_model.device)
              targets = data['targets'].to(bert_model.device)

              # Forward pass
              outputs = model(input_ids, attention_mask, word_indices, prediction = False)  # Assuming model outputs logits
              loss = loss_function(outputs, targets.unsqueeze(1))
              val_loss_accumulated += loss.item()

              outputs = torch.sigmoid(outputs).squeeze()  # Apply sigmoid once to get probabilities
              val_targets.extend(targets.cpu().detach().numpy())
              # Assuming outputs could be a scalar or an array, ensure it's always treated as an iterable
              outputs_np = outputs.cpu().detach().numpy()  # Convert to numpy array

              # If outputs_np is a scalar (0-d array), convert it into a 1-d array with a single value
              if outputs_np.ndim == 0:
                  outputs_np = np.expand_dims(outputs_np, axis=0)

              val_outputs.extend(outputs_np)
              # val_outputs.extend(outputs.cpu().detach().numpy())

      # Calculate average validation loss
      val_loss = val_loss_accumulated / total_val_iterations

      # Convert outputs to binary predictions based on the threshold
      val_outputs_bin = np.array(val_outputs) >= threshold
      # Now calculate and print metrics using val_targets and val_outputs_bin
      val_acc, val_prec, val_rec, val_f1, val_pr_auc, val_roc_auc = calculate_metrics(np.array(val_targets), val_outputs_bin)
      metrics['val']['loss'].append(round(val_loss,4))
      metrics['val']['accuracy'].append(round(val_acc,4))
      metrics['val']['precision'].append(round(val_prec,4))
      metrics['val']['recall'].append(round(val_rec,4))
      metrics['val']['f1'].append(round(val_f1,4))
      metrics['val']['pr_auc'].append(round(val_pr_auc,4))
      metrics['val']['roc_auc'].append(round(val_roc_auc,4))

      print(f"Epoch {epoch+1}/{epochs} - Train Metrics: Loss: {train_loss}, Accuracy: {train_acc}, Precision: {train_prec}, Recall: {train_rec}, F1: {train_f1}, PR AUC: {train_pr_auc}, ROC AUC: {train_roc_auc}")
      print(f"Epoch {epoch+1}/{epochs} - Val Metrics: Loss: {val_loss},  Accuracy: {val_acc}, Precision: {val_prec}, Recall: {val_rec}, F1: {val_f1}, PR AUC: {val_pr_auc}, ROC AUC: {val_roc_auc}")
      trial.report(val_f1, epoch)
      if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

      # At the end of your objective function, before returning the optimization metric
      trial.set_user_attr("train_loss", train_loss)
      trial.set_user_attr("train_accuracy", train_acc)
      trial.set_user_attr("train_precision", train_prec)
      trial.set_user_attr("train_recall", train_rec)
      trial.set_user_attr("train_f1", train_f1)
      trial.set_user_attr("train_pr_auc", train_pr_auc)
      trial.set_user_attr("train_roc_auc", train_roc_auc)

      trial.set_user_attr("val_loss", val_loss)
      trial.set_user_attr("val_accuracy", val_acc)
      trial.set_user_attr("val_precision", val_prec)
      trial.set_user_attr("val_recall", val_rec)
      trial.set_user_attr("val_f1", val_f1)
      trial.set_user_attr("val_pr_auc", val_pr_auc)
      trial.set_user_attr("val_roc_auc", val_roc_auc)

  return np.max(metrics['val']['f1'])


In [20]:
# Empty cash
torch.cuda.empty_cache()

# Run trials
study = optuna.create_study(direction='maximize', pruner=MedianPruner())
study.optimize(objective, n_trials=5)

# Get the best hyperparameters
best_params = study.best_params
print(best_params)

# Pickle the study object
with open('/content/drive/My Drive/266_project/project/support/optuna_study_CNNForWord2VecBERTFT.pkl', 'wb') as f:
    pickle.dump(study, f)

[I 2024-04-10 02:50:41,704] A new study created in memory with name: no-name-7210b50e-892c-498b-9c9b-16ff3f66ab21


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/600 [00:00<?, ?it/s]

Epoch 1/5 - Train Metrics: Loss: 0.0002691544055938721, Accuracy: 0.757, Precision: 0.7573087705246295, Recall: 0.7564, F1: 0.7568541124674805, PR AUC: 0.8177543852623148, ROC AUC: 0.7570000000000001
Epoch 1/5 - Val Metrics: Loss: 0.47273944737234463,  Accuracy: 0.785, Precision: 0.8154981549815498, Recall: 0.7366666666666667, F1: 0.7740805604203153, PR AUC: 0.8419157441574415, ROC AUC: 0.7850000000000001


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/600 [00:00<?, ?it/s]

Epoch 2/5 - Train Metrics: Loss: 0.0003903374910354614, Accuracy: 0.806, Precision: 0.8055111821086262, Recall: 0.8068, F1: 0.8061550759392486, PR AUC: 0.854455591054313, ROC AUC: 0.8059999999999999
Epoch 2/5 - Val Metrics: Loss: 0.4645049999545639,  Accuracy: 0.7783333333333333, Precision: 0.8174904942965779, Recall: 0.7166666666666667, F1: 0.763765541740675, PR AUC: 0.8379119138149556, ROC AUC: 0.7783333333333333


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/600 [00:00<?, ?it/s]

Epoch 3/5 - Train Metrics: Loss: 0.00028552818298339846, Accuracy: 0.8186, Precision: 0.817710410849621, Recall: 0.82, F1: 0.8188536049530656, PR AUC: 0.8638552054248104, ROC AUC: 0.8186
Epoch 3/5 - Val Metrics: Loss: 0.4508243429784973,  Accuracy: 0.8025, Precision: 0.8354898336414048, Recall: 0.7533333333333333, F1: 0.7922874671340929, PR AUC: 0.8560782501540357, ROC AUC: 0.8025


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/600 [00:00<?, ?it/s]

Epoch 4/5 - Train Metrics: Loss: 0.000741307544708252, Accuracy: 0.8308, Precision: 0.8266192733017378, Recall: 0.8372, F1: 0.8318759936406996, PR AUC: 0.8726096366508688, ROC AUC: 0.8308000000000001
Epoch 4/5 - Val Metrics: Loss: 0.43110738686130695,  Accuracy: 0.8041666666666667, Precision: 0.8218694885361552, Recall: 0.7766666666666666, F1: 0.7986289631533847, PR AUC: 0.8551014109347441, ROC AUC: 0.8041666666666667


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/600 [00:00<?, ?it/s]

[I 2024-04-10 02:55:42,152] Trial 0 finished with value: 0.7986 and parameters: {'batch_size': 2, 'learning_rate': 0.0016550646749723733, 'epochs': 5, 'dropout_rate': 0.11653261946051362, 'optimizer': 'AdamW', 'weight_decay': 4.7092592273994384e-05, 'max_len': 86, 'unfreeze_epoch': 4, 'freeze': True, 'hidden_dim': 247, 'num_filters': 97, 'accumulation_steps': 2}. Best is trial 0 with value: 0.7986.


Epoch 5/5 - Train Metrics: Loss: 0.00030379033088684084, Accuracy: 0.558, Precision: 0.5601160862354893, Recall: 0.5404, F1: 0.5500814332247557, PR AUC: 0.6651580431177446, ROC AUC: 0.558
Epoch 5/5 - Val Metrics: Loss: 0.6720191659530004,  Accuracy: 0.5833333333333334, Precision: 0.5510204081632653, Recall: 0.9, F1: 0.6835443037974683, PR AUC: 0.7505102040816327, ROC AUC: 0.5833333333333333


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 1/5 - Train Metrics: Loss: 0.0003908467135460847, Accuracy: 0.6796, Precision: 0.6977973568281939, Recall: 0.6336, F1: 0.6641509433962264, PR AUC: 0.7572986784140969, ROC AUC: 0.6796
Epoch 1/5 - Val Metrics: Loss: 0.5918835952877999,  Accuracy: 0.7341666666666666, Precision: 0.7826961770623743, Recall: 0.6483333333333333, F1: 0.7092069279854148, PR AUC: 0.8034314218645204, ROC AUC: 0.7341666666666667


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 2/5 - Train Metrics: Loss: 0.0002980380803436023, Accuracy: 0.7444, Precision: 0.7633620689655173, Recall: 0.7084, F1: 0.7348547717842324, PR AUC: 0.8087810344827586, ROC AUC: 0.7444
Epoch 2/5 - Val Metrics: Loss: 0.5543212812393904,  Accuracy: 0.75, Precision: 0.7840909090909091, Recall: 0.69, F1: 0.7340425531914891, PR AUC: 0.8145454545454546, ROC AUC: 0.75


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 3/5 - Train Metrics: Loss: 0.00038734015834925056, Accuracy: 0.7516, Precision: 0.7685738684884714, Recall: 0.72, F1: 0.7434944237918215, PR AUC: 0.8142869342442357, ROC AUC: 0.7516
Epoch 3/5 - Val Metrics: Loss: 0.5374598729610444,  Accuracy: 0.7466666666666667, Precision: 0.8162393162393162, Recall: 0.6366666666666667, F1: 0.7153558052434457, PR AUC: 0.8172863247863248, ROC AUC: 0.7466666666666667


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 4/5 - Train Metrics: Loss: 0.0002471003871373095, Accuracy: 0.7584, Precision: 0.7693911592994161, Recall: 0.738, F1: 0.7533687219273174, PR AUC: 0.819195579649708, ROC AUC: 0.7584
Epoch 4/5 - Val Metrics: Loss: 0.5215881613641977,  Accuracy: 0.7641666666666667, Precision: 0.7897623400365631, Recall: 0.72, F1: 0.7532693984306887, PR AUC: 0.8248811700182814, ROC AUC: 0.7641666666666667


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

[I 2024-04-10 02:59:02,943] Trial 1 finished with value: 0.7662 and parameters: {'batch_size': 3, 'learning_rate': 0.0017523400994718323, 'epochs': 5, 'dropout_rate': 0.21532641302492167, 'optimizer': 'SGD', 'weight_decay': 1.4501347231931907e-05, 'max_len': 65, 'unfreeze_epoch': 4, 'freeze': True, 'hidden_dim': 244, 'num_filters': 99, 'accumulation_steps': 8}. Best is trial 0 with value: 0.7986.


Epoch 5/5 - Train Metrics: Loss: 0.0002741836006940305, Accuracy: 0.7764, Precision: 0.7793047696038804, Recall: 0.7712, F1: 0.7752312022517089, PR AUC: 0.8324523848019402, ROC AUC: 0.7764
Epoch 5/5 - Val Metrics: Loss: 0.4495324239227921,  Accuracy: 0.7925, Precision: 0.8774193548387097, Recall: 0.68, F1: 0.7661971830985915, PR AUC: 0.8587096774193549, ROC AUC: 0.7925000000000001


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 1/3 - Train Metrics: Loss: 0.0003393094698397357, Accuracy: 0.6904, Precision: 0.7018659881255301, Recall: 0.662, F1: 0.6813503499382463, PR AUC: 0.766432994062765, ROC AUC: 0.6903999999999999
Epoch 1/3 - Val Metrics: Loss: 0.4855421398580074,  Accuracy: 0.7575, Precision: 0.793168880455408, Recall: 0.6966666666666667, F1: 0.7417923691215618, PR AUC: 0.8207511068943707, ROC AUC: 0.7575000000000001


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 2/3 - Train Metrics: Loss: 0.00014836597242395393, Accuracy: 0.7876, Precision: 0.8227109515260324, Recall: 0.7332, F1: 0.7753807106598984, PR AUC: 0.8446554757630163, ROC AUC: 0.7876000000000001
Epoch 2/3 - Val Metrics: Loss: 0.48442863691598176,  Accuracy: 0.76, Precision: 0.8277310924369747, Recall: 0.6566666666666666, F1: 0.7323420074349442, PR AUC: 0.828032212885154, ROC AUC: 0.76


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

[I 2024-04-10 03:01:18,122] Trial 2 finished with value: 0.7418 and parameters: {'batch_size': 3, 'learning_rate': 0.0005346498633320365, 'epochs': 3, 'dropout_rate': 0.3846041220195397, 'optimizer': 'SGD', 'weight_decay': 0.00013787475506137713, 'max_len': 72, 'unfreeze_epoch': 0, 'freeze': True, 'hidden_dim': 170, 'num_filters': 120, 'accumulation_steps': 6}. Best is trial 0 with value: 0.7986.


Epoch 3/3 - Train Metrics: Loss: 0.00016277576179367094, Accuracy: 0.7978, Precision: 0.8255356362046349, Recall: 0.7552, F1: 0.7888030081470649, PR AUC: 0.8515678181023174, ROC AUC: 0.7978000000000001
Epoch 3/3 - Val Metrics: Loss: 0.47938298103399574,  Accuracy: 0.7633333333333333, Precision: 0.819838056680162, Recall: 0.675, F1: 0.7404021937842779, PR AUC: 0.8286690283400809, ROC AUC: 0.7633333333333334


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 1/5 - Train Metrics: Loss: 1.5934775055491717e-05, Accuracy: 0.812, Precision: 0.8145161290322581, Recall: 0.808, F1: 0.8112449799196787, PR AUC: 0.859258064516129, ROC AUC: 0.812
Epoch 1/5 - Val Metrics: Loss: 0.3810042844386771,  Accuracy: 0.8375, Precision: 0.867513611615245, Recall: 0.7966666666666666, F1: 0.8305821025195481, PR AUC: 0.8829234724742892, ROC AUC: 0.8374999999999999


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 2/5 - Train Metrics: Loss: 3.0920184348683625e-05, Accuracy: 0.939, Precision: 0.9409401366010446, Recall: 0.9368, F1: 0.93886550410904, PR AUC: 0.9546700683005223, ROC AUC: 0.939
Epoch 2/5 - Val Metrics: Loss: 0.4081090196385048,  Accuracy: 0.83, Precision: 0.8485915492957746, Recall: 0.8033333333333333, F1: 0.8253424657534246, PR AUC: 0.8751291079812206, ROC AUC: 0.83


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 3/5 - Train Metrics: Loss: 7.485041766911835e-06, Accuracy: 0.9762, Precision: 0.9802339653085922, Recall: 0.972, F1: 0.9760996183972686, PR AUC: 0.9831169826542961, ROC AUC: 0.9762000000000001
Epoch 3/5 - Val Metrics: Loss: 0.533369500132394,  Accuracy: 0.8516666666666667, Precision: 0.8625429553264605, Recall: 0.8366666666666667, F1: 0.8494077834179358, PR AUC: 0.8904381443298969, ROC AUC: 0.8516666666666667


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

Epoch 4/5 - Train Metrics: Loss: 4.808604703287593e-07, Accuracy: 0.9952, Precision: 0.9932270916334661, Recall: 0.9972, F1: 0.9952095808383233, PR AUC: 0.9959135458167331, ROC AUC: 0.9951999999999999
Epoch 4/5 - Val Metrics: Loss: 0.5580633794010645,  Accuracy: 0.8525, Precision: 0.8519134775374376, Recall: 0.8533333333333334, F1: 0.852622814321399, PR AUC: 0.8892900721020522, ROC AUC: 0.8525


Training:   0%|          | 0/1667 [00:00<?, ?it/s]

Validation:   0%|          | 0/400 [00:00<?, ?it/s]

[I 2024-04-10 03:06:03,837] Trial 3 finished with value: 0.8526 and parameters: {'batch_size': 3, 'learning_rate': 3.508959527633033e-05, 'epochs': 5, 'dropout_rate': 0.20251598167097748, 'optimizer': 'Adam', 'weight_decay': 0.0004961314187308233, 'max_len': 69, 'unfreeze_epoch': 2, 'freeze': False, 'hidden_dim': 68, 'num_filters': 40, 'accumulation_steps': 2}. Best is trial 3 with value: 0.8526.


Epoch 5/5 - Train Metrics: Loss: 7.994278559229942e-07, Accuracy: 0.9958, Precision: 0.9940215225189318, Recall: 0.9976, F1: 0.9958075464164504, PR AUC: 0.9964107612594658, ROC AUC: 0.9958
Epoch 5/5 - Val Metrics: Loss: 0.5868258388918184,  Accuracy: 0.8533333333333334, Precision: 0.8569023569023569, Recall: 0.8483333333333334, F1: 0.8525963149078727, PR AUC: 0.8905345117845118, ROC AUC: 0.8533333333333334


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/600 [00:00<?, ?it/s]

Epoch 1/3 - Train Metrics: Loss: 0.00018723832368850707, Accuracy: 0.7518, Precision: 0.7452278924814959, Recall: 0.7652, F1: 0.7550819025064142, PR AUC: 0.813913946240748, ROC AUC: 0.7518
Epoch 1/3 - Val Metrics: Loss: 0.5040636044678589,  Accuracy: 0.7708333333333334, Precision: 0.701363073110285, Recall: 0.9433333333333334, F1: 0.8045486851457, PR AUC: 0.8365148698884759, ROC AUC: 0.7708333333333334


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/600 [00:00<?, ?it/s]

Epoch 2/3 - Train Metrics: Loss: 1.6213205456733704e-05, Accuracy: 0.8388, Precision: 0.8326787117046347, Recall: 0.848, F1: 0.8402695204122076, PR AUC: 0.8783393558523174, ROC AUC: 0.8388
Epoch 2/3 - Val Metrics: Loss: 0.3766478198521266,  Accuracy: 0.8375, Precision: 0.8648648648648649, Recall: 0.8, F1: 0.8311688311688312, PR AUC: 0.8824324324324324, ROC AUC: 0.8375


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/600 [00:00<?, ?it/s]

[I 2024-04-10 03:11:00,170] Trial 4 finished with value: 0.8395 and parameters: {'batch_size': 2, 'learning_rate': 0.0033735748784844213, 'epochs': 3, 'dropout_rate': 0.10508256945316508, 'optimizer': 'SGD', 'weight_decay': 0.0008014562210006893, 'max_len': 80, 'unfreeze_epoch': 5, 'freeze': False, 'hidden_dim': 190, 'num_filters': 98, 'accumulation_steps': 5}. Best is trial 3 with value: 0.8526.


Epoch 3/3 - Train Metrics: Loss: 2.842559292912483e-06, Accuracy: 0.8794, Precision: 0.8715236976106542, Recall: 0.89, F1: 0.8806649515139522, PR AUC: 0.9082618488053271, ROC AUC: 0.8794000000000001
Epoch 3/3 - Val Metrics: Loss: 0.40537927815419,  Accuracy: 0.8241666666666667, Precision: 0.772027972027972, Recall: 0.92, F1: 0.8395437262357414, PR AUC: 0.8660139860139859, ROC AUC: 0.8241666666666666
{'batch_size': 3, 'learning_rate': 3.508959527633033e-05, 'epochs': 5, 'dropout_rate': 0.20251598167097748, 'optimizer': 'Adam', 'weight_decay': 0.0004961314187308233, 'max_len': 69, 'unfreeze_epoch': 2, 'freeze': False, 'hidden_dim': 68, 'num_filters': 40, 'accumulation_steps': 2}


# Best hyperparams

In [21]:
def create_results_dataframe(study):
    # Create a list to hold all trial data
    trial_data = []

    # Iterate through all completed trials
    for trial in study.trials:
        # Retrieve the user attributes for the trial
        user_attrs = trial.user_attrs
        user_attrs["trial_number"] = trial.number
        user_attrs["value"] = trial.value  # The objective value (e.g., validation F1 score)

        # Append the trial data to the list
        trial_data.append(user_attrs)

    # Create a DataFrame from the list of trial data
    df = pd.DataFrame(trial_data)

    # Optionally, you might want to sort the DataFrame based on the objective value or another metric
    df = df.sort_values("value", ascending=False)

    return df

# Assuming 'study' is your Optuna study object
df_results = create_results_dataframe(study)

In [22]:
df_results.head()

Unnamed: 0,train_loss,train_accuracy,train_precision,train_recall,train_f1,train_pr_auc,train_roc_auc,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_pr_auc,val_roc_auc,trial_number,value
3,7.994279e-07,0.9958,0.994022,0.9976,0.995808,0.996411,0.9958,0.586826,0.853333,0.856902,0.848333,0.852596,0.890535,0.853333,3,0.8526
4,2.842559e-06,0.8794,0.871524,0.89,0.880665,0.908262,0.8794,0.405379,0.824167,0.772028,0.92,0.839544,0.866014,0.824167,4,0.8395
0,0.0003037903,0.558,0.560116,0.5404,0.550081,0.665158,0.558,0.672019,0.583333,0.55102,0.9,0.683544,0.75051,0.583333,0,0.7986
1,0.0002741836,0.7764,0.779305,0.7712,0.775231,0.832452,0.7764,0.449532,0.7925,0.877419,0.68,0.766197,0.85871,0.7925,1,0.7662
2,0.0001627758,0.7978,0.825536,0.7552,0.788803,0.851568,0.7978,0.479383,0.763333,0.819838,0.675,0.740402,0.828669,0.763333,2,0.7418


# Graveyard

In [23]:
# class CNNForWord2Vec(nn.Module):
#     def __init__(self, input, embedding_dim, num_filters, filter_sizes, dropout_rate):
#         super(CNNForWord2Vec, self).__init__()
#         self.convs = nn.ModuleList([
#             nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=k) for k in filter_sizes
#         ])
#         self.dropout = nn.Dropout(dropout_rate)

#     def forward(self, x):
#         # x shape: [batch_size, max_sequence_length, embedding_dim]
#         x = x.unsqueeze(1)  # Add channel dimension: [batch_size, 1, max_sequence_length, embedding_dim]

#         # Apply convolution and ReLU. Output shape: [batch_size, num_filters, L, 1]
#         x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]

#         # Apply global average pooling. Output shape: [batch_size, num_filters]
#         x = [F.avg_pool1d(i, i.size(2)).squeeze(2) for i in x]

#         # Concatenate along the filter dimension
#         x = torch.cat(x, 1)

#         x = self.dropout(x)  # Apply dropout
#         return x


# embedding_dim = 768  # Dimension of Word2Vec embeddings
# num_filters = 100  # Number of filters per filter size
# filter_sizes = [3, 4, 5]  # Sizes of filters

# model = CNNForWord2Vec(embedding_dim, num_filters, filter_sizes,dropout_rate=0.1)

# # Example input tensor representing padded sequences of Word2Vec embeddings
# word2vec_embeddings = torch.randn(32, 65, embedding_dim)  # Example: batch_size=32, max_sequence_length=65

# # Forward pass through the model
# cnn_output = model(word2vec_embeddings)

# print("Output shape:", cnn_output.shape)
# # The output shape will be [batch_size, num_filters * len(filter_sizes)] due to the concatenation

In [24]:
# class LSTMWithGAP(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim, num_layers):
#         super(LSTMWithGAP, self).__init__()
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

#     def forward(self, x):
#         # x shape: [batch_size, sequence_length, embedding_dim]
#         lstm_out, (hidden, cell) = self.lstm(x)
#         # lstm_out shape: [batch_size, sequence_length, hidden_dim]

#         # Apply Global Average Pooling across the sequence dimension
#         gap_out = torch.mean(lstm_out, dim=1)
#         # gap_out shape: [batch_size, hidden_dim]

#         return gap_out

# # Example usage
# embedding_dim = 768  # Dimension of Word2Vec embeddings
# hidden_dim = 128  # Hidden dimension of the LSTM
# num_layers = 2  # Number of LSTM layers

# model = LSTMWithGAP(embedding_dim, hidden_dim, num_layers)

# # Example input tensor representing padded sequences of Word2Vec embeddings
# word2vec_embeddings = torch.randn(32, 65, embedding_dim)  # Example: batch_size=32, sequence_length=65

# # Forward pass through the model
# lstm_output = model(word2vec_embeddings)

# print("Output shape:", lstm_output.shape)
# # The output shape will be [batch_size, hidden_dim] because of the Global Average Pooling

In [25]:

# # Reduce
# n_rows = 1000
# train_df = pd.concat([train_df[train_df['label']==0].sample(n=n_rows, random_state=42),train_df[train_df['label']==1].sample(n=n_rows, random_state=42)])
# val_df = pd.concat([val_df[val_df['label']==0].sample(n=n_rows, random_state=42),val_df[val_df['label']==1].sample(n=n_rows, random_state=42)])

In [26]:
# train_df.head()

In [27]:
# class BERTClass(nn.Module):
#     def __init__(self, dropout_rate):
#         super(BERTClass, self).__init__()
#         self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.pooling = nn.AdaptiveAvgPool1d(1)
#         self.l2 = lora.Linear(768, 1, r=16)  # LoRA layer
#         self.l3 = nn.Dropout(dropout_rate)

#     def forward(self, ids, mask, token_type_ids, return_embeddings=False):
#         outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
#         last_hidden_state = outputs.last_hidden_state
#         # Apply pooling across the sequence dimension (dim=1) and then squeeze the pooled output
#         pooled_output = self.pooling(last_hidden_state.transpose(1, 2)).squeeze(-1)
#         output_2 = self.l2(pooled_output)
#         output_3 = self.l3(output_2)
#         if return_embeddings:
#             return output_3, output_2, last_hidden_state

#         return output_3

# # Create a class for the dataset
# class CustomDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_len):
#         self.tokenizer = tokenizer
#         self.data = dataframe
#         self.text = dataframe.text
#         self.targets = self.data.label
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.text)

#     def __getitem__(self, index):
#         text = str(self.text[index])
#         text = " ".join(text.split())

#         inputs = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding="max_length",
#             truncation=True,
#             return_token_type_ids=True,
#             return_attention_mask=True,
#             return_tensors='pt'
#         )

#         ids = inputs['input_ids']
#         mask = inputs['attention_mask']
#         token_type_ids = inputs.get("token_type_ids", None)

#         return {
#                 'ids': ids.squeeze(),
#                 'mask': mask.squeeze(),
#                 'token_type_ids': token_type_ids.squeeze() if token_type_ids is not None else None,
#                 'targets': torch.tensor(self.targets[index], dtype=torch.float)
#             }


# # Function to calculate metrics
# def calculate_metrics(targets, outputs):
#     accuracy = accuracy_score(targets, outputs)
#     precision = precision_score(targets, outputs)
#     recall = recall_score(targets, outputs)
#     f1 = f1_score(targets, outputs)
#     precision_vals, recall_vals, _ = precision_recall_curve(targets, outputs)
#     pr_auc = auc(recall_vals, precision_vals)
#     roc_auc = roc_auc_score(targets, outputs)
#     return accuracy, precision, recall, f1, pr_auc, roc_auc


# def objective(trial):

#   parameters = {
#       'batch_size': trial.suggest_int('batch_size', 2, 4),
#       'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
#       'epochs': trial.suggest_int('epochs', 3, 5),
#       'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
#       'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5, log=True),
#       'optimizer': trial.suggest_categorical('optimizer', ['AdamW', 'Adam','SGD']),
#       'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True),
#       'max_len': trial.suggest_int('max_len', 50, 100)
#     }

#   # Set the parameters
#   batch_size = parameters['batch_size']
#   learning_rate = parameters['learning_rate']
#   epochs = parameters['epochs']
#   dropout_rate = parameters['dropout_rate']
#   max_len = parameters['max_len']
#   optimizer = parameters['optimizer']
#   weight_decay = parameters['weight_decay']
#   train_params = {'batch_size': batch_size,'shuffle': True}

#   # Instantiate tokenizer and model
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   bert_model = BertModel.from_pretrained('bert-base-uncased')

#   # Pass train and test to dataloader
#   training_set = CustomDataset(train_df, tokenizer, max_len)
#   val_set = CustomDataset(val_df, tokenizer, max_len)

#   # Create the dataloaders
#   training_loader = DataLoader(training_set, **train_params)
#   val_loader = DataLoader(val_set, **train_params)

#   # Instantiate model
#   model = BERTClass(dropout_rate)
#   lora.mark_only_lora_as_trainable(model)

#   # Move the model to the GPU
#   if torch.cuda.is_available():
#       model = model.to('cuda')

#   # Create the optimizer
#   if optimizer == 'AdamW':
#     optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   elif optimizer == 'Adam':
#     optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   elif optimizer == 'SGD':
#     optimizer = SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   else:
#     raise ValueError("Invalid optimizer")

#   # Create the loss function
#   loss_function = nn.BCEWithLogitsLoss()

#   # Instantiate pruner
#   pruner = MedianPruner()

#   # Initialize lists to store metrics
#   metrics = {
#       'train': {'loss': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []},
#       'val': {'loss':[], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}
#   }

#   gradients_stats = []

#   def collect_gradients(module, grad_input, grad_output):
#         grad_stats = {
#           'layer': module.__class__.__name__,
#           'grad_input_mean': [grad.mean().item() for grad in grad_input if grad is not None],
#           'grad_input_std': [grad.std().item() for grad in grad_input if grad is not None],
#           'grad_output_mean': [grad.mean().item() for grad in grad_output if grad is not None],
#           'grad_output_std': [grad.std().item() for grad in grad_output if grad is not None],
#         }
#         gradients_stats.append(grad_stats)

#   # Assuming `model` is already defined
#   model.l2.register_full_backward_hook(collect_gradients)

#   # Define threshold
#   threshold = 0.5

#   # Training loop with metrics calculation
#   for epoch in range(epochs):
#       model.train()
#       train_targets = []
#       train_outputs = []

#       # Training phase
#       total_train_iterations = len(training_loader)
#       for i, data in tqdm(enumerate(training_loader,0),total=total_train_iterations, desc="Training"):
#           ids = data['ids'].to('cuda', dtype=torch.long)
#           mask = data['mask'].to('cuda', dtype=torch.long)
#           token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
#           targets = data['targets'].to('cuda', dtype=torch.float)

#           # Forward pass
#           outputs = model(ids, mask, token_type_ids)
#           optimizer.zero_grad()
#           loss = loss_function(outputs, targets.unsqueeze(1))
#           loss.backward()
#           optimizer.step()
#           train_targets.extend(targets.cpu().detach().numpy().tolist())
#           train_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

#       # Save gradients for analysis
#       with open('./gradient_statistics.csv', 'w', newline='') as csvfile:
#         fieldnames = ['layer', 'grad_input_mean', 'grad_input_std', 'grad_output_mean', 'grad_output_std']
#         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#         writer.writeheader()
#         for grad_stat in gradients_stats:
#           writer.writerow(grad_stat)

#       # Calculate and store training metrics
#       train_outputs_bin = np.array(train_outputs) >= threshold
#       train_acc, train_prec, train_rec, train_f1, train_pr_auc, train_roc_auc = calculate_metrics(np.array(train_targets), train_outputs_bin)
#       train_loss = loss.item()
#       metrics['train']['loss'].append(round(train_loss,4))
#       metrics['train']['accuracy'].append(round(train_acc,4))
#       metrics['train']['precision'].append(round(train_prec,4))
#       metrics['train']['recall'].append(round(train_rec,4))
#       metrics['train']['f1'].append(round(train_f1,4))
#       metrics['train']['pr_auc'].append(round(train_pr_auc,4))
#       metrics['train']['roc_auc'].append(round(train_roc_auc,4))

#       # Validation phase
#       model.eval()
#       val_targets = []
#       val_outputs = []
#       with torch.no_grad():
#           total_val_iterations = len(val_loader)
#           for data in tqdm(val_loader, total=total_val_iterations, desc="Validation"):
#             ids = data['ids'].to('cuda', dtype=torch.long)
#             mask = data['mask'].to('cuda', dtype=torch.long)
#             token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
#             targets = data['targets'].to('cuda', dtype=torch.float)

#             # Forward pass
#             outputs = model(ids, mask, token_type_ids)
#             outputs = torch.sigmoid(outputs).squeeze()
#             val_targets.extend(targets.cpu().detach().numpy().tolist())
#             output_list = torch.sigmoid(outputs).cpu().detach().numpy().flatten().tolist()
#             val_outputs.extend(output_list)

#       # Calculate and store validation metrics
#       val_outputs_bin = np.array(val_outputs) >= threshold
#       val_acc, val_prec, val_rec, val_f1, val_pr_auc, val_roc_auc = calculate_metrics(np.array(val_targets), val_outputs_bin)
#       val_loss = loss.item()
#       metrics['val']['loss'].append(round(val_loss,4))
#       metrics['val']['accuracy'].append(round(val_acc,4))
#       metrics['val']['precision'].append(round(val_prec,4))
#       metrics['val']['recall'].append(round(val_rec,4))
#       metrics['val']['f1'].append(round(val_f1,4))
#       metrics['val']['pr_auc'].append(round(val_pr_auc,4))
#       metrics['val']['roc_auc'].append(round(val_roc_auc,4))

#       print(f"Epoch {epoch+1}/{epochs} - Train Metrics: Loss: {train_loss}, Accuracy: {train_acc}, Precision: {train_prec}, Recall: {train_rec}, F1: {train_f1}, PR AUC: {train_pr_auc}, ROC AUC: {train_roc_auc}")
#       print(f"Epoch {epoch+1}/{epochs} - Val Metrics: Loss: {val_loss},  Accuracy: {val_acc}, Precision: {val_prec}, Recall: {val_rec}, F1: {val_f1}, PR AUC: {val_pr_auc}, ROC AUC: {val_roc_auc}")
#       trial.report(val_f1, epoch)
#       if trial.should_prune():
#         raise optuna.exceptions.TrialPruned()

#       # At the end of your objective function, before returning the optimization metric
#       trial.set_user_attr("train_loss", train_loss)
#       trial.set_user_attr("train_accuracy", train_acc)
#       trial.set_user_attr("train_precision", train_prec)
#       trial.set_user_attr("train_recall", train_rec)
#       trial.set_user_attr("train_f1", train_f1)
#       trial.set_user_attr("train_pr_auc", train_pr_auc)
#       trial.set_user_attr("train_roc_auc", train_roc_auc)

#       trial.set_user_attr("val_loss", val_loss)
#       trial.set_user_attr("val_accuracy", val_acc)
#       trial.set_user_attr("val_precision", val_prec)
#       trial.set_user_attr("val_recall", val_rec)
#       trial.set_user_attr("val_f1", val_f1)
#       trial.set_user_attr("val_pr_auc", val_pr_auc)
#       trial.set_user_attr("val_roc_auc", val_roc_auc)

#   return np.max(metrics['val']['f1'])


In [28]:
# # Empty cash
# torch.cuda.empty_cache()

# # Run trials
# study = optuna.create_study(direction='maximize', pruner=MedianPruner())
# study.optimize(objective, n_trials=1)

# # Get the best hyperparameters
# best_params = study.best_params
# print(best_params)

# gr = pd.read_csv(f'./gradient_statistics.csv')
# print(gr.describe())

# def create_results_dataframe(study):
#     # Create a list to hold all trial data
#     trial_data = []

#     # Iterate through all completed trials
#     for trial in study.trials:
#         # Retrieve the user attributes for the trial
#         user_attrs = trial.user_attrs
#         user_attrs["trial_number"] = trial.number
#         user_attrs["value"] = trial.value  # The objective value (e.g., validation F1 score)

#         # Append the trial data to the list
#         trial_data.append(user_attrs)

#     # Create a DataFrame from the list of trial data
#     df = pd.DataFrame(trial_data)

#     # Optionally, you might want to sort the DataFrame based on the objective value or another metric
#     df = df.sort_values("value", ascending=False)

#     return df

# # Assuming 'study' is your Optuna study object
# df_results = create_results_dataframe(study)

In [29]:
# # Instantiate tokenizer, data and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# train_dataset = CustomDataset(train_df, tokenizer, max_len=128)  # Example max_len
# train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)  # Single batch for simplicity
# model = BERTClass(dropout_rate=0.1).to('cuda')
# model.eval()

# # Get a batch from your DataLoader
# for batch in train_loader:
#     ids, mask, token_type_ids, targets = batch['ids'].to('cuda'), batch['mask'].to('cuda'), batch['token_type_ids'].to('cuda'), batch['targets'].to('cuda')

#     # Forward pass to get raw and adapted embeddings
#     output, adapted_embeddings, raw_embeddings = model(ids, mask, token_type_ids, return_embeddings=True)

#     break



In [30]:
# raw_embeddings.size()

In [31]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# # Assuming raw_embeddings is the tensor you're trying to visualize
# embeddings_pca = PCA(n_components=2).fit_transform(raw_embeddings.detach().cpu().numpy())

# plt.figure(figsize=(10, 6))
# plt.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1])
# plt.title('PCA visualization of Embeddings')
# plt.show()



In [32]:
# # Show results
# val_outputs = np.array(val_outputs) >= threshold
# val_targets = np.array(val_targets)
# print(classification_report(val_targets, val_outputs))

# class BERTClass(nn.Module):
#     def __init__(self,dropout_rate):
#         super(BERTClass, self).__init__()
#         self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = nn.Dropout(dropout_rate)
#         self.l3 = nn.Linear(768, 1)

#     def forward(self, ids, mask, token_type_ids):
#         output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
#         output_2 = self.l2(output_1.pooler_output)
#         output = self.l3(output_2)
#         return output

In [33]:
# class BERTClass(nn.Module):
#     def __init__(self, bert_model, dropout_rate, word2vec_embeddings, num_classes=2, bert_embedding_dim=768):
#         super(BERTClass, self).__init__()
#         self.bert_model = bert_model
#         self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)
#         self.dropout = nn.Dropout(dropout_rate)
#         self.word2vec_embeddings = word2vec_embeddings
#         # Input dimension to the output layer is doubled because of concatenation
#         self.output_layer = nn.Linear(bert_embedding_dim * 2, num_classes)

#     def forward(self, input_ids, attention_mask, word2vec_embeddings):
#         # Obtain BERT embeddings
#         with torch.no_grad():
#             bert_outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
#         bert_embeddings = bert_outputs.last_hidden_state

#         # Concatenate BERT and Word2Vec embeddings along the embedding dimension
#         print('bert_embeddings:',bert_embeddings.shape)
#         print('word2vec_embeddings:',word2vec_embeddings)
#         combined_embeddings = torch.cat((bert_embeddings, word2vec_embeddings), dim=-1)

#         # Apply global average pooling across the sequence length dimension
#         pooled_embeddings = self.global_avg_pooling(combined_embeddings.permute(0, 2, 1)).squeeze(-1)

#         # Apply dropout
#         final_embeddings = self.dropout(pooled_embeddings)

#         # Pass through the output layer for final classification scores
#         output = self.output_layer(final_embeddings)

#         return output


In [34]:
# class BERTCNNClass(nn.Module):
#     def __init__(self, dropout_rate, embedding_dim, cnn_output_channels, kernel_size, bert_hidden_size):
#         super(BERTCNNClass, self).__init__()
#         self.bert = BertModel.from_pretrained('bert-base-uncased')
#         self.dropout = nn.Dropout(dropout_rate)

#         # CNN for static embeddings
#         self.cnn = nn.Conv1d(in_channels=embedding_dim, out_channels=cnn_output_channels, kernel_size=kernel_size, padding=1)

#         # Global Average Pooling
#         self.cnn_gap = nn.AdaptiveAvgPool1d(1)
#         self.bert_gap = nn.AdaptiveAvgPool1d(1)

#         # Linear layer for concatenated features
#         self.fc = nn.Linear(cnn_output_channels + bert_hidden_size, 1)

#     def forward(self, ids, mask, token_type_ids, static_embeddings):
#         # BERT path
#         bert_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
#         bert_last_hidden_state = bert_output.last_hidden_state
#         bert_gap = self.bert_gap(bert_last_hidden_state.transpose(1, 2)).squeeze(2)

#         # Prepare static embeddings for CNN
#         static_embeddings = static_embeddings.permute(0, 2, 1)

#         # Prepare for CNN
#         cnn_output = torch.relu(self.cnn(static_embeddings.transpose(1, 2)))
#         cnn_gap = self.cnn_gap(cnn_output).squeeze(2)

#         # Concatenate and final linear layer
#         concatenated_features = torch.cat([bert_gap, cnn_gap], dim=1)
#         output = self.dropout(concatenated_features)
#         output = self.fc(output)

#         return output


# # Create a class for the dataset
# class CustomDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_len):
#         self.tokenizer = tokenizer
#         self.data = dataframe
#         self.text = dataframe.text
#         self.targets = self.data.label
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.text)

#     def __getitem__(self, index):
#         text = str(self.text[index])
#         text = " ".join(text.split())

#         inputs = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding="max_length",
#             truncation=True,
#             return_token_type_ids=True,
#             return_attention_mask=True,
#             return_tensors='pt'
#         )

#         ids = inputs['input_ids']
#         mask = inputs['attention_mask']
#         token_type_ids = inputs.get("token_type_ids", None)

#         return {
#                 'ids': ids.squeeze(),
#                 'mask': mask.squeeze(),
#                 'token_type_ids': token_type_ids.squeeze() if token_type_ids is not None else None,
#                 'targets': torch.tensor(self.targets[index], dtype=torch.float)
#             }


# # Function to calculate metrics
# def calculate_metrics(targets, outputs):
#     accuracy = accuracy_score(targets, outputs)
#     precision = precision_score(targets, outputs)
#     recall = recall_score(targets, outputs)
#     f1 = f1_score(targets, outputs)
#     precision_vals, recall_vals, _ = precision_recall_curve(targets, outputs)
#     pr_auc = auc(recall_vals, precision_vals)
#     roc_auc = roc_auc_score(targets, outputs)
#     return accuracy, precision, recall, f1, pr_auc, roc_auc


# def objective(trial):

#   parameters = {
#       'batch_size': trial.suggest_int('batch_size', 4, 4),
#       'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
#       'epochs': trial.suggest_int('epochs', 1, 1),
#       # trial.suggest_int('max_len', 128, 512),
#       'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
#       'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5, log=True),
#       'optimizer': trial.suggest_categorical('optimizer', ['AdamW', 'Adam','SGD']),
#       'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True),
#       'kernel_size': trial.suggest_int('kernel_size', 3, 5),
#       'cnn_output_channels': trial.suggest_int('cnn_output_channels', 64, 256)
#     }

#   # Set the parameters
#   batch_size = parameters['batch_size']
#   learning_rate = parameters['learning_rate']
#   epochs = parameters['epochs']
#   max_len = 65 # parameters['max_len']
#   dropout_rate = parameters['dropout_rate']
#   optimizer = parameters['optimizer']
#   weight_decay = parameters['weight_decay']
#   kernel_size = parameters['kernel_size']
#   cnn_output_channels = parameters['cnn_output_channels']
#   train_params = {'batch_size': batch_size,
#                   'shuffle': True,
#                   }

#   # Pass train and test to dataloader
#   training_set = CustomDataset(train_df, tokenizer, max_len)
#   val_set = CustomDataset(val_df, tokenizer, max_len)

#   # Create the dataloaders
#   training_loader = DataLoader(training_set, **train_params)
#   val_loader = DataLoader(val_set, **train_params)

#   # Instantiate model
#   static_embeddings_padded = pad_sequence([torch.tensor(seq).clone().detach() for seq in padded_sequences], batch_first=True).to('cuda')
#   embedding_dim = 100 # static_embeddings_padded.size(1)

#   # Instantiate model
#   model = BERTCNNClass(
#     dropout_rate=dropout_rate,
#     cnn_output_channels=cnn_output_channels,
#     embedding_dim=embedding_dim,
#     kernel_size=kernel_size,
#     bert_hidden_size=768
# )

#   # Move the model to the GPU
#   if torch.cuda.is_available():
#       model = model.to('cuda')

#   # Create the optimizer
#   if optimizer == 'AdamW':
#     optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   elif optimizer == 'Adam':
#     optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   elif optimizer == 'SGD':
#     optimizer = SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   else:
#     raise ValueError("Invalid optimizer")

#   # Create the loss function
#   loss_function = nn.BCEWithLogitsLoss()

#   # Instantiate pruner
#   pruner = MedianPruner()

#   # Initialize lists to store metrics
#   metrics = {
#       'train': {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []},
#       'val': {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}
#   }

#   # Define threshold
#   threshold = 0.5

#   # Training loop with metrics calculation
#   for epoch in range(epochs):
#       model.train()
#       train_targets = []
#       train_outputs = []

#       # Training phase
#       total_train_iterations = len(training_loader)
#       for i, data in tqdm(enumerate(training_loader,0),total=total_train_iterations, desc="Training"):
#           ids = data['ids'].to('cuda', dtype=torch.long)
#           mask = data['mask'].to('cuda', dtype=torch.long)
#           token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
#           targets = data['targets'].to('cuda', dtype=torch.float)

#           # Forward pass
#           outputs = model(ids, mask, token_type_ids, static_embeddings_padded[i])
#           optimizer.zero_grad()
#           loss = loss_function(outputs, targets.unsqueeze(1))
#           loss.backward()
#           optimizer.step()
#           train_targets.extend(targets.cpu().detach().numpy().tolist())
#           train_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

#       # Calculate and store training metrics
#       train_outputs_bin = np.array(train_outputs) >= threshold
#       train_acc, train_prec, train_rec, train_f1, train_pr_auc, train_roc_auc = calculate_metrics(np.array(train_targets), train_outputs_bin)
#       metrics['train']['accuracy'].append(round(train_acc,4))
#       metrics['train']['precision'].append(round(train_prec,4))
#       metrics['train']['recall'].append(round(train_rec,4))
#       metrics['train']['f1'].append(round(train_f1,4))
#       metrics['train']['pr_auc'].append(round(train_pr_auc,4))
#       metrics['train']['roc_auc'].append(round(train_roc_auc,4))

#       # Validation phase
#       model.eval()
#       val_targets = []
#       val_outputs = []
#       with torch.no_grad():
#           total_val_iterations = len(val_loader)
#           for data in tqdm(val_loader, total=total_val_iterations, desc="Validation"):
#             ids = data['ids'].to('cuda', dtype=torch.long)
#             mask = data['mask'].to('cuda', dtype=torch.long)
#             token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
#             targets = data['targets'].to('cuda', dtype=torch.float)

#             # Forward pass
#             outputs = model(ids, mask, token_type_ids, static_embeddings_padded[i])
#             outputs = torch.sigmoid(outputs).squeeze()
#             val_targets.extend(targets.cpu().detach().numpy().tolist())
#             val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

#       # Calculate and store validation metrics
#       val_outputs_bin = np.array(val_outputs) >= threshold
#       val_acc, val_prec, val_rec, val_f1, val_pr_auc, val_roc_auc = calculate_metrics(np.array(val_targets), val_outputs_bin)
#       metrics['val']['accuracy'].append(round(val_acc,4))
#       metrics['val']['precision'].append(round(val_prec,4))
#       metrics['val']['recall'].append(round(val_rec,4))
#       metrics['val']['f1'].appendround(round(val_f1,4))
#       metrics['val']['pr_auc'].append(round(val_pr_auc,4))
#       metrics['val']['roc_auc'].append(round(val_roc_auc,4))

#       print(f"Epoch {epoch+1}/{epochs} - Train Metrics: Accuracy: {train_acc}, Precision: {train_prec}, Recall: {train_rec}, F1: {train_f1}, PR AUC: {train_pr_auc}, ROC AUC: {train_roc_auc}")
#       print(f"Epoch {epoch+1}/{epochs} - Val Metrics: Accuracy: {val_acc}, Precision: {val_prec}, Recall: {val_rec}, F1: {val_f1}, PR AUC: {val_pr_auc}, ROC AUC: {val_roc_auc}")
#       trial.report(val_f1, epoch)
#       if trial.should_prune():
#         raise optuna.exceptions.TrialPruned()

#   return np.max(metrics['val']['f1'])


In [35]:
# class CNNForWord2Vec(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, dropout_rate):
#         super(CNNForWord2Vec, self).__init__()

#         # Static word embeddings layer.
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)

#         # Convolutional layers for different filter sizes applied to the word embeddings.
#         self.convs = nn.ModuleList([
#             nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(k, embedding_dim), padding=(k - 1, 0))
#             for k in filter_sizes
#         ])

#         # Batch normalization applied to the output of convolutional layers.
#         self.conv_bn = nn.ModuleList([
#             nn.BatchNorm2d(num_filters) for _ in filter_sizes
#         ])

#         # Global average pooling applied to the output of each convolutional layer.
#         self.cnn_global_avg_pool = nn.AdaptiveAvgPool2d((1, num_filters))

#         # BERT model to obtain contextual embeddings from input tokens.
#         self.bert_embedding = BertModel.from_pretrained('bert-base-uncased')

#         # Global average pooling layer to process BERT embeddings.
#         self.bert_global_avg_pool = nn.AdaptiveAvgPool1d(1)

#         # Dropout layer for regularization.
#         self.dropout = nn.Dropout(dropout_rate)

#         # Fully connected layer for classification. Since features from both CNN and BERT embeddings are concatenated,
#         # the input features are doubled.
#         self.fc = nn.Linear(num_filters * len(filter_sizes) + embedding_dim, 1)

#     def forward(self, x, bert_input_ids, bert_attention_mask):
#         # Convert token ids to embeddings
#         x = self.embedding(x)  # [batch_size, seq_length, embedding_dim]

#         # Add a channel dimension and apply convolutional layers followed by batch normalization.
#         x = x.unsqueeze(1)  # Add channel dimension
#         x = [F.relu(bn(conv(x))) for conv, bn in zip(self.convs, self.conv_bn)]

#         # Apply global average pooling to the output of each convolutional layer and flatten the result.
#         x = [self.cnn_global_avg_pool(xi).view(xi.size(0), -1) for xi in x]
#         x = torch.cat(x, 1)  # Concatenate along the filter dimension

#         # Get BERT embeddings and apply global average pooling.
#         bert_embeddings = self.bert_embedding(input_ids=bert_input_ids, attention_mask=bert_attention_mask)['last_hidden_state']
#         x_bert = self.bert_global_avg_pool(bert_embeddings.permute(0, 2, 1)).squeeze(2)

#         # Concatenate the outputs from CNN and BERT embeddings.
#         x = torch.cat((x, x_bert), 1)

#         # Apply dropout and pass through the fully connected layer for classification.
#         x = self.dropout(x)
#         x = self.fc(x)
#         return x
