# Project 266

## Setup

## Import libraries

In [2]:
!pip install loralib
!pip install optuna

Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl (10 kB)
Installing collected packages: loralib
Successfully installed loralib-0.1.2
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8

In [3]:
# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import AdamW, Adam, SGD
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import autocast

# from keras.preprocessing.sequence import pad_sequences

# Gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score, precision_recall_curve, classification_report

# Bert
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Admin
import os
import time
from tqdm.auto import tqdm
import re
import gc

# Data
import pandas as pd
import numpy as np
import random
import h5py

# loRA
import loralib as lora

# Gradients
import csv

# Optuna
import optuna
from optuna.pruners import MedianPruner

# Visualizations
import matplotlib.pyplot as plt

## Functions

In [4]:
# Combined Cleaning and Preprocessing Function
def clean_and_preprocess_tweets(df):
    def clean_tweet(tweet):
        # Remove URLs
        tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
        # Remove user mentions
        tweet = re.sub(r'@\w+', '', tweet)
        # Remove excessive whitespace
        tweet = re.sub(r'\s+', ' ', tweet).strip()
        return tweet

    # Apply cleaning and simple preprocessing
    return df['text'].apply(lambda x: simple_preprocess(clean_tweet(x)))

# Streamlined Training and Sequence Preparation
def prepare_word2vec_sequences(df, max_len=None):
    # Step 1: Clean and preprocess tweets
    tweets_preprocessed = clean_and_preprocess_tweets(df)

    # Step 2: Train Word2Vec
    word2vec_model = Word2Vec(sentences=tweets_preprocessed, vector_size=768, window=5, min_count=1, workers=4)

    # Step 3: Create word to index mapping
    word_index = {word: i for i, word in enumerate(word2vec_model.wv.index_to_key)}

    # Step 4: Convert tweets to sequences of indices
    sequences = [[word_index.get(word, 0) for word in tweet] for tweet in tweets_preprocessed]

    # Convert sequences to PyTorch tensors before padding
    sequences_tensors = [torch.tensor(seq) for seq in sequences]

    # Step 5: Pad sequences
    max_len = max(len(seq) for seq in sequences_tensors)

    # Step 6: Make padded sequence
    padded_sequences = pad_sequence(sequences_tensors, batch_first=True, padding_value=0)

    # Dimensions
    word2vec_dim = 300
    bert_dim = 768

    # Linear transformation layer
    projection_layer = nn.Linear(in_features=word2vec_dim, out_features=bert_dim)
    torch.nn.init.xavier_uniform_(projection_layer.weight)

    # Example Word2Vec embeddings tensor ([batch_size, sequence_length, word2vec_dim])
    word2vec_embeddings = torch.randn(len(padded_sequences), max_len, word2vec_dim)

    # Project embeddings
    projected_embeddings = projection_layer(word2vec_embeddings)

    return projected_embeddings, word2vec_model

# Prepare embeddings

In [5]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [6]:
# Import df
df = pd.read_csv('/content/drive/My Drive/266 - Project/csv files for train_val_test and embeddings/text_for_embeddings.csv', encoding='utf-8')#, skiprows=[34331,67469,98406], error_bad_lines=False)
print('df size:', df.shape)

# Print pivot table
pd.pivot_table(df, index=['source'], columns='label', values=['text'], aggfunc='count')

# Instantiate tweets
tweets = df[df['source'].isin(['t3','t5'])].copy()

df size: (43102, 4)


In [7]:
# # Get padded sequences and word2vec model
# word2vec_embeddings, word2vec_model = prepare_word2vec_sequences(df)
# #Save
# word2vec_model.save("/content/drive/My Drive/266 - Project/word2vec_model.model")
# torch.save(word2vec_embeddings, '/content/drive/My Drive/266 - Project/word2vec_embeddings.pt')

In [8]:
# Load Word2vec model
# word2vec_model = Word2Vec.load("/content/drive/My Drive/266 - Project/project/support/word2vec_model.model")

In [9]:
# Load the precomputed Word2vec embeddings
# word2vec_embeddings = torch.load('/content/drive/My Drive/266 - Project/project/support/word2vec_embeddings.pt')

In [10]:
# with h5py.File('/content/drive/My Drive/266 - Project/project/support/word2vec_embeddings.h5', 'w') as h5f:
#     h5f.create_dataset('word2vec_embeddings', data=word2vec_embeddings.detach().numpy())

In [11]:
# # Print word2vec_embeddings shape, size
# print(f"word2vec_embeddings.shape: {word2vec_embeddings.shape}")
# print(f"Word2Vec embeddings size: {len(word2vec_embeddings)}")

In [12]:
# vocab_size = len(word2vec_model.wv.index_to_key)
# print(f"vocab_size: {vocab_size}")

In [13]:
# # Word2vec matrix
# embedding_matrix = np.zeros((len(word2vec_model.wv.index_to_key) + 1, word2vec_model.vector_size))  # +1 for padding token
# for i, word in enumerate(word2vec_model.wv.index_to_key):
#     embedding_vector = word2vec_model.wv[word]
#     if embedding_vector is not None:
#         embedding_matrix[i + 1] = embedding_vector  # i+1 to account for padding token at index 0

# print(f"embedding_matrix.shape: {embedding_matrix.shape}")

In [14]:
# # Generate word indices
# max_seq_length = 512
# word_indices = [[word2vec_model.wv.key_to_index[word] if word in word2vec_model.wv else 0 for word in simple_preprocess(text)] for text in df['text']]
# word_indices = pad_sequences(word_indices, maxlen=max_seq_length, padding='post', truncating='post')

In [15]:
# # Load the precomputed word indices
# word_indices = np.load('/content/drive/My Drive/266 - Project/csv files for train_val_test and embeddings/word_indices.npy', allow_pickle=True)

In [16]:
# # Print word_indices shape
# print(f"word_indices shape: {np.array(word_indices).shape}")

In [17]:
# Save the precomputed word indices
# np.save('/content/drive/My Drive/266 - Project/csv files for train_val_test and embeddings/word_indices.npy', word_indices)

# Create BERT embeddings

In [18]:
# bert_model = AutoModel.from_pretrained('bert-base-uncased')
# bert_model = bert_model.to('cuda')

In [19]:
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [20]:
# tokenized_texts = []
# for text in tweets['text']:
#     tokens = tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length = 512,
#             padding="max_length",
#             truncation=True,
#             return_tensors='pt'
#         )
#     tokenized_texts.append(tokens)

In [21]:
# # Batch embeddings generating
# class TextDataset(Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __len__(self):
#         return len(self.encodings['input_ids'])

#     def __getitem__(self, idx):
#         return {key: val[idx].clone().detach() for key, val in self.encodings.items()}

# texts = tweets['text'].tolist()

# # Tokenization
# encodings = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
# dataset = TextDataset(encodings)

# # DataLoader
# loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=8)

# def generate_embeddings(model, dataloader):
#     model.eval()
#     embeddings = []
#     for batch in dataloader:
#         input_ids = batch['input_ids'].to('cuda')
#         attention_mask = batch['attention_mask'].to('cuda')
#         with torch.no_grad(), autocast():
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             # # Extract the [CLS] token embeddings
#             batch_embeddings = outputs.last_hidden_state[:,0,:].cpu()
#             embeddings.append(batch_embeddings)
#     # Concatenate the embeddings from each batch
#     embeddings = torch.cat(embeddings, dim=0)
#     return embeddings

# # Generate embeddings
# bert_embeddings = generate_embeddings(bert_model, loader)

In [22]:
# Load the precomputed BERT embeddings
# bert_embeddings = torch.load('/content/drive/My Drive/266 - Project/csv files for train_val_test and embeddings/bert_embeddings.pt')

In [23]:
# with h5py.File('/content/drive/My Drive/266 - Project/project/support/bert_embeddings.h5', 'w') as h5f:
#     h5f.create_dataset('bert_embeddings', data=bert_embeddings.detach().numpy())

In [24]:
# # Print bert_embeddings shape, size
# print(f"BERT embeddings shape: {bert_embeddings.shape}")
# print(f"BERT embeddings size: {len(bert_embeddings)}")

In [25]:
# Save the precomputed BERT embeddings
# torch.save(bert_embeddings, '/content/drive/My Drive/266 - Project/csv files for train_val_test and embeddings/bert_embeddings.pt')

In [26]:
# Empty GPU cache
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()

# Import training

In [27]:
# Import data sets
train_df = pd.read_csv(f'/content/drive/My Drive/266 - Project/csv files for train_val_test and embeddings/train_df.csv')
val_df = pd.read_csv(f'/content/drive/My Drive/266 - Project/csv files for train_val_test and embeddings/val_df.csv')
test_df = pd.read_csv(f'/content/drive/My Drive/266 - Project/csv files for train_val_test and embeddings/test_df.csv')

In [28]:
tf = train_df.copy()
tf['len'] = tf['text'].apply(lambda x: len(x))
tf.len.describe()

count    17990.000000
mean        96.605003
std        108.156943
min          7.000000
25%         52.000000
50%         67.000000
75%         86.000000
max       1167.000000
Name: len, dtype: float64

In [29]:
tf = val_df.copy()
tf['len'] = tf['text'].apply(lambda x: len(x))
tf.len.describe()

count    4498.000000
mean       95.427968
std       105.036609
min         8.000000
25%        52.000000
50%        67.000000
75%        86.000000
max      1312.000000
Name: len, dtype: float64

In [30]:

tf = test_df.copy()
tf['len'] = tf['text'].apply(lambda x: len(x))
tf.len.describe()

count    7028.000000
mean       98.281588
std       110.679004
min         8.000000
25%        52.000000
50%        67.000000
75%        86.000000
max       957.000000
Name: len, dtype: float64

# CNN with Global Average Pooling for Word2Vec Embeddings

In [33]:
class CNNForWord2VecBERT(nn.Module):
    def __init__(self, num_filters, filter_sizes, dropout_rate, word2vec_dim = 768, bert_dim = 768):
        super(CNNForWord2VecBERT, self).__init__()

        # Convolutional layers: Adjusted for embedding dimensions
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(k, word2vec_dim), padding=(k - 1, 0)) for k in filter_sizes
        ])

        # Batch normalization layers: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html#torch.nn.BatchNorm2d
        self.conv_bn = nn.ModuleList([
            nn.BatchNorm2d(num_filters) for _ in filter_sizes
        ])

        # Global Average Pooling layer for CNN features
        self.cnn_global_avg_pool = nn.AdaptiveAvgPool2d((1, num_filters))

        # global average pooling layer for BERT embeddings
        self.bert_global_avg_pool = nn.AdaptiveAvgPool1d(1)

        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)

        # Fully connected layer: Adjust according to your task
        self.fc = nn.Linear(num_filters * len(filter_sizes) + bert_dim, 1) # The "* 2" accounts for concatenation of avg and max pooling features

    def forward(self, x_w2v, x_bert):

        # Add a channel dimension: [batch_size, 1, max_sequence_length, embedding_dim]
        x_w2v = x_w2v.unsqueeze(1)

        # Apply convolutions and ReLU
        x_w2v = [F.relu(conv(x_w2v)).squeeze(3) for conv in self.convs]

        # Apply global average pooling
        x_w2v = [self.cnn_global_avg_pool(xi).squeeze(2) for xi in x_w2v]

        # Concatenate along the filter dimension
        x_w2v = torch.cat(x_w2v, 1)

        # Flatten
        x_w2v = x_w2v.view(x_w2v.size(0), -1)

        # Concatenate Word2Vec and BERT embeddings
        x_combined = torch.cat((x_w2v, x_bert), 1)

        # Apply dropout
        x_combined = self.dropout(x_combined)

        # Apply fully connected layer
        x_combined = self.fc(x_combined)

        return x_combined

In [42]:
# Create a class for the dataset
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.text = dataframe.text
        self.targets = torch.tensor(dataframe.label.values, dtype=torch.float32)
        self.word2vec_embeddings = h5py.File('/content/drive/My Drive/266 - Project/project/support/word2vec_embeddings.h5', 'r')['word2vec_embeddings']
        self.bert_embeddings = h5py.File('/content/drive/My Drive/266 - Project/project/support/bert_embeddings.h5', 'r')['bert_embeddings']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        word2vec_embeddings = torch.tensor(self.word2vec_embeddings[index][()])#.view(max_len, word2vec_dim)
        bert_embeddings = torch.tensor(self.bert_embeddings[index][()])#.view(max_len, bert_dim)
        targets = self.targets[index]
        return {
            'word2vec_embeddings': word2vec_embeddings,
            'bert_embeddings': bert_embeddings,
            'targets': targets
        }

    # Method to close the HDF5 file
    def close(self):
        self.word2vec_embeddings.file.close()
        self.bert_embeddings.file.close()

# Function to calculate metrics
def calculate_metrics(targets, outputs):
    accuracy = accuracy_score(targets, outputs)
    precision = precision_score(targets, outputs)
    recall = recall_score(targets, outputs)
    f1 = f1_score(targets, outputs)
    precision_vals, recall_vals, _ = precision_recall_curve(targets, outputs)
    pr_auc = auc(recall_vals, precision_vals)
    roc_auc = roc_auc_score(targets, outputs)
    return accuracy, precision, recall, f1, pr_auc, roc_auc

def objective(trial):
  # Define the hyperparameters to tune
  parameters = {
      'batch_size': trial.suggest_int('batch_size', 2, 4),
      'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
      'epochs': trial.suggest_int('epochs', 3, 5),
      'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
      'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5, log=True),
      'optimizer': trial.suggest_categorical('optimizer', ['AdamW', 'Adam','SGD']),
      'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True),
      'max_len': trial.suggest_int('max_len', 50, 100),
      'num_filters': trial.suggest_int('num_filters', 64, 256),
      'accumulation_steps': trial.suggest_int('accumulation_steps', 1, 10),

    }

  # Set the parameters
  batch_size = parameters['batch_size']
  learning_rate = parameters['learning_rate']
  epochs = parameters['epochs']
  dropout_rate = parameters['dropout_rate']
  max_len = parameters['max_len']
  optimizer = parameters['optimizer']
  weight_decay = parameters['weight_decay']
  num_filters = parameters['num_filters']
  accumulation_steps = parameters['accumulation_steps']

  # Define teh parameters
  train_params = {'batch_size': batch_size,'shuffle': True}
  filter_sizes = [3, 4, 5]  # Sizes of filters
  word2vec_dim = 768
  bert_dim = 768

  # Pass train and test to dataloader
  training_set = CustomDataset(train_df.head(10000))
  val_set = CustomDataset(val_df.head(5000))

  # Create the dataloaders
  training_loader = DataLoader(training_set, **train_params)
  val_loader = DataLoader(val_set, **train_params)

  # Instantiate model
  model = CNNForWord2VecBERT(num_filters, filter_sizes, dropout_rate, word2vec_dim, bert_dim)

  # Move the model to the GPU
  if torch.cuda.is_available():
      model = model.to('cuda')

  # Create the optimizer
  if optimizer == 'AdamW':
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  elif optimizer == 'Adam':
    optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  elif optimizer == 'SGD':
    optimizer = SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
  else:
    raise ValueError("Invalid optimizer")

  # Create the loss function
  loss_function = nn.BCEWithLogitsLoss()

  # Instantiate pruner
  pruner = MedianPruner()

  # Initialize lists to store metrics
  metrics = {
      'train': {'loss': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []},
      'val': {'loss':[], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}
  }

  # Define threshold
  threshold = 0.5

  # Training loop with metrics calculation
  for epoch in range(epochs):
      model.train()
      train_targets = []
      train_outputs = []

      # Training phase
      total_train_iterations = len(training_loader)
      total_loss = 0
      for i, data in tqdm(enumerate(training_loader,0),total=total_train_iterations, desc="Training"):
          word2vec_embeddings = data['word2vec_embeddings'].to('cuda')
          bert_embeddings = data['bert_embeddings'].to('cuda')
          targets = data['targets'].to('cuda')

          # Forward pass
          outputs = model(word2vec_embeddings, bert_embeddings)
          loss = loss_function(outputs, targets.unsqueeze(1))
          loss.backward()
          if (i + 1) % accumulation_steps == 0:  # Wait for several backward steps
              optimizer.step()  # Now we can do an optimizer step
              optimizer.zero_grad()  # Reset gradients tensors
          train_targets.extend(targets.cpu().detach().numpy().tolist())
          train_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      # Calculate and store training metrics
      train_outputs_bin = np.array(train_outputs) >= threshold
      train_acc, train_prec, train_rec, train_f1, train_pr_auc, train_roc_auc = calculate_metrics(np.array(train_targets), train_outputs_bin)
      total_loss += loss.item()
      train_loss = total_loss / len(training_loader)
      metrics['train']['loss'].append(round(train_loss,4))
      metrics['train']['accuracy'].append(round(train_acc,4))
      metrics['train']['precision'].append(round(train_prec,4))
      metrics['train']['recall'].append(round(train_rec,4))
      metrics['train']['f1'].append(round(train_f1,4))
      metrics['train']['pr_auc'].append(round(train_pr_auc,4))
      metrics['train']['roc_auc'].append(round(train_roc_auc,4))

     # Validation phase
      model.eval()
      val_targets = []
      val_outputs = []
      val_loss_accumulated = 0.0  # To accumulate loss over all validation batches

      with torch.no_grad():
          total_val_iterations = len(val_loader)
          for data in tqdm(val_loader, total=total_val_iterations, desc="Validation"):
              word2vec_embeddings = data['word2vec_embeddings'].to('cuda' if torch.cuda.is_available() else 'cpu')
              bert_embeddings = data['bert_embeddings'].to('cuda' if torch.cuda.is_available() else 'cpu')
              targets = data['targets'].to('cuda' if torch.cuda.is_available() else 'cpu')

              # Forward pass
              outputs = model(word2vec_embeddings, bert_embeddings)  # Assuming model outputs logits
              loss = loss_function(outputs, targets.unsqueeze(1))
              val_loss_accumulated += loss.item()

              outputs = torch.sigmoid(outputs).squeeze()  # Apply sigmoid once to get probabilities
              val_targets.extend(targets.cpu().detach().numpy())
              # Assuming outputs could be a scalar or an array, ensure it's always treated as an iterable
              outputs_np = outputs.cpu().detach().numpy()  # Convert to numpy array

              # If outputs_np is a scalar (0-d array), convert it into a 1-d array with a single value
              if outputs_np.ndim == 0:
                  outputs_np = np.expand_dims(outputs_np, axis=0)

              val_outputs.extend(outputs_np)
              # val_outputs.extend(outputs.cpu().detach().numpy())

      # Calculate average validation loss
      val_loss = val_loss_accumulated / total_val_iterations

      # Convert outputs to binary predictions based on the threshold
      val_outputs_bin = np.array(val_outputs) >= threshold
      # Now calculate and print metrics using val_targets and val_outputs_bin
      val_acc, val_prec, val_rec, val_f1, val_pr_auc, val_roc_auc = calculate_metrics(np.array(val_targets), val_outputs_bin)
      metrics['val']['loss'].append(round(val_loss,4))
      metrics['val']['accuracy'].append(round(val_acc,4))
      metrics['val']['precision'].append(round(val_prec,4))
      metrics['val']['recall'].append(round(val_rec,4))
      metrics['val']['f1'].append(round(val_f1,4))
      metrics['val']['pr_auc'].append(round(val_pr_auc,4))
      metrics['val']['roc_auc'].append(round(val_roc_auc,4))

      print(f"Epoch {epoch+1}/{epochs} - Train Metrics: Loss: {train_loss}, Accuracy: {train_acc}, Precision: {train_prec}, Recall: {train_rec}, F1: {train_f1}, PR AUC: {train_pr_auc}, ROC AUC: {train_roc_auc}")
      print(f"Epoch {epoch+1}/{epochs} - Val Metrics: Loss: {val_loss},  Accuracy: {val_acc}, Precision: {val_prec}, Recall: {val_rec}, F1: {val_f1}, PR AUC: {val_pr_auc}, ROC AUC: {val_roc_auc}")
      trial.report(val_f1, epoch)
      if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

      # At the end of your objective function, before returning the optimization metric
      trial.set_user_attr("train_loss", train_loss)
      trial.set_user_attr("train_accuracy", train_acc)
      trial.set_user_attr("train_precision", train_prec)
      trial.set_user_attr("train_recall", train_rec)
      trial.set_user_attr("train_f1", train_f1)
      trial.set_user_attr("train_pr_auc", train_pr_auc)
      trial.set_user_attr("train_roc_auc", train_roc_auc)

      trial.set_user_attr("val_loss", val_loss)
      trial.set_user_attr("val_accuracy", val_acc)
      trial.set_user_attr("val_precision", val_prec)
      trial.set_user_attr("val_recall", val_rec)
      trial.set_user_attr("val_f1", val_f1)
      trial.set_user_attr("val_pr_auc", val_pr_auc)
      trial.set_user_attr("val_roc_auc", val_roc_auc)

  return np.max(metrics['val']['f1'])


In [40]:
# Empty cash
!PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()

# Run trials
study = optuna.create_study(direction='maximize', pruner=MedianPruner())
study.optimize(objective, n_trials=10)

# Get the best hyperparameters
best_params = study.best_params
print(best_params)



[I 2024-04-02 10:28:34,670] A new study created in memory with name: no-name-fc3186b5-98f5-48fc-8b2f-520993740014


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 1/5 - Train Metrics: Loss: 0.00024695143699645993, Accuracy: 0.4994, Precision: 0.48042616451932607, Recall: 0.39995874587458746, F1: 0.43651508329581273, PR AUC: 0.5856424551969568, ROC AUC: 0.49646617417952976
Epoch 1/5 - Val Metrics: Loss: 0.6941715146700541,  Accuracy: 0.493552690084482, Precision: 0.4864864864864865, Recall: 0.6551410373066424, F1: 0.5583559519193485, PR AUC: 0.6550734328614377, ROC AUC: 0.4971357360446256


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 2/5 - Train Metrics: Loss: 0.00026579763889312744, Accuracy: 0.4999, Precision: 0.4794961136424551, Recall: 0.3690181518151815, F1: 0.4170649259820492, PR AUC: 0.5772071327288183, ROC AUC: 0.4960385790131807
Epoch 2/5 - Val Metrics: Loss: 0.6941659150653415,  Accuracy: 0.5133392618941752, Precision: 0.513595166163142, Recall: 0.07734303912647862, F1: 0.13444049031237645, PR AUC: 0.5209026286563709, ROC AUC: 0.5036715195632393


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 3/5 - Train Metrics: Loss: 0.0002875178098678589, Accuracy: 0.5081, Precision: 0.48995756718528993, Recall: 0.3572607260726073, F1: 0.4132172253369915, PR AUC: 0.5794091466289486, ROC AUC: 0.5036497729741918
Epoch 3/5 - Val Metrics: Loss: 0.6938258958922492,  Accuracy: 0.4968875055580258, Precision: 0.4839347503707365, Recall: 0.4454049135577798, F1: 0.46387112058753843, PR AUC: 0.6001745007059212, ROC AUC: 0.49574593503975944


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 4/5 - Train Metrics: Loss: 0.0002739218235015869, Accuracy: 0.5143, Precision: 0.4987860803884543, Recall: 0.3813943894389439, F1: 0.4322618351841028, PR AUC: 0.5900402349136991, ROC AUC: 0.5103788717381055
Epoch 4/5 - Val Metrics: Loss: 0.6937984007729424,  Accuracy: 0.510226767452201, Precision: 0.496551724137931, Recall: 0.1637852593266606, F1: 0.24632227163872736, PR AUC: 0.5344815197447458, ROC AUC: 0.5025448035763738


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

[I 2024-04-02 10:30:53,882] Trial 0 finished with value: 0.5861 and parameters: {'batch_size': 4, 'learning_rate': 0.00029788540000304036, 'epochs': 5, 'dropout_rate': 0.42872571616305444, 'optimizer': 'SGD', 'weight_decay': 4.6585304074828056e-05, 'max_len': 74, 'num_filters': 254, 'accumulation_steps': 3}. Best is trial 0 with value: 0.5861.


Epoch 5/5 - Train Metrics: Loss: 0.00027144355773925784, Accuracy: 0.5149, Precision: 0.49958847736625517, Recall: 0.3756188118811881, F1: 0.42882373719533734, PR AUC: 0.5889536446237217, ROC AUC: 0.5107907724002214
Epoch 5/5 - Val Metrics: Loss: 0.6948813226487901,  Accuracy: 0.49533125833703867, Precision: 0.48904443091905053, Recall: 0.7311191992720655, F1: 0.5860685630926331, PR AUC: 0.6757776799243708, ROC AUC: 0.5005595996360328


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

Epoch 1/3 - Train Metrics: Loss: 0.00022900370902191326, Accuracy: 0.5037, Precision: 0.48218159281066003, Recall: 0.320957095709571, F1: 0.3853869969040248, PR AUC: 0.5661693442601154, ROC AUC: 0.4983085167988848
Epoch 1/3 - Val Metrics: Loss: 0.6938865738709767,  Accuracy: 0.506447309915518, Precision: 0.4916918429003021, Recall: 0.2961783439490446, F1: 0.36967632027257236, PR AUC: 0.5659004113437485, ROC AUC: 0.5017848241484354


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

Epoch 2/3 - Train Metrics: Loss: 0.00019494124565856786, Accuracy: 0.5148, Precision: 0.4993788819875776, Recall: 0.3316831683168317, F1: 0.3986117997025285, PR AUC: 0.5775310251522047, ROC AUC: 0.5093974847795338
Epoch 2/3 - Val Metrics: Loss: 0.6940356679757436,  Accuracy: 0.5062249888839484, Precision: 0.4883720930232558, Recall: 0.2197452229299363, F1: 0.3031063696266081, PR AUC: 0.5446989425475164, ROC AUC: 0.4998726114649682


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

[I 2024-04-02 10:31:44,672] Trial 1 finished with value: 0.3851 and parameters: {'batch_size': 3, 'learning_rate': 1.7654759495880274e-05, 'epochs': 3, 'dropout_rate': 0.12007063816069286, 'optimizer': 'Adam', 'weight_decay': 0.0027330255357404082, 'max_len': 80, 'num_filters': 94, 'accumulation_steps': 1}. Best is trial 0 with value: 0.5861.


Epoch 3/3 - Train Metrics: Loss: 0.00023289973248102455, Accuracy: 0.5219, Precision: 0.5112453843571668, Recall: 0.31415016501650167, F1: 0.3891657084451259, PR AUC: 0.5789477746868341, ROC AUC: 0.515770734682164
Epoch 3/3 - Val Metrics: Loss: 0.6940634963512421,  Accuracy: 0.5080035571365051, Precision: 0.49464668094218417, Recall: 0.31528662420382164, F1: 0.38510697415948875, PR AUC: 0.5722632288291167, ROC AUC: 0.50373026862365


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 1/4 - Train Metrics: Loss: 0.00029036295413970947, Accuracy: 0.5024, Precision: 0.484819734345351, Recall: 0.42161716171617164, F1: 0.45101500441306264, PR AUC: 0.5934184480307613, ROC AUC: 0.5000166553922474
Epoch 1/4 - Val Metrics: Loss: 0.6935692981614007,  Accuracy: 0.5131169408626056, Precision: 0.5028694404591105, Recall: 0.318926296633303, F1: 0.3903118040089087, PR AUC: 0.5773051606760422, ROC AUC: 0.5088109744036081


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 2/4 - Train Metrics: Loss: 0.00028083741664886475, Accuracy: 0.5087, Precision: 0.4926153147012043, Recall: 0.4471947194719472, F1: 0.4688074386420154, PR AUC: 0.6039050170865757, ROC AUC: 0.5068854032142345
Epoch 2/4 - Val Metrics: Loss: 0.6962022102673848,  Accuracy: 0.5100044464206314, Precision: 0.49444444444444446, Recall: 0.12147406733393995, F1: 0.19503287070854639, PR AUC: 0.522610211869628, ROC AUC: 0.5013892075800134


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 3/4 - Train Metrics: Loss: 0.0003036217212677002, Accuracy: 0.5088, Precision: 0.4926739926739927, Recall: 0.4438943894389439, F1: 0.4670138888888889, PR AUC: 0.6030841910564683, ROC AUC: 0.5068850829182298
Epoch 3/4 - Val Metrics: Loss: 0.7151887495782641,  Accuracy: 0.48799466429524235, Precision: 0.488268156424581, Recall: 0.9940855323020928, F1: 0.654877866027274, PR AUC: 0.7426219310685392, ROC AUC: 0.49921667919452467


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

[I 2024-04-02 10:32:51,380] Trial 2 finished with value: 0.6549 and parameters: {'batch_size': 4, 'learning_rate': 0.0023210504217389414, 'epochs': 4, 'dropout_rate': 0.37284646626008966, 'optimizer': 'SGD', 'weight_decay': 0.0037089747420278227, 'max_len': 62, 'num_filters': 170, 'accumulation_steps': 10}. Best is trial 2 with value: 0.6549.


Epoch 4/4 - Train Metrics: Loss: 0.0003080068826675415, Accuracy: 0.5091, Precision: 0.49317520698142764, Recall: 0.4546204620462046, F1: 0.4731136631963078, PR AUC: 0.606097834513816, ROC AUC: 0.5074926844392513
Epoch 4/4 - Val Metrics: Loss: 0.7321414005491469,  Accuracy: 0.5113383726100489, Precision: 0.5, Recall: 0.0022747952684258415, F1: 0.004528985507246376, PR AUC: 0.4949124087502645, ROC AUC: 0.5000504411124739


Training:   0%|          | 0/5000 [00:00<?, ?it/s]

Validation:   0%|          | 0/2249 [00:00<?, ?it/s]

Epoch 1/5 - Train Metrics: Loss: 0.0001373710870742798, Accuracy: 0.5085, Precision: 0.49037632864119507, Recall: 0.3521039603960396, F1: 0.4098931444351062, PR AUC: 0.5782901445186173, ROC AUC: 0.5038858311296969
Epoch 1/5 - Val Metrics: Loss: 0.6960153581567106,  Accuracy: 0.5095598043574923, Precision: 0.4880239520958084, Recall: 0.07415832575068244, F1: 0.12875197472353872, PR AUC: 0.5073027885452996, ROC AUC: 0.4999052498318629


Training:   0%|          | 0/5000 [00:00<?, ?it/s]

Validation:   0%|          | 0/2249 [00:00<?, ?it/s]

Epoch 2/5 - Train Metrics: Loss: 0.00011517384052276611, Accuracy: 0.5898, Precision: 0.6056657223796034, Recall: 0.44100660066006603, F1: 0.5103843399379328, PR AUC: 0.6588361615198346, ROC AUC: 0.5854101326281695
Epoch 2/5 - Val Metrics: Loss: 0.6973158399785239,  Accuracy: 0.5048910626945309, Precision: 0.4923884514435696, Recall: 0.4267515923566879, F1: 0.45722641969290767, PR AUC: 0.5996322717889682, ROC AUC: 0.5031584048739962


Training:   0%|          | 0/5000 [00:00<?, ?it/s]

Validation:   0%|          | 0/2249 [00:00<?, ?it/s]

Epoch 3/5 - Train Metrics: Loss: 0.0001454751968383789, Accuracy: 0.6639, Precision: 0.6766452839154193, Recall: 0.5874587458745875, F1: 0.628905818703765, PR AUC: 0.7320520148950034, ROC AUC: 0.6616447456081012
Epoch 3/5 - Val Metrics: Loss: 0.7123758295553427,  Accuracy: 0.506447309915518, Precision: 0.4935672514619883, Recall: 0.38398544131028206, F1: 0.4319344933469806, PR AUC: 0.5892876847587452, ROC AUC: 0.5037318510899237


Training:   0%|          | 0/5000 [00:00<?, ?it/s]

Validation:   0%|          | 0/2249 [00:00<?, ?it/s]

Epoch 4/5 - Train Metrics: Loss: 0.00015162577629089356, Accuracy: 0.7095, Precision: 0.7169979897252624, Recall: 0.6621287128712872, F1: 0.6884718498659517, PR AUC: 0.7714633512982747, ROC AUC: 0.7081023999139044
Epoch 4/5 - Val Metrics: Loss: 0.7371494863737207,  Accuracy: 0.5046687416629613, Precision: 0.49403341288782815, Recall: 0.565059144676979, F1: 0.5271646859083192, PR AUC: 0.6358157318726659, ROC AUC: 0.5060078332080548


Training:   0%|          | 0/5000 [00:00<?, ?it/s]

Validation:   0%|          | 0/2249 [00:00<?, ?it/s]

[I 2024-04-02 10:34:52,502] Trial 3 finished with value: 0.5272 and parameters: {'batch_size': 2, 'learning_rate': 0.00012845227300457637, 'epochs': 5, 'dropout_rate': 0.1588919823308448, 'optimizer': 'AdamW', 'weight_decay': 8.366152042295279e-05, 'max_len': 91, 'num_filters': 184, 'accumulation_steps': 6}. Best is trial 2 with value: 0.6549.


Epoch 5/5 - Train Metrics: Loss: 0.00013573768138885499, Accuracy: 0.746, Precision: 0.7487068965517242, Recall: 0.7165841584158416, F1: 0.7322934232715009, PR AUC: 0.801345527483783, ROC AUC: 0.7451321413197219
Epoch 5/5 - Val Metrics: Loss: 0.7604733958754236,  Accuracy: 0.5142285460204535, Precision: 0.5031660983925962, Recall: 0.4699727024567789, F1: 0.4860032933427429, PR AUC: 0.6160714013139716, ROC AUC: 0.5132472207936069


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

Epoch 1/4 - Train Metrics: Loss: 0.00019159097834554488, Accuracy: 0.4941, Precision: 0.4737888198757764, Recall: 0.39335808580858084, F1: 0.4298433449791502, PR AUC: 0.5806234528421786, ROC AUC: 0.49112780066826556
Epoch 1/4 - Val Metrics: Loss: 0.6957558081150055,  Accuracy: 0.5017785682525567, Precision: 0.45742574257425744, Recall: 0.10509554140127389, F1: 0.1709211986681465, PR AUC: 0.49991337653645396, ROC AUC: 0.49298255330933266


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

Epoch 2/4 - Train Metrics: Loss: 0.00019250237162269084, Accuracy: 0.5486, Precision: 0.5534913516976298, Recall: 0.3564356435643564, F1: 0.43362609786700124, PR AUC: 0.6109634976309931, ROC AUC: 0.5429305547014328
Epoch 2/4 - Val Metrics: Loss: 0.6962598491112392,  Accuracy: 0.4946642952423299, Precision: 0.46113989637305697, Recall: 0.2024567788898999, F1: 0.28137843819159025, PR AUC: 0.526662721802221, ROC AUC: 0.48818491118408036


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

Epoch 3/4 - Train Metrics: Loss: 0.00015788837757808546, Accuracy: 0.6178, Precision: 0.6494755244755245, Recall: 0.4597772277227723, F1: 0.5384057971014493, PR AUC: 0.6855763760991485, ROC AUC: 0.6131378374638706
Epoch 3/4 - Val Metrics: Loss: 0.6984942397673924,  Accuracy: 0.4951089373054691, Precision: 0.4801953336950624, Recall: 0.402638762511374, F1: 0.43801039346696363, PR AUC: 0.5873708053286517, ROC AUC: 0.49305851169046966


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

[I 2024-04-02 10:36:34,059] Trial 4 finished with value: 0.438 and parameters: {'batch_size': 3, 'learning_rate': 3.770258092802565e-05, 'epochs': 4, 'dropout_rate': 0.17952350649692025, 'optimizer': 'Adam', 'weight_decay': 8.055907454474322e-05, 'max_len': 90, 'num_filters': 213, 'accumulation_steps': 1}. Best is trial 2 with value: 0.6549.


Epoch 4/4 - Train Metrics: Loss: 0.0002569407671600598, Accuracy: 0.6594, Precision: 0.6868843960601347, Recall: 0.5466171617161716, F1: 0.6087755570870663, PR AUC: 0.7266507788881532, ROC AUC: 0.6560725560133652
Epoch 4/4 - Val Metrics: Loss: 0.7038082087039947,  Accuracy: 0.4966651845264562, Precision: 0.48162583518930957, Recall: 0.3935395814376706, F1: 0.4331497245868803, PR AUC: 0.5857596758546195, ROC AUC: 0.4943784863710092


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

[I 2024-04-02 10:37:00,895] Trial 5 pruned. 


Epoch 1/5 - Train Metrics: Loss: 0.0004030269423715355, Accuracy: 0.505, Precision: 0.48911187019641333, Recall: 0.47256600660066006, F1: 0.4806966009232061, PR AUC: 0.6086889383985368, ROC AUC: 0.5040430964680319
Epoch 1/5 - Val Metrics: Loss: 0.8306300072868665,  Accuracy: 0.5113383726100489, Precision: 0.5, Recall: 0.0009099181073703367, F1: 0.0018165304268846505, PR AUC: 0.49456345171709115, ROC AUC: 0.5000201764449895


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

[I 2024-04-02 10:37:30,278] Trial 6 pruned. 


Epoch 1/4 - Train Metrics: Loss: 0.00027560744374257473, Accuracy: 0.5069, Precision: 0.49092102384598557, Recall: 0.4628712871287129, F1: 0.476483703153201, PR AUC: 0.6070961554873493, ROC AUC: 0.5056010162351638
Epoch 1/4 - Val Metrics: Loss: 0.7089282596707344,  Accuracy: 0.509115162294353, Precision: 0.49559859154929575, Recall: 0.25614194722474976, F1: 0.33773245350929815, PR AUC: 0.5576177126951598, ROC AUC: 0.5035057562210705


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 1/5 - Train Metrics: Loss: 0.00030468342304229736, Accuracy: 0.4993, Precision: 0.4797452229299363, Recall: 0.3884075907590759, F1: 0.4292716288612789, PR AUC: 0.5823264068445062, ROC AUC: 0.49602832954102866
Epoch 1/5 - Val Metrics: Loss: 0.696023204697503,  Accuracy: 0.49644286349488664, Precision: 0.4825429911412194, Recall: 0.4212920837124659, F1: 0.4498421180471217, PR AUC: 0.5933137135050996, ROC AUC: 0.4947764766388416


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 2/5 - Train Metrics: Loss: 0.00032187724113464354, Accuracy: 0.5915, Precision: 0.5953988497124281, Recall: 0.49113036303630364, F1: 0.5382615575901436, PR AUC: 0.6666146063743659, ROC AUC: 0.5885387840026239
Epoch 2/5 - Val Metrics: Loss: 0.7044007655779521,  Accuracy: 0.4946642952423299, Precision: 0.4824848201774872, Recall: 0.4699727024567789, F1: 0.47614657755243145, PR AUC: 0.6057307622064172, ROC AUC: 0.49411678601099807


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 3/5 - Train Metrics: Loss: 0.00027853398323059083, Accuracy: 0.6661, Precision: 0.6653517422748192, Recall: 0.6262376237623762, F1: 0.6452024226968441, PR AUC: 0.7363946830185977, ROC AUC: 0.6649239361047906
Epoch 3/5 - Val Metrics: Loss: 0.7290353958341811,  Accuracy: 0.49955535793686084, Precision: 0.48903599503516754, Recall: 0.5377616014558689, F1: 0.5122426868905742, PR AUC: 0.6263378822828681, ROC AUC: 0.5004025398583692


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

Epoch 4/5 - Train Metrics: Loss: 0.0002329875946044922, Accuracy: 0.705, Precision: 0.7036480686695279, Recall: 0.6763613861386139, F1: 0.6897349600336559, PR AUC: 0.768454727404071, ROC AUC: 0.7041550719512945
Epoch 4/5 - Val Metrics: Loss: 0.7541001256042057,  Accuracy: 0.5057803468208093, Precision: 0.49452474813841435, Recall: 0.513648771610555, F1: 0.5039053782637803, PR AUC: 0.6229173512484286, ROC AUC: 0.5059548205878861


Training:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/1125 [00:00<?, ?it/s]

[I 2024-04-02 10:38:54,518] Trial 7 pruned. 


Epoch 5/5 - Train Metrics: Loss: 0.0002938145160675049, Accuracy: 0.7438, Precision: 0.7412410299704517, Recall: 0.7244224422442245, F1: 0.7327352388900481, PR AUC: 0.7996317361073382, ROC AUC: 0.7432283018674539
Epoch 5/5 - Val Metrics: Loss: 0.7964179891612795,  Accuracy: 0.5, Precision: 0.48619382782891174, Recall: 0.40855323020928114, F1: 0.44400494437577254, PR AUC: 0.5918821995393276, ROC AUC: 0.4979722672785536


Training:   0%|          | 0/5000 [00:00<?, ?it/s]

Validation:   0%|          | 0/2249 [00:00<?, ?it/s]

[I 2024-04-02 10:39:18,971] Trial 8 pruned. 


Epoch 1/5 - Train Metrics: Loss: 0.00011674232482910157, Accuracy: 0.5069, Precision: 0.4892792560061999, Recall: 0.39067656765676567, F1: 0.4344534923729785, PR AUC: 0.5876779118314829, ROC AUC: 0.503471047803538
Epoch 1/5 - Val Metrics: Loss: 0.6961683302839473,  Accuracy: 0.5122276567363273, Precision: 0.532258064516129, Recall: 0.015013648771610554, F1: 0.029203539823008842, PR AUC: 0.5142983733179471, ROC AUC: 0.5012024765597183


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

Epoch 1/4 - Train Metrics: Loss: 0.00019467002128367662, Accuracy: 0.5114, Precision: 0.494424882629108, Recall: 0.34756600660066006, F1: 0.40818798449612403, PR AUC: 0.579145444614884, ROC AUC: 0.5065663883934978
Epoch 1/4 - Val Metrics: Loss: 0.6954209675788879,  Accuracy: 0.4982214317474433, Precision: 0.489579653832568, Recall: 0.6305732484076433, F1: 0.5512030224696759, PR AUC: 0.6503387899373578, ROC AUC: 0.5011561894212129


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

Epoch 2/4 - Train Metrics: Loss: 0.00021470921369963372, Accuracy: 0.5759, Precision: 0.579098253844149, Recall: 0.4583333333333333, F1: 0.5116868163500288, PR AUC: 0.6500157935887412, ROC AUC: 0.5724314182194616
Epoch 2/4 - Val Metrics: Loss: 0.6973307241201401,  Accuracy: 0.5024455313472654, Precision: 0.4852724594992636, Recall: 0.29981801637852595, F1: 0.37064116985376827, PR AUC: 0.5636212717316916, ROC AUC: 0.49795248645013257


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

Epoch 3/4 - Train Metrics: Loss: 0.00024040905958746606, Accuracy: 0.6515, Precision: 0.6678984971667898, Recall: 0.5591996699669967, F1: 0.6087347030425507, PR AUC: 0.7203990835668933, ROC AUC: 0.6487768536170386
Epoch 3/4 - Val Metrics: Loss: 0.7107180742820104,  Accuracy: 0.5013339261894175, Precision: 0.49136939010356734, Recall: 0.5828025477707006, F1: 0.5331945889698231, PR AUC: 0.6390201619117895, ROC AUC: 0.503140404320133


Training:   0%|          | 0/3334 [00:00<?, ?it/s]

Validation:   0%|          | 0/1500 [00:00<?, ?it/s]

[I 2024-04-02 10:40:36,428] Trial 9 finished with value: 0.5512 and parameters: {'batch_size': 3, 'learning_rate': 0.00011789812599160922, 'epochs': 4, 'dropout_rate': 0.14445287111635247, 'optimizer': 'AdamW', 'weight_decay': 0.00020103434318475722, 'max_len': 70, 'num_filters': 157, 'accumulation_steps': 3}. Best is trial 2 with value: 0.6549.


Epoch 4/4 - Train Metrics: Loss: 4.1125707627773e-05, Accuracy: 0.705, Precision: 0.7073863636363636, Recall: 0.6676980198019802, F1: 0.686969439728353, PR AUC: 0.768092191719172, ROC AUC: 0.7038994757394994
Epoch 4/4 - Val Metrics: Loss: 0.7286988019943237,  Accuracy: 0.5046687416629613, Precision: 0.49269717624148, Recall: 0.4604185623293904, F1: 0.4760112888052681, PR AUC: 0.6083942410062, ROC AUC: 0.5036875420342605
{'batch_size': 4, 'learning_rate': 0.0023210504217389414, 'epochs': 4, 'dropout_rate': 0.37284646626008966, 'optimizer': 'SGD', 'weight_decay': 0.0037089747420278227, 'max_len': 62, 'num_filters': 170, 'accumulation_steps': 10}


# Best hyperparams
{'batch_size': 3, 'learning_rate': 0.00019520581759930927, 'epochs': 5, 'dropout_rate': 0.16625737730659051, 'optimizer': 'AdamW', 'weight_decay': 0.0026115288105976, 'max_len': 79}

{'batch_size': 2, 'learning_rate': 5.1511113476899056e-05, 'epochs': 5, 'dropout_rate': 0.17285221381789886, 'optimizer': 'Adam', 'weight_decay': 0.00014380834458476438, 'max_len': 71}

In [43]:
def create_results_dataframe(study):
    # Create a list to hold all trial data
    trial_data = []

    # Iterate through all completed trials
    for trial in study.trials:
        # Retrieve the user attributes for the trial
        user_attrs = trial.user_attrs
        user_attrs["trial_number"] = trial.number
        user_attrs["value"] = trial.value  # The objective value (e.g., validation F1 score)

        # Append the trial data to the list
        trial_data.append(user_attrs)

    # Create a DataFrame from the list of trial data
    df = pd.DataFrame(trial_data)

    # Optionally, you might want to sort the DataFrame based on the objective value or another metric
    df = df.sort_values("value", ascending=False)

    return df

# Assuming 'study' is your Optuna study object
df_results = create_results_dataframe(study)

In [44]:
df_results.head()

Unnamed: 0,train_loss,train_accuracy,train_precision,train_recall,train_f1,train_pr_auc,train_roc_auc,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_pr_auc,val_roc_auc,trial_number,value
2,0.000308,0.5091,0.493175,0.45462,0.473114,0.606098,0.507493,0.732141,0.511338,0.5,0.002275,0.004529,0.494912,0.50005,2,0.6549
0,0.000271,0.5149,0.499588,0.375619,0.428824,0.588954,0.510791,0.694881,0.495331,0.489044,0.731119,0.586069,0.675778,0.50056,0,0.5861
9,4.1e-05,0.705,0.707386,0.667698,0.686969,0.768092,0.703899,0.728699,0.504669,0.492697,0.460419,0.476011,0.608394,0.503688,9,0.5512
3,0.000136,0.746,0.748707,0.716584,0.732293,0.801346,0.745132,0.760473,0.514229,0.503166,0.469973,0.486003,0.616071,0.513247,3,0.5272
7,0.000233,0.705,0.703648,0.676361,0.689735,0.768455,0.704155,0.7541,0.50578,0.494525,0.513649,0.503905,0.622917,0.505955,7,0.444005


# Graveyard

In [None]:
# class CNNForWord2Vec(nn.Module):
#     def __init__(self, input, embedding_dim, num_filters, filter_sizes, dropout_rate):
#         super(CNNForWord2Vec, self).__init__()
#         self.convs = nn.ModuleList([
#             nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=k) for k in filter_sizes
#         ])
#         self.dropout = nn.Dropout(dropout_rate)

#     def forward(self, x):
#         # x shape: [batch_size, max_sequence_length, embedding_dim]
#         x = x.unsqueeze(1)  # Add channel dimension: [batch_size, 1, max_sequence_length, embedding_dim]

#         # Apply convolution and ReLU. Output shape: [batch_size, num_filters, L, 1]
#         x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]

#         # Apply global average pooling. Output shape: [batch_size, num_filters]
#         x = [F.avg_pool1d(i, i.size(2)).squeeze(2) for i in x]

#         # Concatenate along the filter dimension
#         x = torch.cat(x, 1)

#         x = self.dropout(x)  # Apply dropout
#         return x


# embedding_dim = 768  # Dimension of Word2Vec embeddings
# num_filters = 100  # Number of filters per filter size
# filter_sizes = [3, 4, 5]  # Sizes of filters

# model = CNNForWord2Vec(embedding_dim, num_filters, filter_sizes,dropout_rate=0.1)

# # Example input tensor representing padded sequences of Word2Vec embeddings
# word2vec_embeddings = torch.randn(32, 65, embedding_dim)  # Example: batch_size=32, max_sequence_length=65

# # Forward pass through the model
# cnn_output = model(word2vec_embeddings)

# print("Output shape:", cnn_output.shape)
# # The output shape will be [batch_size, num_filters * len(filter_sizes)] due to the concatenation

In [None]:
# class LSTMWithGAP(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim, num_layers):
#         super(LSTMWithGAP, self).__init__()
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)

#     def forward(self, x):
#         # x shape: [batch_size, sequence_length, embedding_dim]
#         lstm_out, (hidden, cell) = self.lstm(x)
#         # lstm_out shape: [batch_size, sequence_length, hidden_dim]

#         # Apply Global Average Pooling across the sequence dimension
#         gap_out = torch.mean(lstm_out, dim=1)
#         # gap_out shape: [batch_size, hidden_dim]

#         return gap_out

# # Example usage
# embedding_dim = 768  # Dimension of Word2Vec embeddings
# hidden_dim = 128  # Hidden dimension of the LSTM
# num_layers = 2  # Number of LSTM layers

# model = LSTMWithGAP(embedding_dim, hidden_dim, num_layers)

# # Example input tensor representing padded sequences of Word2Vec embeddings
# word2vec_embeddings = torch.randn(32, 65, embedding_dim)  # Example: batch_size=32, sequence_length=65

# # Forward pass through the model
# lstm_output = model(word2vec_embeddings)

# print("Output shape:", lstm_output.shape)
# # The output shape will be [batch_size, hidden_dim] because of the Global Average Pooling

In [None]:

# # Reduce
# n_rows = 1000
# train_df = pd.concat([train_df[train_df['label']==0].sample(n=n_rows, random_state=42),train_df[train_df['label']==1].sample(n=n_rows, random_state=42)])
# val_df = pd.concat([val_df[val_df['label']==0].sample(n=n_rows, random_state=42),val_df[val_df['label']==1].sample(n=n_rows, random_state=42)])

In [None]:
# train_df.head()

In [None]:
# class BERTClass(nn.Module):
#     def __init__(self, dropout_rate):
#         super(BERTClass, self).__init__()
#         self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.pooling = nn.AdaptiveAvgPool1d(1)
#         self.l2 = lora.Linear(768, 1, r=16)  # LoRA layer
#         self.l3 = nn.Dropout(dropout_rate)

#     def forward(self, ids, mask, token_type_ids, return_embeddings=False):
#         outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
#         last_hidden_state = outputs.last_hidden_state
#         # Apply pooling across the sequence dimension (dim=1) and then squeeze the pooled output
#         pooled_output = self.pooling(last_hidden_state.transpose(1, 2)).squeeze(-1)
#         output_2 = self.l2(pooled_output)
#         output_3 = self.l3(output_2)
#         if return_embeddings:
#             return output_3, output_2, last_hidden_state

#         return output_3

# # Create a class for the dataset
# class CustomDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_len):
#         self.tokenizer = tokenizer
#         self.data = dataframe
#         self.text = dataframe.text
#         self.targets = self.data.label
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.text)

#     def __getitem__(self, index):
#         text = str(self.text[index])
#         text = " ".join(text.split())

#         inputs = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding="max_length",
#             truncation=True,
#             return_token_type_ids=True,
#             return_attention_mask=True,
#             return_tensors='pt'
#         )

#         ids = inputs['input_ids']
#         mask = inputs['attention_mask']
#         token_type_ids = inputs.get("token_type_ids", None)

#         return {
#                 'ids': ids.squeeze(),
#                 'mask': mask.squeeze(),
#                 'token_type_ids': token_type_ids.squeeze() if token_type_ids is not None else None,
#                 'targets': torch.tensor(self.targets[index], dtype=torch.float)
#             }


# # Function to calculate metrics
# def calculate_metrics(targets, outputs):
#     accuracy = accuracy_score(targets, outputs)
#     precision = precision_score(targets, outputs)
#     recall = recall_score(targets, outputs)
#     f1 = f1_score(targets, outputs)
#     precision_vals, recall_vals, _ = precision_recall_curve(targets, outputs)
#     pr_auc = auc(recall_vals, precision_vals)
#     roc_auc = roc_auc_score(targets, outputs)
#     return accuracy, precision, recall, f1, pr_auc, roc_auc


# def objective(trial):

#   parameters = {
#       'batch_size': trial.suggest_int('batch_size', 2, 4),
#       'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
#       'epochs': trial.suggest_int('epochs', 3, 5),
#       'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
#       'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5, log=True),
#       'optimizer': trial.suggest_categorical('optimizer', ['AdamW', 'Adam','SGD']),
#       'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True),
#       'max_len': trial.suggest_int('max_len', 50, 100)
#     }

#   # Set the parameters
#   batch_size = parameters['batch_size']
#   learning_rate = parameters['learning_rate']
#   epochs = parameters['epochs']
#   dropout_rate = parameters['dropout_rate']
#   max_len = parameters['max_len']
#   optimizer = parameters['optimizer']
#   weight_decay = parameters['weight_decay']
#   train_params = {'batch_size': batch_size,'shuffle': True}

#   # Instantiate tokenizer and model
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   bert_model = BertModel.from_pretrained('bert-base-uncased')

#   # Pass train and test to dataloader
#   training_set = CustomDataset(train_df, tokenizer, max_len)
#   val_set = CustomDataset(val_df, tokenizer, max_len)

#   # Create the dataloaders
#   training_loader = DataLoader(training_set, **train_params)
#   val_loader = DataLoader(val_set, **train_params)

#   # Instantiate model
#   model = BERTClass(dropout_rate)
#   lora.mark_only_lora_as_trainable(model)

#   # Move the model to the GPU
#   if torch.cuda.is_available():
#       model = model.to('cuda')

#   # Create the optimizer
#   if optimizer == 'AdamW':
#     optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   elif optimizer == 'Adam':
#     optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   elif optimizer == 'SGD':
#     optimizer = SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   else:
#     raise ValueError("Invalid optimizer")

#   # Create the loss function
#   loss_function = nn.BCEWithLogitsLoss()

#   # Instantiate pruner
#   pruner = MedianPruner()

#   # Initialize lists to store metrics
#   metrics = {
#       'train': {'loss': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []},
#       'val': {'loss':[], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}
#   }

#   gradients_stats = []

#   def collect_gradients(module, grad_input, grad_output):
#         grad_stats = {
#           'layer': module.__class__.__name__,
#           'grad_input_mean': [grad.mean().item() for grad in grad_input if grad is not None],
#           'grad_input_std': [grad.std().item() for grad in grad_input if grad is not None],
#           'grad_output_mean': [grad.mean().item() for grad in grad_output if grad is not None],
#           'grad_output_std': [grad.std().item() for grad in grad_output if grad is not None],
#         }
#         gradients_stats.append(grad_stats)

#   # Assuming `model` is already defined
#   model.l2.register_full_backward_hook(collect_gradients)

#   # Define threshold
#   threshold = 0.5

#   # Training loop with metrics calculation
#   for epoch in range(epochs):
#       model.train()
#       train_targets = []
#       train_outputs = []

#       # Training phase
#       total_train_iterations = len(training_loader)
#       for i, data in tqdm(enumerate(training_loader,0),total=total_train_iterations, desc="Training"):
#           ids = data['ids'].to('cuda', dtype=torch.long)
#           mask = data['mask'].to('cuda', dtype=torch.long)
#           token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
#           targets = data['targets'].to('cuda', dtype=torch.float)

#           # Forward pass
#           outputs = model(ids, mask, token_type_ids)
#           optimizer.zero_grad()
#           loss = loss_function(outputs, targets.unsqueeze(1))
#           loss.backward()
#           optimizer.step()
#           train_targets.extend(targets.cpu().detach().numpy().tolist())
#           train_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

#       # Save gradients for analysis
#       with open('./gradient_statistics.csv', 'w', newline='') as csvfile:
#         fieldnames = ['layer', 'grad_input_mean', 'grad_input_std', 'grad_output_mean', 'grad_output_std']
#         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#         writer.writeheader()
#         for grad_stat in gradients_stats:
#           writer.writerow(grad_stat)

#       # Calculate and store training metrics
#       train_outputs_bin = np.array(train_outputs) >= threshold
#       train_acc, train_prec, train_rec, train_f1, train_pr_auc, train_roc_auc = calculate_metrics(np.array(train_targets), train_outputs_bin)
#       train_loss = loss.item()
#       metrics['train']['loss'].append(round(train_loss,4))
#       metrics['train']['accuracy'].append(round(train_acc,4))
#       metrics['train']['precision'].append(round(train_prec,4))
#       metrics['train']['recall'].append(round(train_rec,4))
#       metrics['train']['f1'].append(round(train_f1,4))
#       metrics['train']['pr_auc'].append(round(train_pr_auc,4))
#       metrics['train']['roc_auc'].append(round(train_roc_auc,4))

#       # Validation phase
#       model.eval()
#       val_targets = []
#       val_outputs = []
#       with torch.no_grad():
#           total_val_iterations = len(val_loader)
#           for data in tqdm(val_loader, total=total_val_iterations, desc="Validation"):
#             ids = data['ids'].to('cuda', dtype=torch.long)
#             mask = data['mask'].to('cuda', dtype=torch.long)
#             token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
#             targets = data['targets'].to('cuda', dtype=torch.float)

#             # Forward pass
#             outputs = model(ids, mask, token_type_ids)
#             outputs = torch.sigmoid(outputs).squeeze()
#             val_targets.extend(targets.cpu().detach().numpy().tolist())
#             output_list = torch.sigmoid(outputs).cpu().detach().numpy().flatten().tolist()
#             val_outputs.extend(output_list)

#       # Calculate and store validation metrics
#       val_outputs_bin = np.array(val_outputs) >= threshold
#       val_acc, val_prec, val_rec, val_f1, val_pr_auc, val_roc_auc = calculate_metrics(np.array(val_targets), val_outputs_bin)
#       val_loss = loss.item()
#       metrics['val']['loss'].append(round(val_loss,4))
#       metrics['val']['accuracy'].append(round(val_acc,4))
#       metrics['val']['precision'].append(round(val_prec,4))
#       metrics['val']['recall'].append(round(val_rec,4))
#       metrics['val']['f1'].append(round(val_f1,4))
#       metrics['val']['pr_auc'].append(round(val_pr_auc,4))
#       metrics['val']['roc_auc'].append(round(val_roc_auc,4))

#       print(f"Epoch {epoch+1}/{epochs} - Train Metrics: Loss: {train_loss}, Accuracy: {train_acc}, Precision: {train_prec}, Recall: {train_rec}, F1: {train_f1}, PR AUC: {train_pr_auc}, ROC AUC: {train_roc_auc}")
#       print(f"Epoch {epoch+1}/{epochs} - Val Metrics: Loss: {val_loss},  Accuracy: {val_acc}, Precision: {val_prec}, Recall: {val_rec}, F1: {val_f1}, PR AUC: {val_pr_auc}, ROC AUC: {val_roc_auc}")
#       trial.report(val_f1, epoch)
#       if trial.should_prune():
#         raise optuna.exceptions.TrialPruned()

#       # At the end of your objective function, before returning the optimization metric
#       trial.set_user_attr("train_loss", train_loss)
#       trial.set_user_attr("train_accuracy", train_acc)
#       trial.set_user_attr("train_precision", train_prec)
#       trial.set_user_attr("train_recall", train_rec)
#       trial.set_user_attr("train_f1", train_f1)
#       trial.set_user_attr("train_pr_auc", train_pr_auc)
#       trial.set_user_attr("train_roc_auc", train_roc_auc)

#       trial.set_user_attr("val_loss", val_loss)
#       trial.set_user_attr("val_accuracy", val_acc)
#       trial.set_user_attr("val_precision", val_prec)
#       trial.set_user_attr("val_recall", val_rec)
#       trial.set_user_attr("val_f1", val_f1)
#       trial.set_user_attr("val_pr_auc", val_pr_auc)
#       trial.set_user_attr("val_roc_auc", val_roc_auc)

#   return np.max(metrics['val']['f1'])


In [None]:
# # Empty cash
# torch.cuda.empty_cache()

# # Run trials
# study = optuna.create_study(direction='maximize', pruner=MedianPruner())
# study.optimize(objective, n_trials=1)

# # Get the best hyperparameters
# best_params = study.best_params
# print(best_params)

# gr = pd.read_csv(f'./gradient_statistics.csv')
# print(gr.describe())

# def create_results_dataframe(study):
#     # Create a list to hold all trial data
#     trial_data = []

#     # Iterate through all completed trials
#     for trial in study.trials:
#         # Retrieve the user attributes for the trial
#         user_attrs = trial.user_attrs
#         user_attrs["trial_number"] = trial.number
#         user_attrs["value"] = trial.value  # The objective value (e.g., validation F1 score)

#         # Append the trial data to the list
#         trial_data.append(user_attrs)

#     # Create a DataFrame from the list of trial data
#     df = pd.DataFrame(trial_data)

#     # Optionally, you might want to sort the DataFrame based on the objective value or another metric
#     df = df.sort_values("value", ascending=False)

#     return df

# # Assuming 'study' is your Optuna study object
# df_results = create_results_dataframe(study)

In [None]:
# # Instantiate tokenizer, data and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# train_dataset = CustomDataset(train_df, tokenizer, max_len=128)  # Example max_len
# train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)  # Single batch for simplicity
# model = BERTClass(dropout_rate=0.1).to('cuda')
# model.eval()

# # Get a batch from your DataLoader
# for batch in train_loader:
#     ids, mask, token_type_ids, targets = batch['ids'].to('cuda'), batch['mask'].to('cuda'), batch['token_type_ids'].to('cuda'), batch['targets'].to('cuda')

#     # Forward pass to get raw and adapted embeddings
#     output, adapted_embeddings, raw_embeddings = model(ids, mask, token_type_ids, return_embeddings=True)

#     break



In [None]:
# raw_embeddings.size()

In [None]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# # Assuming raw_embeddings is the tensor you're trying to visualize
# embeddings_pca = PCA(n_components=2).fit_transform(raw_embeddings.detach().cpu().numpy())

# plt.figure(figsize=(10, 6))
# plt.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1])
# plt.title('PCA visualization of Embeddings')
# plt.show()



In [None]:
# # Show results
# val_outputs = np.array(val_outputs) >= threshold
# val_targets = np.array(val_targets)
# print(classification_report(val_targets, val_outputs))

# class BERTClass(nn.Module):
#     def __init__(self,dropout_rate):
#         super(BERTClass, self).__init__()
#         self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = nn.Dropout(dropout_rate)
#         self.l3 = nn.Linear(768, 1)

#     def forward(self, ids, mask, token_type_ids):
#         output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
#         output_2 = self.l2(output_1.pooler_output)
#         output = self.l3(output_2)
#         return output

In [None]:
# class BERTClass(nn.Module):
#     def __init__(self, bert_model, dropout_rate, word2vec_embeddings, num_classes=2, bert_embedding_dim=768):
#         super(BERTClass, self).__init__()
#         self.bert_model = bert_model
#         self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)
#         self.dropout = nn.Dropout(dropout_rate)
#         self.word2vec_embeddings = word2vec_embeddings
#         # Input dimension to the output layer is doubled because of concatenation
#         self.output_layer = nn.Linear(bert_embedding_dim * 2, num_classes)

#     def forward(self, input_ids, attention_mask, word2vec_embeddings):
#         # Obtain BERT embeddings
#         with torch.no_grad():
#             bert_outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
#         bert_embeddings = bert_outputs.last_hidden_state

#         # Concatenate BERT and Word2Vec embeddings along the embedding dimension
#         print('bert_embeddings:',bert_embeddings.shape)
#         print('word2vec_embeddings:',word2vec_embeddings)
#         combined_embeddings = torch.cat((bert_embeddings, word2vec_embeddings), dim=-1)

#         # Apply global average pooling across the sequence length dimension
#         pooled_embeddings = self.global_avg_pooling(combined_embeddings.permute(0, 2, 1)).squeeze(-1)

#         # Apply dropout
#         final_embeddings = self.dropout(pooled_embeddings)

#         # Pass through the output layer for final classification scores
#         output = self.output_layer(final_embeddings)

#         return output


In [None]:
# class BERTCNNClass(nn.Module):
#     def __init__(self, dropout_rate, embedding_dim, cnn_output_channels, kernel_size, bert_hidden_size):
#         super(BERTCNNClass, self).__init__()
#         self.bert = BertModel.from_pretrained('bert-base-uncased')
#         self.dropout = nn.Dropout(dropout_rate)

#         # CNN for static embeddings
#         self.cnn = nn.Conv1d(in_channels=embedding_dim, out_channels=cnn_output_channels, kernel_size=kernel_size, padding=1)

#         # Global Average Pooling
#         self.cnn_gap = nn.AdaptiveAvgPool1d(1)
#         self.bert_gap = nn.AdaptiveAvgPool1d(1)

#         # Linear layer for concatenated features
#         self.fc = nn.Linear(cnn_output_channels + bert_hidden_size, 1)

#     def forward(self, ids, mask, token_type_ids, static_embeddings):
#         # BERT path
#         bert_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
#         bert_last_hidden_state = bert_output.last_hidden_state
#         bert_gap = self.bert_gap(bert_last_hidden_state.transpose(1, 2)).squeeze(2)

#         # Prepare static embeddings for CNN
#         static_embeddings = static_embeddings.permute(0, 2, 1)

#         # Prepare for CNN
#         cnn_output = torch.relu(self.cnn(static_embeddings.transpose(1, 2)))
#         cnn_gap = self.cnn_gap(cnn_output).squeeze(2)

#         # Concatenate and final linear layer
#         concatenated_features = torch.cat([bert_gap, cnn_gap], dim=1)
#         output = self.dropout(concatenated_features)
#         output = self.fc(output)

#         return output


# # Create a class for the dataset
# class CustomDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_len):
#         self.tokenizer = tokenizer
#         self.data = dataframe
#         self.text = dataframe.text
#         self.targets = self.data.label
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.text)

#     def __getitem__(self, index):
#         text = str(self.text[index])
#         text = " ".join(text.split())

#         inputs = self.tokenizer.encode_plus(
#             text,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding="max_length",
#             truncation=True,
#             return_token_type_ids=True,
#             return_attention_mask=True,
#             return_tensors='pt'
#         )

#         ids = inputs['input_ids']
#         mask = inputs['attention_mask']
#         token_type_ids = inputs.get("token_type_ids", None)

#         return {
#                 'ids': ids.squeeze(),
#                 'mask': mask.squeeze(),
#                 'token_type_ids': token_type_ids.squeeze() if token_type_ids is not None else None,
#                 'targets': torch.tensor(self.targets[index], dtype=torch.float)
#             }


# # Function to calculate metrics
# def calculate_metrics(targets, outputs):
#     accuracy = accuracy_score(targets, outputs)
#     precision = precision_score(targets, outputs)
#     recall = recall_score(targets, outputs)
#     f1 = f1_score(targets, outputs)
#     precision_vals, recall_vals, _ = precision_recall_curve(targets, outputs)
#     pr_auc = auc(recall_vals, precision_vals)
#     roc_auc = roc_auc_score(targets, outputs)
#     return accuracy, precision, recall, f1, pr_auc, roc_auc


# def objective(trial):

#   parameters = {
#       'batch_size': trial.suggest_int('batch_size', 4, 4),
#       'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
#       'epochs': trial.suggest_int('epochs', 1, 1),
#       # trial.suggest_int('max_len', 128, 512),
#       'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True),
#       'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5, log=True),
#       'optimizer': trial.suggest_categorical('optimizer', ['AdamW', 'Adam','SGD']),
#       'weight_decay': trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True),
#       'kernel_size': trial.suggest_int('kernel_size', 3, 5),
#       'cnn_output_channels': trial.suggest_int('cnn_output_channels', 64, 256)
#     }

#   # Set the parameters
#   batch_size = parameters['batch_size']
#   learning_rate = parameters['learning_rate']
#   epochs = parameters['epochs']
#   max_len = 65 # parameters['max_len']
#   dropout_rate = parameters['dropout_rate']
#   optimizer = parameters['optimizer']
#   weight_decay = parameters['weight_decay']
#   kernel_size = parameters['kernel_size']
#   cnn_output_channels = parameters['cnn_output_channels']
#   train_params = {'batch_size': batch_size,
#                   'shuffle': True,
#                   }

#   # Pass train and test to dataloader
#   training_set = CustomDataset(train_df, tokenizer, max_len)
#   val_set = CustomDataset(val_df, tokenizer, max_len)

#   # Create the dataloaders
#   training_loader = DataLoader(training_set, **train_params)
#   val_loader = DataLoader(val_set, **train_params)

#   # Instantiate model
#   static_embeddings_padded = pad_sequence([torch.tensor(seq).clone().detach() for seq in padded_sequences], batch_first=True).to('cuda')
#   embedding_dim = 100 # static_embeddings_padded.size(1)

#   # Instantiate model
#   model = BERTCNNClass(
#     dropout_rate=dropout_rate,
#     cnn_output_channels=cnn_output_channels,
#     embedding_dim=embedding_dim,
#     kernel_size=kernel_size,
#     bert_hidden_size=768
# )

#   # Move the model to the GPU
#   if torch.cuda.is_available():
#       model = model.to('cuda')

#   # Create the optimizer
#   if optimizer == 'AdamW':
#     optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   elif optimizer == 'Adam':
#     optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   elif optimizer == 'SGD':
#     optimizer = SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
#   else:
#     raise ValueError("Invalid optimizer")

#   # Create the loss function
#   loss_function = nn.BCEWithLogitsLoss()

#   # Instantiate pruner
#   pruner = MedianPruner()

#   # Initialize lists to store metrics
#   metrics = {
#       'train': {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []},
#       'val': {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'pr_auc': [], 'roc_auc': []}
#   }

#   # Define threshold
#   threshold = 0.5

#   # Training loop with metrics calculation
#   for epoch in range(epochs):
#       model.train()
#       train_targets = []
#       train_outputs = []

#       # Training phase
#       total_train_iterations = len(training_loader)
#       for i, data in tqdm(enumerate(training_loader,0),total=total_train_iterations, desc="Training"):
#           ids = data['ids'].to('cuda', dtype=torch.long)
#           mask = data['mask'].to('cuda', dtype=torch.long)
#           token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
#           targets = data['targets'].to('cuda', dtype=torch.float)

#           # Forward pass
#           outputs = model(ids, mask, token_type_ids, static_embeddings_padded[i])
#           optimizer.zero_grad()
#           loss = loss_function(outputs, targets.unsqueeze(1))
#           loss.backward()
#           optimizer.step()
#           train_targets.extend(targets.cpu().detach().numpy().tolist())
#           train_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

#       # Calculate and store training metrics
#       train_outputs_bin = np.array(train_outputs) >= threshold
#       train_acc, train_prec, train_rec, train_f1, train_pr_auc, train_roc_auc = calculate_metrics(np.array(train_targets), train_outputs_bin)
#       metrics['train']['accuracy'].append(round(train_acc,4))
#       metrics['train']['precision'].append(round(train_prec,4))
#       metrics['train']['recall'].append(round(train_rec,4))
#       metrics['train']['f1'].append(round(train_f1,4))
#       metrics['train']['pr_auc'].append(round(train_pr_auc,4))
#       metrics['train']['roc_auc'].append(round(train_roc_auc,4))

#       # Validation phase
#       model.eval()
#       val_targets = []
#       val_outputs = []
#       with torch.no_grad():
#           total_val_iterations = len(val_loader)
#           for data in tqdm(val_loader, total=total_val_iterations, desc="Validation"):
#             ids = data['ids'].to('cuda', dtype=torch.long)
#             mask = data['mask'].to('cuda', dtype=torch.long)
#             token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
#             targets = data['targets'].to('cuda', dtype=torch.float)

#             # Forward pass
#             outputs = model(ids, mask, token_type_ids, static_embeddings_padded[i])
#             outputs = torch.sigmoid(outputs).squeeze()
#             val_targets.extend(targets.cpu().detach().numpy().tolist())
#             val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

#       # Calculate and store validation metrics
#       val_outputs_bin = np.array(val_outputs) >= threshold
#       val_acc, val_prec, val_rec, val_f1, val_pr_auc, val_roc_auc = calculate_metrics(np.array(val_targets), val_outputs_bin)
#       metrics['val']['accuracy'].append(round(val_acc,4))
#       metrics['val']['precision'].append(round(val_prec,4))
#       metrics['val']['recall'].append(round(val_rec,4))
#       metrics['val']['f1'].appendround(round(val_f1,4))
#       metrics['val']['pr_auc'].append(round(val_pr_auc,4))
#       metrics['val']['roc_auc'].append(round(val_roc_auc,4))

#       print(f"Epoch {epoch+1}/{epochs} - Train Metrics: Accuracy: {train_acc}, Precision: {train_prec}, Recall: {train_rec}, F1: {train_f1}, PR AUC: {train_pr_auc}, ROC AUC: {train_roc_auc}")
#       print(f"Epoch {epoch+1}/{epochs} - Val Metrics: Accuracy: {val_acc}, Precision: {val_prec}, Recall: {val_rec}, F1: {val_f1}, PR AUC: {val_pr_auc}, ROC AUC: {val_roc_auc}")
#       trial.report(val_f1, epoch)
#       if trial.should_prune():
#         raise optuna.exceptions.TrialPruned()

#   return np.max(metrics['val']['f1'])


In [None]:
# class CNNForWord2Vec(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, dropout_rate):
#         super(CNNForWord2Vec, self).__init__()

#         # Static word embeddings layer.
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)

#         # Convolutional layers for different filter sizes applied to the word embeddings.
#         self.convs = nn.ModuleList([
#             nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(k, embedding_dim), padding=(k - 1, 0))
#             for k in filter_sizes
#         ])

#         # Batch normalization applied to the output of convolutional layers.
#         self.conv_bn = nn.ModuleList([
#             nn.BatchNorm2d(num_filters) for _ in filter_sizes
#         ])

#         # Global average pooling applied to the output of each convolutional layer.
#         self.cnn_global_avg_pool = nn.AdaptiveAvgPool2d((1, num_filters))

#         # BERT model to obtain contextual embeddings from input tokens.
#         self.bert_embedding = BertModel.from_pretrained('bert-base-uncased')

#         # Global average pooling layer to process BERT embeddings.
#         self.bert_global_avg_pool = nn.AdaptiveAvgPool1d(1)

#         # Dropout layer for regularization.
#         self.dropout = nn.Dropout(dropout_rate)

#         # Fully connected layer for classification. Since features from both CNN and BERT embeddings are concatenated,
#         # the input features are doubled.
#         self.fc = nn.Linear(num_filters * len(filter_sizes) + embedding_dim, 1)

#     def forward(self, x, bert_input_ids, bert_attention_mask):
#         # Convert token ids to embeddings
#         x = self.embedding(x)  # [batch_size, seq_length, embedding_dim]

#         # Add a channel dimension and apply convolutional layers followed by batch normalization.
#         x = x.unsqueeze(1)  # Add channel dimension
#         x = [F.relu(bn(conv(x))) for conv, bn in zip(self.convs, self.conv_bn)]

#         # Apply global average pooling to the output of each convolutional layer and flatten the result.
#         x = [self.cnn_global_avg_pool(xi).view(xi.size(0), -1) for xi in x]
#         x = torch.cat(x, 1)  # Concatenate along the filter dimension

#         # Get BERT embeddings and apply global average pooling.
#         bert_embeddings = self.bert_embedding(input_ids=bert_input_ids, attention_mask=bert_attention_mask)['last_hidden_state']
#         x_bert = self.bert_global_avg_pool(bert_embeddings.permute(0, 2, 1)).squeeze(2)

#         # Concatenate the outputs from CNN and BERT embeddings.
#         x = torch.cat((x, x_bert), 1)

#         # Apply dropout and pass through the fully connected layer for classification.
#         x = self.dropout(x)
#         x = self.fc(x)
#         return x
