# Import dependencies 

In [1]:
import pandas as pd
import numpy as np
import torch
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl
from torchmetrics.classification import (MultilabelF1Score, MultilabelPrecision, 
                                         MultilabelRecall, MultilabelHammingDistance, 
                                         MultilabelJaccardIndex)
from torchtext.vocab import build_vocab_from_iterator, GloVe
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger
from sklearn.metrics import classification_report
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/daniellai/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Read files Parquet 

In [2]:
path = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/data_final.parquet"
data = pd.read_parquet(path)
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150171 entries, 0 to 150170
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   title       150171 non-null  object
 1   authors     150171 non-null  object
 2   summary     150171 non-null  object
 3   published   150171 non-null  object
 4   updated     150171 non-null  object
 5   link        150171 non-null  object
 6   pdf_url     150171 non-null  object
 7   categories  150171 non-null  object
 8   target      150171 non-null  object
dtypes: object(9)
memory usage: 10.3+ MB


In [None]:
data_full = data[['title','summary','target']]
data_full.head()

Unnamed: 0,title,summary,target
0,Is Physics Sick? [In Praise of Classical Physics],"In this paper, it is argued that theoretical p...",physic
1,Modern Mathematical Physics: what it should be?,Personal view of author on goals and content o...,"math-stats,physic"
2,Topology in Physics,The phenomenon of quantum number fractionaliza...,"math-stats,physic"
3,Contents of Physics Related E-Print Archives,The frontiers of physics related e-print archi...,physic
4,Fundamental Dilemmas in Theoretical Physics,"In this paper, we argue that there are foundat...",physic


In [4]:
data_full['full_title'] = data_full['summary']
data_full.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_full['full_title'] = data_full['summary']


Unnamed: 0,title,summary,target,full_title
0,Is Physics Sick? [In Praise of Classical Physics],"In this paper, it is argued that theoretical p...",physic,"In this paper, it is argued that theoretical p..."
1,Modern Mathematical Physics: what it should be?,Personal view of author on goals and content o...,"math-stats,physic",Personal view of author on goals and content o...
2,Topology in Physics,The phenomenon of quantum number fractionaliza...,"math-stats,physic",The phenomenon of quantum number fractionaliza...
3,Contents of Physics Related E-Print Archives,The frontiers of physics related e-print archi...,physic,The frontiers of physics related e-print archi...
4,Fundamental Dilemmas in Theoretical Physics,"In this paper, we argue that there are foundat...",physic,"In this paper, we argue that there are foundat..."


In [5]:
data_full['full_title'][0]

'In this paper, it is argued that theoretical physics is more akin to an organism than to a rigid structure.It is in this sense that the epithet, "sick", applies to it. It is argued that classical physics is a model of a healthy science, and the degree of sickness of modern physics is measured accordingly. The malady is located in the relationship between mathematics and physical meaning in physical theory.'

# Pre-processing Text

In [6]:
wpt = nltk.WordPunctTokenizer()
stop_words_init = nltk.corpus.stopwords.words('english')
stop_words = [i for i in stop_words_init if i not in ('not','and','for')]
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't

In [7]:
# normalizing the text 
def normalized_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)
    tokens = wpt.tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    cleaned_text = ' '.join(filtered_tokens)

    return cleaned_text

In [8]:
data_full['full_title'] = data_full['full_title'].apply(lambda x: normalized_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_full['full_title'] = data_full['full_title'].apply(lambda x: normalized_text(x))


In [9]:
data_full.head()

Unnamed: 0,title,summary,target,full_title
0,Is Physics Sick? [In Praise of Classical Physics],"In this paper, it is argued that theoretical p...",physic,paper argued theoretical physics akin organism...
1,Modern Mathematical Physics: what it should be?,Personal view of author on goals and content o...,"math-stats,physic",personal view author goals and content mathema...
2,Topology in Physics,The phenomenon of quantum number fractionaliza...,"math-stats,physic",phenomenon quantum number fractionalization ex...
3,Contents of Physics Related E-Print Archives,The frontiers of physics related e-print archi...,physic,frontiers physics related e print archives web...
4,Fundamental Dilemmas in Theoretical Physics,"In this paper, we argue that there are foundat...",physic,paper argue foundational dilemmas theoretical ...


# Train-val-test split 

In [10]:
# Split data into train (80%), validation (10%), and test (10%)
train_val_df, test_df = train_test_split(data_full, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

In [11]:
train_texts = train_df['full_title'].tolist()
train_targets = train_df['target'].tolist()
val_texts = val_df['full_title'].tolist()
val_targets = val_df['target'].tolist()
test_texts = test_df['full_title'].tolist()
test_targets = test_df['target'].tolist()


In [12]:
# Define a simple tokenizer (splitting on whitespace)
tokenizer = lambda x: x.split()

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)
        
# Build vocabulary from training texts
vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

In [13]:
all_targets = data_full['target'].tolist()
all_labels = set()
for target in all_targets:
    all_labels.update(target.split(','))
label_to_idx = {label: idx for idx, label in enumerate(sorted(all_labels))}
print("Label Mapping:", label_to_idx)

Label Mapping: {'bio': 0, 'cs': 1, 'econ-qfin': 2, 'eess': 3, 'math-stats': 4, 'physic': 5}


# Model

In [14]:
# 2. Load GloVe Embeddings using TorchText
# -------------------------------
# This will download GloVe if not already available.
glove = GloVe(name='6B', dim=100)
# Create an embedding matrix for our vocabulary
embedding_dim = 100
vocab_size = len(vocab)
embedding_matrix = torch.randn(vocab_size, embedding_dim)
# Optionally, set the embedding for the padding token (index 0) to zeros
embedding_matrix[0] = torch.zeros(embedding_dim)
# Build the embedding matrix: for each word in our vocab, if it exists in GloVe, use its vector.
for word, idx in vocab.get_stoi().items():
    if word in glove.stoi:
        embedding_matrix[idx] = glove.vectors[glove.stoi[word]]

In [None]:
# Custom Dataset with Target Transformation
class MultiLabelDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, vocab, label_to_idx):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.label_to_idx = label_to_idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Convert text to token indices
        text = self.texts[idx]
        tokens = self.tokenizer(text)
        indices = [self.vocab[token] for token in tokens if token in self.vocab]
        if not indices:
            indices = [self.vocab['<unk>']]
        indices = torch.tensor(indices, dtype=torch.long)
        labels = self.targets[idx].split(',')
        label_vec = torch.zeros(len(self.label_to_idx))
        for label in labels:
            if label in self.label_to_idx:
                label_vec[self.label_to_idx[label]] = 1
        return indices, label_vec



In [None]:
train_dataset = MultiLabelDataset(train_texts, train_targets, tokenizer, vocab, label_to_idx)
val_dataset = MultiLabelDataset(val_texts, val_targets, tokenizer, vocab, label_to_idx)
test_dataset = MultiLabelDataset(test_texts, test_targets, tokenizer, vocab, label_to_idx)


In [17]:
def collate_fn(batch):
    sequences, labels = zip(*batch)
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return sequences_padded, labels

In [18]:
# 4. Handle Imbalance with WeightedRandomSampler
def compute_sample_weight(label_vec):
    return 1.0 / (label_vec.sum().item() + 1e-4)

sample_weights = [compute_sample_weight(train_dataset[i][1]) for i in range(len(train_dataset))]
sample_weights = np.array(sample_weights)
sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)

In [19]:
'''''''''''''''
# Count occurrences of each class
class_counts = torch.zeros(len(label_to_idx))
for _, labels in train_dataset:
    class_counts += labels  # Sum multi-hot vectors

# Avoid division by zero
class_weights = 1.0 / (class_counts + 1e-4) 


def compute_sample_weight(label_vec):
    return sum(class_weights[label_vec == 1]).item()

sample_weights = [
    compute_sample_weight(train_dataset[i][1])
    for i in range(len(train_dataset))
]
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)
'''''''''''''''

'\n# Count occurrences of each class\nclass_counts = torch.zeros(len(label_to_idx))\nfor _, labels in train_dataset:\n    class_counts += labels  # Sum multi-hot vectors\n\n# Avoid division by zero\nclass_weights = 1.0 / (class_counts + 1e-4) \n\n\ndef compute_sample_weight(label_vec):\n    return sum(class_weights[label_vec == 1]).item()\n\nsample_weights = [\n    compute_sample_weight(train_dataset[i][1])\n    for i in range(len(train_dataset))\n]\nsampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)\n'

In [20]:
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [None]:
class FocalLoss(nn.Module):
    # Could change the gamma and alpha parameters ( gamma is focus on majority class, alpha is focus on minority class try consider alpha = 0.5)
    def __init__(self, gamma= 2, alpha=0.75, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, logits, targets):
        bce_loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        probs = torch.sigmoid(logits)
        p_t = targets * probs + (1 - targets) * (1 - probs)
        focal_factor = (1 - p_t) ** self.gamma
        loss = focal_factor * bce_loss
        if self.alpha is not None:
            if isinstance(self.alpha, (float, int)):
                alpha_factor = targets * self.alpha + (1 - targets) * (1 - self.alpha)
            else:
                alpha_factor = targets * self.alpha + (1 - targets) * (1 - self.alpha)
            loss = alpha_factor * loss
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

In [None]:
# Define the LSTM Classifier Model Using GloVe Embeddings
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels, embedding_matrix, num_layers=2, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, x):
        embedded = self.embedding(x)          # (batch, seq_len, embed_dim)
        lstm_out, (h_n, _) = self.lstm(embedded)
        last_hidden = h_n[-1, :, :]           # (batch, hidden_dim)
        dropped = self.dropout(last_hidden)
        output = self.fc(dropped)             # (batch, num_labels)
        return output

In [23]:
class MultiLabelLSTM(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels, embedding_matrix, num_layers=2, learning_rate=0.0005, dropout=0.5):
        super(MultiLabelLSTM, self).__init__()
        self.model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, num_labels, embedding_matrix, num_layers, dropout)
        self.criterion = FocalLoss(gamma=3, alpha=0.5, reduction='mean')
        self.f1 = MultilabelF1Score(num_labels=num_labels, average='macro')
        self.precision = MultilabelPrecision(num_labels=num_labels, average='macro')
        self.recall = MultilabelRecall(num_labels=num_labels, average='macro')
        self.hamming = MultilabelHammingDistance(num_labels=num_labels)
        self.jaccard = MultilabelJaccardIndex(num_labels=num_labels)
        self.learning_rate = learning_rate

        self.train_losses = []  # Accumulate training losses per epoch
        self.val_losses = []    # Accumulate validation losses per epoch
        self.test_losses = []   # Accumulate test losses per epoch
        
        # Lists to store all predictions and labels for the classification report
        self.all_preds = []
        self.all_labels = []

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.train_losses.append(loss.detach())
        self.log('train_loss', loss, on_epoch=True, prog_bar=True)
        return loss

    def on_train_epoch_end(self):
        avg_loss = torch.stack(self.train_losses).mean()
        print(f"Epoch {self.current_epoch}: Training Loss: {avg_loss.item():.4f}")
        self.train_losses.clear()

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.val_losses.append(loss.detach())
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        avg_val_loss = torch.stack(self.val_losses).mean()
        print(f"Epoch {self.current_epoch}: Validation Loss: {avg_val_loss.item():.4f}")
        self.val_losses.clear()

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.test_losses.append(loss.detach())
        threshold = 0.45 # Adjust threshold as needed
        preds = (torch.sigmoid(logits) > threshold).float()  # Apply sigmoid to logits
        
        # Update metrics
        self.f1.update(preds, y)
        self.precision.update(preds, y)
        self.recall.update(preds, y)
        self.hamming.update(preds, y)
        self.jaccard.update(preds, y)
        self.log('test_loss', loss, on_epoch=True)
        
        # Accumulate predictions and labels for classification report
        self.all_preds.append(preds.detach().cpu().numpy())
        self.all_labels.append(y.detach().cpu().numpy())
        
        return loss

    def on_test_epoch_end(self):
        avg_test_loss = torch.stack(self.test_losses).mean()
        print(f"\nTest Loss: {avg_test_loss.item():.4f}")
        self.test_losses.clear()
        
        # Compute and print test metrics
        f1 = self.f1.compute()
        precision = self.precision.compute()
        recall = self.recall.compute()
        hamming = self.hamming.compute()
        jaccard = self.jaccard.compute()
        print("\n=== Classification Report ===")
        print(f"Test F1-Score: {f1:.4f}")
        print(f"Test Precision: {precision:.4f}")
        print(f"Test Recall: {recall:.4f}")
        print(f"Test Hamming Loss: {hamming:.4f}")
        print(f"Test Jaccard Score: {jaccard:.4f}")
        self.log('test_f1', f1)
        self.log('test_precision', precision)
        self.log('test_recall', recall)
        self.log('test_hamming', hamming)
        self.log('test_jaccard', jaccard)
        
        # Reset metrics for the next test epoch
        self.f1.reset()
        self.precision.reset()
        self.recall.reset()
        self.hamming.reset()
        self.jaccard.reset()
        
        # Concatenate accumulated predictions and labels
        all_preds = np.concatenate(self.all_preds, axis=0)
        all_labels = np.concatenate(self.all_labels, axis=0)
        
        # Generate and print a detailed classification report using scikit-learn
        # Here we use string representations for target names (e.g., "0", "1", ..., "num_labels-1")
        report = classification_report(
            all_labels,
            all_preds,
            target_names=[str(i) for i in range(all_labels.shape[1])],
            zero_division=0
        )
        print("\nDetailed Classification Report:\n", report)
        
        # Clear stored predictions and labels for the next test run
        self.all_preds.clear()
        self.all_labels.clear()

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=1e-4)

In [24]:
csv_logger = CSVLogger("logs", name="multilabel_model")
tb_logger = TensorBoardLogger("logs", name="multilabel_model_tensorboard")

# EarlyStopping callback: monitor 'val_loss'
early_stop_callback = EarlyStopping(
    monitor='val_loss',
    min_delta=0.00,
    patience=1,
    verbose=True,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='epoch')

trainer = pl.Trainer(
    max_epochs=8,
    accelerator='mps' if torch.backends.mps.is_available() else 'cpu',
    devices=1,
    logger=[csv_logger, tb_logger],
    log_every_n_steps=1,
    callbacks=[early_stop_callback, lr_monitor]
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [25]:
model = MultiLabelLSTM(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    hidden_dim=512,
    num_labels=len(label_to_idx),
    embedding_matrix=embedding_matrix,
    num_layers=2,
    learning_rate=0.0005,
    dropout=0.5
)

trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)


  | Name      | Type                      | Params | Mode 
----------------------------------------------------------------
0 | model     | LSTMClassifier            | 14.0 M | train
1 | criterion | FocalLoss                 | 0      | train
2 | f1        | MultilabelF1Score         | 0      | train
3 | precision | MultilabelPrecision       | 0      | train
4 | recall    | MultilabelRecall          | 0      | train
5 | hamming   | MultilabelHammingDistance | 0      | train
6 | jaccard   | MultilabelJaccardIndex    | 0      | train
----------------------------------------------------------------
14.0 M    Trainable params
0         Non-trainable params
14.0 M    Total params
55.890    Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Epoch 0: Validation Loss: 0.0429


/opt/anaconda3/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/opt/anaconda3/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0: Validation Loss: 0.0194


Metric val_loss improved. New best score: 0.019


Epoch 0: Training Loss: 0.0224


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.014


Epoch 1: Validation Loss: 0.0142
Epoch 1: Training Loss: 0.0125


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.014


Epoch 2: Validation Loss: 0.0138
Epoch 2: Training Loss: 0.0100


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.013


Epoch 3: Validation Loss: 0.0134
Epoch 3: Training Loss: 0.0087


Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 1 records. Best score: 0.013. Signaling Trainer to stop.


Epoch 4: Validation Loss: 0.0145
Epoch 4: Training Loss: 0.0077


/opt/anaconda3/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]


Test Loss: 0.0146

=== Classification Report ===
Test F1-Score: 0.7834
Test Precision: 0.7988
Test Recall: 0.7704
Test Hamming Loss: 0.0741
Test Jaccard Score: 0.6528

Detailed Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.73      0.74      1027
           1       0.87      0.89      0.88      5819
           2       0.79      0.81      0.80      1284
           3       0.68      0.56      0.61       960
           4       0.81      0.78      0.80      5458
           5       0.90      0.85      0.87      5468

   micro avg       0.84      0.82      0.83     20016
   macro avg       0.80      0.77      0.78     20016
weighted avg       0.84      0.82      0.83     20016
 samples avg       0.89      0.87      0.86     20016



[{'test_loss': 0.014599100686609745,
  'test_f1': 0.7834431529045105,
  'test_precision': 0.798846423625946,
  'test_recall': 0.7704260945320129,
  'test_hamming': 0.0741332620382309,
  'test_jaccard': 0.6527856588363647}]

# Back-test

In [None]:
# 9. Back-Test with Example Sentences
def predict(sentence, tokenizer, vocab, label_to_idx, model, threshold=0.45):
    model.eval()
    with torch.no_grad():
        tokens = tokenizer(sentence)
        indices = [vocab[token] for token in tokens if token in vocab]
        if not indices:
            indices = [vocab['<unk>']]
        indices = torch.tensor(indices, dtype=torch.long).unsqueeze(0)  # Add batch dimension
        logits = model(indices)
        # Apply sigmoid to convert logits to probabilities and then threshold
        probs = torch.sigmoid(logits)
        preds = (probs > threshold).float().squeeze(0)
        predicted_labels = [label for label, idx in label_to_idx.items() if preds[idx] == 1]
        return predicted_labels

In [30]:
example_sentences = [
    "Attention is all you need The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.",
    "Vector autogressions (VARs) are widely applied when it comes to modeling and forecasting macroeconomic variables. In high dimensions, however, they are prone to overfitting. Bayesian methods, more concretely shrinkage priors, have shown to be successful in improving prediction performance. In the present paper, we introduce the semi-global framework, in which we replace the traditional global shrinkage parameter with group-specific shrinkage parameters. We show how this framework can be applied to various shrinkage priors, such as global-local priors and stochastic search variable selection priors. We demonstrate the virtues of the proposed framework in an extensive simulation study and in an empirical application forecasting data of the US economy. Further, we shed more light on the ongoing ``Illusion of Sparsity'' debate, finding that forecasting performances under sparse/dense priors vary across evaluated economic variables and across time frames. Dynamic model averaging, however, can combine the merits of both worlds.",
    "Classification can be performed using either a discriminative or a generative learning approach. Discriminative learning consists of constructing the conditional probability of the outputs given the inputs, while generative learning consists of constructing the joint probability density of the inputs and outputs. Although most classical and quantum methods are discriminative, there are some advantages of the generative learning approach. For instance, it can be applied to unsupervised learning, statistical inference, uncertainty estimation, and synthetic data generation. In this article, we present a quantum generative multiclass classification strategy, called quantum generative classification (QGC). This model uses a variational quantum algorithm to estimate the joint probability density function of features and labels of a data set by means of a mixed quantum state. We also introduce a quantum map called quantum-enhanced Fourier features (QEFF), which leverages quantum superposition to prepare high-dimensional data samples in quantum hardware using a small number of qubits. We show that the quantum generative classification algorithm can be viewed as a Gaussian mixture that reproduces a kernel Hilbert space of the training data. In addition, we developed a hybrid quantum-classical neural network that shows that it is possible to perform generative classification on high-dimensional data sets. The method was tested on various low- and high-dimensional data sets including the 10-class MNIST and Fashion-MNIST data sets, illustrating that the generative classification strategy is competitive against other previous quantum models.",
    "Research on human skin anatomy reveals its complex multi-scale, multi-phase nature, with up to 70% of its composition being bounded and free water. Fluid movement plays a key role in the skin's mechanical and biological responses, influencing its time-dependent behavior and nutrient transport.Poroelastic modeling is a promising approach for studying skin dynamics across scales by integrating multi-physics processes. This paper introduces a biology hierarchical two-compartment model capturing fluid distribution in the interstitium and micro-circulation. A theoretical framework is developed with a biphasic interstitium -- distinguishing interstitial fluid and non-structural cells -- and analyzed through a one-dimensional consolidation test of a column. This biphasic approach allows separate modeling of cell and fluid motion, considering their differing characteristic times. An appendix discusses extending the model to include biological exchanges like oxygen transport. Preliminary results indicate that cell viscosity introduces a second characteristic time, and at high viscosity and short time scales, cells behave similarly to solids.A simplified model was used to replicate an experimental campaign on short time scales. Local pressure (up to 31 kPa) was applied to dorsal finger skin using a laser Doppler probe PF801 (Perimed Sweden), following a setup described in Fromy Brain Res (1998). The model qualitatively captured ischemia and post-occlusive reactive hyperemia, aligning with experimental data.All numerical simulations used the open-source software FEniCSx v0.9.0. To ensure transparency and reproducibility, anonymized experimental data and finite element codes are publicly available on GitHub.",
    "Currency arbitrage capitalizes on price discrepancies in currency exchange rates between markets to produce profits with minimal risk. By employing a combinatorial optimization problem, one can ascertain optimal paths within directed graphs, thereby facilitating the efficient identification of profitable trading routes. This research investigates the methodologies of quantum annealing and gate-based quantum computing in relation to the currency arbitrage problem. In this study, we implement the Quantum Approximate Optimization Algorithm (QAOA) utilizing Qiskit version 1.2. In order to optimize the parameters of QAOA, we perform simulations utilizing the AerSimulator and carry out experiments in simulation. Furthermore, we present an NchooseK-based methodology utilizing D-Wave's Ocean suite. This methodology enables a comparison of the effectiveness of quantum techniques in identifying optimal arbitrage paths. The results of our study enhance the existing literature on the application of quantum computing in financial optimization challenges, emphasizing both the prospective benefits and the present limitations of these developing technologies in real-world scenarios.",
    "Despite advances in methods to interrogate tumor biology, the observational and population-based approach of classical cancer research and clinical oncology does not enable anticipation of tumor outcomes to hasten the discovery of cancer mechanisms and personalize disease management. To address these limitations, individualized cancer forecasts have been shown to predict tumor growth and therapeutic response, inform treatment optimization, and guide experimental efforts. These predictions are obtained via computer simulations of mathematical models that are constrained with data from a patient's cancer and experiments. This book chapter addresses the validation of these mathematical models to forecast tumor growth and treatment response. We start with an overview of mathematical modeling frameworks, model selection techniques, and fundamental metrics. We then describe the usual strategies employed to validate cancer forecasts in preclinical and clinical scenarios. Finally, we discuss existing barriers in validating these predictions along with potential strategies to address them."]

In [31]:
print("\n=== Back-Testing Results ===")
for sentence in example_sentences:
    predicted_labels = predict(sentence, tokenizer, vocab, label_to_idx, model)
    print(f"Sentence: {sentence[:80]}...")
    print(f"Predicted Labels: {predicted_labels}\n")


=== Back-Testing Results ===
Sentence: Attention is all you need The dominant sequence transduction models are based on...
Predicted Labels: ['cs']

Sentence: Vector autogressions (VARs) are widely applied when it comes to modeling and for...
Predicted Labels: ['econ-qfin']

Sentence: Classification can be performed using either a discriminative or a generative le...
Predicted Labels: ['cs', 'physic']

Sentence: Research on human skin anatomy reveals its complex multi-scale, multi-phase natu...
Predicted Labels: ['bio', 'physic']

Sentence: Currency arbitrage capitalizes on price discrepancies in currency exchange rates...
Predicted Labels: ['econ-qfin', 'math-stats']

Sentence: Despite advances in methods to interrogate tumor biology, the observational and ...
Predicted Labels: ['bio']

