<a href="https://colab.research.google.com/github/nickeubank/leaa_subj/blob/main/leaa_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
)

pd.set_option("mode.copy_on_write", True)

repo_id = "nickeubank/leaa_grant_subjects"
dir = "/hpc/group/ssri/nce8/leaa_subj/"

assert torch.cuda.is_available()

grants = pd.read_parquet(dir + "subj_text_and_labels.parquet")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#########
# Split into train test and for predict
#########
grants = grants.drop_duplicates("description")
unlabeled = grants[grants["label_1"].isnull()]
unlabeled["label_1_encoded"] = grants["label_1"] - 1

# Load Model and Tokenizer

assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(repo_id).to(device)
tokenizer = BertTokenizer.from_pretrained(repo_id)

model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [34]:
descriptions = list(grants["description"].values)

all_predictions = []

for i in range(0, len(descriptions), 16):
    if i % 10_000 == 0:
        print(f"starting batch {i}")

    batch = descriptions[i : i + 16]

    inputs = tokenizer(
        batch,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128,
    ).to(device)

    outputs = model(**inputs)
    predicted_classes = torch.argmax(outputs.logits, dim=1)
    formatted_to_list = list(map(lambda x: x.item(), predicted_classes))
    all_predictions.extend(formatted_to_list)

starting batch 0


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 10.57 GiB of which 3.12 MiB is free. Including non-PyTorch memory, this process has 3.89 GiB memory in use. Process 2592298 has 6.67 GiB memory in use. Of the allocated memory 3.50 GiB is allocated by PyTorch, and 217.42 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [33]:
list(map(lambda x: x.item(), predicted_classes))

[2, 2, 0, 0, 0, 1, 0, 2, 0, 0, 1, 3, 2, 1, 2, 0]

In [31]:
# Add the predicted labels to the 'unlabeled' DataFrame
unlabeled["predicted_label"] = all_predictions

print(unlabeled[["description", "predicted_label"]].head())

ValueError: Length of values (0) does not match length of index (63478)

In [9]:
unlabeled.to_parquet(dir + "predicted_labels_1digit.parquet")