<a href="https://colab.research.google.com/github/nickeubank/leaa_subj/blob/main/leaa_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
)

# Use pandas copy-on-write semantics for every frame in this notebook.
pd.set_option("mode.copy_on_write", True)

# Identifier handed to `from_pretrained` for both the model and the tokenizer.
repo_id = "nickeubank/leaa_grant_subjects"
# Directory prefix (trailing slash included) for the input and output parquet files.
# NOTE: this shadows the builtin `dir()`; the name is kept because later cells use it.
dir = "/hpc/group/ssri/nce8/leaa_subj/"

# Stop immediately when no CUDA device is visible.
assert torch.cuda.is_available()

# Grant descriptions together with whatever labels already exist.
grants_path = os.path.join(dir, "subj_text_and_labels.parquet")
grants = pd.read_parquet(grants_path)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#########
# Select the unlabeled grants to predict on, then load the model
#########
# Keep a single row per distinct description text.
grants = grants.drop_duplicates("description")

# Rows without a first-level label are the ones that need a prediction.
unlabeled = grants[grants["label_1"].isnull()]
# `label_1` is null on every row selected above, so this column is entirely
# missing. It is computed from `unlabeled` itself (no cross-frame index
# alignment needed) and kept so the saved frame has the same columns as before.
unlabeled["label_1_encoded"] = unlabeled["label_1"] - 1

# Load Model and Tokenizer
# A GPU is mandatory here, so the device is always CUDA; the former CPU
# fallback could never be reached after this assertion.
assert torch.cuda.is_available(), "A CUDA device is required to run prediction"
device = torch.device("cuda")

model = BertForSequenceClassification.from_pretrained(repo_id).to(device)
tokenizer = BertTokenizer.from_pretrained(repo_id)

# Evaluation mode turns off the Dropout layers so predictions are deterministic.
# Left as the last expression so the cell still displays the model summary.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [13]:
BATCH_SIZE = 16  # descriptions sent through the model per forward pass
MAX_LENGTH = 256  # token length every description is padded/truncated to
PROGRESS_EVERY = 10_000  # print a progress line roughly every this many rows

descriptions = unlabeled["description"].tolist()

# One predicted class index per description, in the same order as `unlabeled`.
all_predictions = []

# Inference only: without no_grad() every forward pass would record autograd
# state for a backward pass that never happens, wasting GPU memory and time.
with torch.no_grad():
    for i in range(0, len(descriptions), BATCH_SIZE):
        # True for the first batch starting at or after each multiple of
        # PROGRESS_EVERY, so it keeps working if BATCH_SIZE does not divide it.
        if i % PROGRESS_EVERY < BATCH_SIZE:
            print(f"starting batch {i}")

        batch = descriptions[i : i + BATCH_SIZE]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(device)

        outputs = model(**inputs)
        # Highest-scoring class for each description in the batch.
        predicted_classes = torch.argmax(outputs.logits, dim=1)
        all_predictions.extend(predicted_classes.tolist())

starting batch 0
starting batch 10000
starting batch 20000
starting batch 30000
starting batch 40000
starting batch 50000
starting batch 60000


In [14]:
# Every unlabeled row must have exactly one prediction before the labels are
# attached to the frame; fail here with a clear message if that is not the case.
assert len(all_predictions) == len(unlabeled), (
    f"expected {len(unlabeled)} predictions, got {len(all_predictions)}"
)
len(all_predictions)

63478

In [15]:
# Store each row's predicted class index alongside its description.
unlabeled = unlabeled.assign(predicted_label=all_predictions)

# Show the first few rows as a quick visual check.
preview = unlabeled.loc[:, ["description", "predicted_label"]].head()
print(preview)

                                         description  predicted_label
0  Title: COMMUNICATIONS EQUIP.\nDescription: NO ...                1
1  Title: EQUIP.\nDescription: THE NEWLY CREATED ...                3
2  Title: NARCO EFFECTIVENESS\nDescription: SUMMA...                0
3  Title: ADVANCED TRAINING OF POLYGRAPH EXAMINER...                1
4  Title: RIOT CONTROL EQUIP.\nDescription: NO PR...                1


In [16]:
# Write the unlabeled rows, now carrying their predicted labels, next to the input data.
output_path = os.path.join(dir, "predicted_labels_1digit.parquet")
unlabeled.to_parquet(output_path)