<a href="https://colab.research.google.com/github/nickeubank/leaa_subj/blob/main/leaa_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
)

pd.set_option("mode.copy_on_write", True)

dir = "https://github.com/nickeubank/leaa_subj/raw/refs/heads/main/"
grants = pd.read_parquet(dir + "subj_text_and_labels.parquet")


In [2]:


####
# Google drive
#####

from google.colab import drive
drive.mount('/content/gdrive/')
dir = "/content/gdrive/MyDrive/leaa/"

#########
# Split into train test and for predict
#########
grants = grants.drop_duplicates("description")
unlabeled = grants[grants["label_1"].isnull()]

# Load Model and Tokenizer
model = BertForSequenceClassification.from_pretrained(dir + "bert_grant_classifier")
tokenizer = BertTokenizer.from_pretrained(dir + "bert_grant_classifier")
label_encoder = torch.load(dir + "label_encoder.pth", weights_only=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [5]:
descriptions = list(grants["description"].values)[0:100]

all_predictions = []

for i in range(0, len(descriptions), 16):
  print(f"starting batch {i}")
  batch = descriptions[i : i + 16]

  inputs = tokenizer(
      batch,
      return_tensors="pt",
      padding="max_length",
      truncation=True,
      max_length=128,
  ).to(device)
  outputs = model(**inputs)
  predicted_classes = torch.argmax(outputs.logits, dim=1)
  batch_predictions = label_encoder.inverse_transform(predicted_classes.cpu().numpy())
  all_predictions.extend(batch_predictions)


starting batch 0
starting batch 16
starting batch 32
starting batch 48
starting batch 64
starting batch 80
starting batch 96


In [7]:
# Add the predicted labels to the 'unlabeled' DataFrame
unlabeled["predicted_label"] = all_predictions

print(unlabeled[["description", "predicted_label"]].head())


KeyError: 'predicted_label'

In [8]:
all_predictions

[np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float64(2.0),
 np.float6

In [9]:
unlabeled.to_parquet(dir + "predicted_labels_1digit.parquet")