In [9]:
import os
import json

from huggingface_hub import HfApi, hf_hub_download

import numpy as np
import numpy.random as npr
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
)

# Turn on pandas copy-on-write mode for the rest of the session.
pd.set_option("mode.copy_on_write", True)

# Hugging Face Hub repository id; later cells load the model, the tokenizer,
# and the label-mapping JSON from this repository.
repo_id = "nickeubank/leaa_grant_subjects_2digits_invweighted"

# Stop right away if PyTorch cannot see a CUDA device.
assert torch.cuda.is_available()

In [None]:
# Read the intermediate parquet file and keep only the first row for each
# distinct description.
grants = pd.read_parquet(
    "../20_intermediate_data/predicted_labels_1digit_weighted.parquet"
).drop_duplicates("description")

# Load Model and Tokenizer
assert torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the fine-tuned classifier, then move its weights onto `device`.
model = BertForSequenceClassification.from_pretrained(repo_id)
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained(repo_id)

# Put the model in evaluation mode; left as the cell's last expression so the
# module repr is displayed.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [12]:
# Tokenizer truncation/padding length and number of descriptions per forward pass.
MAX_LEN = 256
BATCH_SIZE = 8

descriptions = list(grants["description"].values)

# Predicted class index (Python int) for every description, in input order.
all_predictions = []

# Pure inference: disable autograd so no computation graph or intermediate
# activations are kept for each batch. The predictions are unchanged; only
# memory use and runtime drop.
with torch.no_grad():
    for i in range(0, len(descriptions), BATCH_SIZE):
        if i % 10_000 == 0:
            print(f"starting batch {i}")

        batch = descriptions[i : i + BATCH_SIZE]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
        ).to(device)

        outputs = model(**inputs)

        # Highest-scoring class per row; `tolist()` copies to host and yields
        # plain Python ints in one call.
        predicted_classes = torch.argmax(outputs.logits, dim=1)
        all_predictions.extend(predicted_classes.tolist())

starting batch 0
starting batch 10000
starting batch 20000
starting batch 30000
starting batch 40000
starting batch 50000
starting batch 60000
starting batch 70000
starting batch 80000
starting batch 90000
starting batch 100000
starting batch 110000
starting batch 120000
starting batch 130000
starting batch 140000


In [13]:
# Load encoder
# The label -> class-index mapping is stored as JSON next to the model on the Hub.
encoder_name = "label_mapping_2digit_invweighted.json"

file_path = hf_hub_download(repo_id=repo_id, filename=encoder_name, repo_type="model")
with open(file_path, "r", encoding="utf-8") as f:
    encodings_json = json.load(f)

labels_to_fit = list(encodings_json.keys())

# Validate `.keys()` ordered correctly: the i-th key must map to class index i.
assert [encodings_json[k] for k in labels_to_fit] == list(range(len(labels_to_fit)))
label_encoder = LabelEncoder()

# Destring (note: despite its name, `labels_to_fit_str` holds floats).
labels_to_fit_str = list(map(float, labels_to_fit))
label_encoder.fit(labels_to_fit_str)

# `LabelEncoder.fit` stores the *sorted* unique labels in `classes_`, so the
# JSON ordering validated above is only honored if it matches that sort.
# Without this check a mismatch would make `inverse_transform` silently
# return the wrong label for a class index.
assert np.array_equal(label_encoder.classes_, np.asarray(labels_to_fit_str)), (
    "LabelEncoder class order does not match the class indices in "
    f"{encoder_name}; decoded labels would be wrong."
)

In [14]:
# Display how many predictions the inference loop collected.
len(all_predictions)

143664

In [15]:
# Map the predicted class indices back to label values and store them as a
# new column on `grants`.
decoded_labels = label_encoder.inverse_transform(all_predictions)
grants["predicted_label_2"] = decoded_labels

# Print the first rows with predictions next to the label columns.
preview_columns = [
    "description",
    "predicted_label_1",
    "label_1",
    "predicted_label_2",
    "label_2",
]
print(grants[preview_columns].head())

                                         description  predicted_label_1  \
0  Title: COMMUNICATIONS EQUIP.\nDescription: NO ...                  2   
1  Title: EQUIP.\nDescription: THE NEWLY CREATED ...                  4   
2  Title: NARCO EFFECTIVENESS\nDescription: SUMMA...                  1   
3  Title: ADVANCED TRAINING OF POLYGRAPH EXAMINER...                  2   
4  Title: RIOT CONTROL EQUIP.\nDescription: NO PR...                  4   

   label_1  predicted_label_2  label_2  
0      NaN               51.0      NaN  
1      NaN               41.0      NaN  
2      NaN               12.0      NaN  
3      NaN               21.0      NaN  
4      NaN               20.0      NaN  


In [16]:
# Share of rows with a non-null `label_1` whose prediction equals that label.
has_label_1 = grants["label_1"].notnull()
accuracy_1 = (
    grants.loc[has_label_1, "predicted_label_1"] == grants.loc[has_label_1, "label_1"]
).mean()
assert accuracy_1 > 0.95

# Same check for `label_2`, with a lower threshold.
has_label_2 = grants["label_2"].notnull()
accuracy_2 = (
    grants.loc[has_label_2, "predicted_label_2"] == grants.loc[has_label_2, "label_2"]
).mean()
assert accuracy_2 > 0.70

In [18]:
# Columns written to disk: the description plus predicted and original labels.
output_columns = [
    "description",
    "predicted_label_1",
    "label_1",
    "predicted_label_2",
    "label_2",
]
grants_out = grants[output_columns]

# Write the same selection twice: once as parquet, once as a Stata .dta file.
grants_out.to_parquet(
    "../20_intermediate_data/predicted_labels_1and2digit_weighted.parquet"
)
grants_out.to_stata(
    "../20_intermediate_data/predicted_labels_1and2digit_weighted.dta",
    version=117,
)