In [None]:
from transformers import AutoTokenizer
import torch
from transformers import logging
import pandas as pd
import src
from transformers import AutoModelForSequenceClassification

In [None]:
# Lower the transformers logger to error level so routine messages stay out
# of the notebook output.
logging.set_verbosity_error()

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(DEVICE)

In [None]:
# Revision passed to both from_pretrained calls, so tokenizer and model are
# loaded from the same fixed commit of the repository.
COMMIT_HASH = "cf44004e90045cde298e28605ff105747d58aa7a"

tokenizer = AutoTokenizer.from_pretrained(
    "luerhard/PopBERT",
    revision=COMMIT_HASH,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "luerhard/PopBERT",
    revision=COMMIT_HASH,
)

In [None]:
# Raw input table; the inference loop further down reads its "sample_id" and
# "text" columns.
sentences_path = src.PATH / "data/raw/sentences.parquet.gzip"
df = pd.read_parquet(sentences_path)

In [None]:
# Number of leading rows to score. Everything downstream (including the saved
# predictions file) only covers these rows.
# NOTE(review): 8 looks like a smoke-test limit - confirm before a real run.
# Set to None to keep every row (`df.iloc[:None]` is the full frame).
N_SAMPLES = 8
df = df.iloc[:N_SAMPLES]

In [None]:
def iter_batches(df, batch_size):
    """Yield consecutive row chunks of ``df`` as column-oriented dicts.

    Each yielded item is ``chunk.to_dict(orient="list")``: a mapping from
    column name to the list of that column's values for the chunk. Every
    chunk holds ``batch_size`` rows except possibly the last one, which holds
    the remaining rows. An empty chunk is never yielded, so an empty ``df``
    produces no batches at all.

    Args:
        df: DataFrame to split; rows are taken positionally via ``.iloc``.
        batch_size: Number of rows per chunk; must be positive.

    Yields:
        dict: ``{column_name: [values, ...]}`` for each chunk, in row order.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # range() stops before len(df), so no trailing empty slice is produced
    # when len(df) is an exact multiple of batch_size; .iloc clamps the end of
    # the final (shorter) slice automatically.
    for start in range(0, len(df), batch_size):
        yield df.iloc[start : start + batch_size].to_dict(orient="list")

In [None]:
# Rows sent through the model per forward pass.
BATCH_SIZE = 2

# The encodings are moved to DEVICE inside the loop, so the model has to live
# on the same device; otherwise the forward pass fails with a device mismatch
# whenever DEVICE is "cuda". nn.Module.to() moves the parameters in place.
model.to(DEVICE)

results = []
with torch.inference_mode():
    for batch in iter_batches(df, BATCH_SIZE):
        ids, text = batch["sample_id"], batch["text"]
        if not text:
            # Nothing to score in an empty batch; skip it instead of handing
            # an empty list to the tokenizer.
            continue

        # padding=True pads every sentence to the longest one in the batch so
        # the batch can be returned as a single tensor.
        encodings = tokenizer(text, return_tensors="pt", padding=True).to(DEVICE)
        logits = model(**encodings).logits

        # Element-wise sigmoid: every output column is mapped to (0, 1)
        # independently of the others. No .detach() is needed because nothing
        # tracks gradients inside inference_mode.
        probas = torch.sigmoid(logits).cpu().numpy()

        # NOTE(review): the column names are assigned by position; confirm
        # that they match the order of the checkpoint's output labels.
        result = pd.DataFrame(probas, columns=["elite", "pplcentr", "left", "right"])
        result["sample_id"] = ids
        results.append(result)

In [None]:
# Stack the per-batch frames into one table. Each batch frame carries its own
# 0-based index, so without ignore_index=True the combined frame would have
# repeating index labels (0, 1, 0, 1, ...) that are then persisted with the
# file; ignore_index=True replaces them with a clean 0..n-1 range.
out = pd.concat(results, ignore_index=True)

# Put the identifier first, followed by the four score columns.
out = out[["sample_id", "elite", "pplcentr", "left", "right"]]
out.to_parquet(src.PATH / "data/interim/sentence_predictions.parquet.gzip", compression="gzip")