In [None]:
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    PreTrainedModel,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments 
)
import torch

In [None]:
def preprocess_func(text: str) -> str:
    """Preprocessing hook applied to each raw sentence.

    Currently a no-op: the input string is handed back untouched.

    Args:
        text: The sentence to preprocess.

    Returns:
        The same string that was passed in.
    """
    return text

In [None]:
# Hub identifier shared by the classification model and its tokenizer.
MODEL_NAME = "pysentimiento/robertuito-sentiment-analysis"

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Only the evaluation batch size is configured here.
eval_args = TrainingArguments(output_dir="../data", per_device_eval_batch_size=32)
# Sequences are tokenized without padding; the collator pads each batch to its
# longest member.
collator = DataCollatorWithPadding(tokenizer, padding="longest")

eval_trainer = Trainer(model=model, args=eval_args, data_collator=collator)

In [None]:
# Example sentences to classify.
raw_inputs = [
    "I think it's getting a lot of traction.",
    "Reminds me of cosmos sdk revolution we had 😅",
    "Zora, PGN",
]

# Column-oriented payload for the dataset: every sentence goes through the
# preprocessing hook first.
data = {"text": list(map(preprocess_func, raw_inputs))}

def tokenize(batch):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    The ``"text"` column is always encoded; when the batch also carries a
    ``"context"`` column it is passed as the second positional argument to
    the tokenizer. No padding is applied here, and sequences are truncated to
    ``tokenizer.model_max_length``.

    Args:
        batch: Mapping-like batch with a ``"text"`` entry and, optionally, a
            ``"context"`` entry.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    columns = ("text", "context") if "context" in batch else ("text",)
    return tokenizer(
        *[batch[column] for column in columns],
        padding=False,
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

# Build the dataset from the in-memory columns and tokenize it in batches of 32.
dataset = Dataset.from_dict(data).map(tokenize, batched=True, batch_size=32)

In [None]:
# Run the trainer's prediction loop over the tokenized dataset.
output = eval_trainer.predict(dataset)
# Copy the returned predictions into a torch tensor for the softmax step.
logits = torch.tensor(output.predictions)
(output, logits)

In [None]:
# Softmax along dim 1, then flatten the result into a 1-D tensor.
probs = logits.softmax(dim=1).reshape(-1)
probs

In [None]:
id2label = model.config.id2label

# `probs` holds the probabilities of *all* input sentences (it may arrive
# flattened into one dimension). Indexing it directly with the label ids would
# only ever read the first sentence's entries and silently drop the rest, so
# restore one row per sentence before pairing values with label names.
probs_per_input = probs.reshape(-1, len(id2label))

# One {label: probability} mapping per input sentence, in input order.
probas = [
    {id2label[i]: row[i].item() for i in id2label}
    for row in probs_per_input
]
probas

In [None]:
from farglot.analyzer import AnalyzerForSequenceClassification

# Build the analyzer from the same checkpoint name used for the manual
# model/tokenizer setup earlier in the notebook.
analyzer = AnalyzerForSequenceClassification.from_model_name(
    "pysentimiento/robertuito-sentiment-analysis"
)

In [None]:
# Same example sentences as in the manual pipeline above.
inputs = [
    "I think it's getting a lot of traction.",
    "Reminds me of cosmos sdk revolution we had 😅",
    "Zora, PGN",
]

# Let the analyzer handle the sentences end to end and show what it returns.
probas = analyzer.predict(inputs)
probas