In [None]:
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    PreTrainedModel,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments 
)
import torch

In [None]:
def preprocess_func(text: str) -> str:
    return text

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("pysentimiento/robertuito-sentiment-analysis")
tokenizer = AutoTokenizer.from_pretrained(
    "pysentimiento/robertuito-sentiment-analysis",
)
eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="../data",
        per_device_eval_batch_size=32
    ),
    data_collator=DataCollatorWithPadding(tokenizer, padding="longest"),
)

In [None]:
raw_inputs = [
    "I think it's getting a lot of traction.",
    "Reminds me of cosmos sdk revolution we had 😅",
    "Zora, PGN"
]
data = {
    "text": [preprocess_func(sent) for sent in raw_inputs]
}

def tokenize(batch):
    # If context is present, use it
    if "context" in batch:
        inputs = [batch["text"], batch["context"]]
    else:
        inputs = [batch["text"]]
    return tokenizer(
        *inputs, padding=False, truncation=True,
        max_length=tokenizer.model_max_length
    )

dataset = Dataset.from_dict(data)
dataset = dataset.map(tokenize, batched=True, batch_size=32)

In [None]:
output = eval_trainer.predict(dataset)
logits = torch.tensor(output.predictions)
output, logits

In [None]:
probs = torch.softmax(logits, dim=1).view(-1)
probs

In [None]:
id2label = model.config.id2label
probas = {id2label[i]: probs[i].item() for i in id2label}
probas

In [1]:
from farglot.analyzer import AnalyzerForSequenceClassification, CastAnalyzer

analyzer = AnalyzerForSequenceClassification.from_model_name("pysentimiento/robertuito-sentiment-analysis")

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'CastAnalyzer' from 'farglot.analyzer' (/Users/michaelhly/Projects/farsentimiento/.venv/lib/python3.9/site-packages/farglot/analyzer.py)

In [None]:
inputs = [
    "I think it's getting a lot of traction.",
    "Reminds me of cosmos sdk revolution we had 😅",
    "Zora, PGN"
]
probas = analyzer.predict(inputs)
probas

In [2]:
from farglot.cast_analyzer import CastAnalyzer

In [4]:
sentiment_analyzer=CastAnalyzer.sequence_analzyer_from_model_name(
    hub_address="nemes.farcaster.xyz:2283",
    model_name="pysentimiento/robertuito-sentiment-analysis"
)
sentiment_analyzer.predict_cast(fid=2, hash_hex="0bcdcbf006ec22b79f37f2cf2a09c33413883937")

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 149.01 examples/s]
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'NEG': 0.051998768001794815,
 'NEU': 0.22470703721046448,
 'POS': 0.7232941389083862}

In [5]:
sentiment_analyzer.predict_casts_by_fid(fid=2)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3894/3894 [00:00<00:00, 30235.51 examples/s]


{'NEG': 0.03734538331627846,
 'NEU': 0.505352795124054,
 'POS': 0.4573018550872803}