In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, RobertaForSequenceClassification
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import torch.utils.data as data_utils
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

In [2]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [4]:
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
# download label mapping
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]
labels

['negative', 'neutral', 'positive']

In [66]:
MODEL = "FacebookAI/roberta-base"
model = RobertaForSequenceClassification.from_pretrained(
    MODEL, num_labels=3, problem_type="multi_label_classification")
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [92]:
train_text_file = "train_text.txt"
with open(train_text_file, "r") as f:
    texts = f.readlines()

train_label_file = "train_labels.txt"
with open(train_label_file, "r") as f:
    labels = f.readlines()

len(texts), len(labels)

texts, labels = texts[:100], labels[:100]



In [None]:
encoded_inputs = tokenizer([ preprocess(t.strip()) for t in texts], return_tensors='pt',   padding=True,
    truncation=True)
labels = [int(labels[i].strip()) for i in range(len(labels))]
labels = torch.tensor(labels, dtype=torch.int)
len(encoded_inputs), len(labels)

(2, 100)

In [94]:
dataset = data_utils.TensorDataset(encoded_inputs["input_ids"], encoded_inputs["attention_mask"], labels)
test_dataloader = data_utils.DataLoader(dataset, batch_size=10, shuffle=True)



In [None]:

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
text = "Good night ðŸ˜Š"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)


In [10]:
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) positive 0.8466
2) neutral 0.1458
3) negative 0.0076


### Tentativo di training

In [3]:
import numpy as np
import evaluate

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, concatenate_datasets

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='macro')


dataset = load_dataset('tweet_eval', 'sentiment')




In [5]:
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sep2022'  # change to desired model from the hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# augment train set with test set, for downstream apps only - DO NOT EVALUATE ON TEST
# tokenized_datasets['train+test'] = concatenate_datasets([tokenized_datasets['train'],
#                                                          tokenized_datasets['test']])

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)


training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=1e-5,
    per_device_train_batch_size=16,   # modern name
    per_device_eval_batch_size=16,    # modern name
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_ratio=0.1,

    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",

    load_best_model_at_end=True,
    metric_for_best_model="recall",
    greater_is_better=True,
    report_to="none",
)

metric = evaluate.load('recall')  # default metric for sentiment dataset is recall (macro)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.create_model_card()
trainer.save_model('saved_model')




Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 45615/45615 [00:10<00:00, 4346.61 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 12284/12284 [00:03<00:00, 3758.76 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2000/2000 [00:00<00:00, 4820.04 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sep2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


: 

In [1]:
torch.cuda.is_available()


NameError: name 'torch' is not defined