In [None]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np

# The CSV has no header row; only columns 0 and 5 are kept and are named
# `label` and `sentence` below. Half of the rows are sampled with a fixed seed.
file = "../data/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(file, encoding='ISO-8859-1', usecols=[0, 5], header=None).sample(frac=0.5, random_state=42)

df.columns = ['label', 'sentence']
# Remap the raw polarity to the class ids used later in this notebook
# (0 = negative, 2 = positive): raw value 4 becomes 2, anything else becomes 0.
# `np.long` was deprecated in NumPy 1.20 and removed in 1.24, so the mapping is
# done with a vectorised comparison and an explicit int64 cast instead.
df["label"] = (df["label"] == 4).astype("int64") * 2

print("df.shape =", df.shape)
print(f"label distribution :\n{df.label.value_counts()}")
print(df.head())

In [None]:
# Rich display of the first five rows.
df.head(n=5)

In [None]:
from datasets import Dataset

# Wrap the frame in a Hugging Face Dataset and hold out 10% for evaluation.
dataset = Dataset.from_pandas(df)
# Seed the shuffle so the train/test partition is the same on every run
# (the pandas sampling upstream is already seeded with 42).
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Tokenizer and classifier are loaded from the same fine-tuned checkpoint.
checkpoint = "finiteautomata/bertweet-base-sentiment-analysis"

# NOTE(review): normalization=True presumably enables the tokenizer's tweet
# normalisation step; confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, normalization=True)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)


In [None]:
def tokenize_function(example, max_length=128):
    """Tokenize the ``sentence`` field of a (batched) dataset example.

    Args:
        example: Mapping with a ``"sentence"`` entry, as passed by
            ``Dataset.map``.
        max_length: Upper bound on the number of tokens per sequence.
            Passing it explicitly means truncation still happens when the
            tokenizer configuration carries no maximum length of its own.
            NOTE(review): 128 follows BERTweet's documented sequence limit;
            confirm it is right for this checkpoint.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the input.
    """
    return tokenizer(example["sentence"], truncation=True, max_length=max_length)

In [None]:
# Tokenize both splits in batches; the trailing expression displays the result.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
)
tokenized_dataset

In [None]:
# Drop the index column that came along from the pandas frame.
tokenized_dataset = tokenized_dataset.remove_columns(['__index_level_0__'])

In [None]:
# The raw text is no longer needed once it has been tokenized.
tokenized_dataset = tokenized_dataset.remove_columns(['sentence'])

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer)


In [None]:
# The batches are later unpacked straight into the model call, which expects
# the target column under the name `labels`.
tokenized_dataset = tokenized_dataset.rename_column(
    original_column_name='label',
    new_column_name='labels',
)

In [None]:
# Have the dataset return torch tensors.
tokenized_dataset.set_format(type='torch')

In [None]:
# Show which columns remain on the training split.
train_split = tokenized_dataset["train"]
train_split.column_names

In [None]:
from torch.utils.data import DataLoader

# Both loaders share batch size and collator; only the training loader shuffles.
batch_size = 32
loader_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)

train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_dataset["test"], **loader_kwargs)

In [None]:
# Pull a single batch from the training loader and display each tensor's shape.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
# Smoke test: push the single batch fetched above through the model and print
# the loss together with the shape of the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
# The AdamW shipped with `transformers` is deprecated and absent from recent
# releases, so the PyTorch implementation is used instead.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the former transformers
# implementation (1e-6 and 0.0), so the optimisation hyper-parameters stay the
# same; PyTorch's own defaults are 1e-8 and 0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from transformers import get_scheduler

# Linear schedule with no warm-up, sized to the total number of optimizer steps
# (batches per epoch times epochs).
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
import torch

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Switch the model to training mode before the loop.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the device the model was moved to.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
import torch


def test_2(model, sentence, tokenizer):
    id_tolabel = {0: 'negative', 1: 'neutral', 2: 'positive'}
    modeleval = model.eval()
    tokenized = tokenizer(sentence, return_tensors='pt').to(modeleval.device)
    with torch.no_grad(): label = torch.argmax(model.forward(**tokenized).logits, dim=1)[0].cpu().item()
    return id_tolabel[label]

In [None]:
# Quick qualitative check on a single hand-written sentence.
sample_prediction = test_2(model, 'This game is lame', tokenizer)
print(sample_prediction)

In [None]:
# Accuracy on the held-out split. `datasets.load_metric` is deprecated and no
# longer present in recent `datasets` releases, so the metric is computed
# directly with torch rather than through that helper.
model.eval()
correct = 0
total = 0
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = torch.argmax(outputs.logits, dim=-1)
    correct += (predictions == batch["labels"]).sum().item()
    total += batch["labels"].numel()

# Same result shape as the former metric.compute(); NaN when the split is empty.
eval_metrics = {"accuracy": correct / total if total else float("nan")}
eval_metrics

In [None]:
# Persist the fine-tuned model and its tokenizer side by side.
output_dir = '../models/trial5'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)