In [None]:
!pip install datasets
!pip install transformers
!pip install torch
!pip install evaluate
!pip install tqdm
!pip install numpy

In [None]:
import datasets
from transformers import AutoTokenizer
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

# Prepare the dataset

In [None]:
# Hugging Face checkpoint name; used to load both the tokenizer and the
# sequence-classification model.
pre_trained_model: str = "google/bert_uncased_L-2_H-128_A-2"
# Dataset column that is selected and renamed to "label" as the prediction target.
feature: str = "violence"
# Number of label classes; passed to ClassLabel(num_classes=...) and to the
# model as num_labels.
# NOTE(review): assumes the `feature` column holds integer-valued ratings 0..4 — confirm.
number_classes: int = 5

In [None]:
# Tokenizer for the chosen checkpoint; inputs are capped at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(pre_trained_model, model_max_length=512)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is padded to the tokenizer's maximum length and truncated
    when longer, so all rows end up with the same number of tokens.

    Args:
        examples: Batch mapping handed over by ``map(batched=True)``; only its
            "text" entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


## Hate dataset
# Fixed seed so the train/test/valid partition (and therefore the dataset
# saved to disk below) is identical on every run.
SPLIT_SEED = 42

hate_raw = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech')

# Hold out 20% of the rows ...
train_testvalid = hate_raw['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# ... and cut that hold-out in half: 80% train / 10% test / 10% valid.
# NOTE(review): if the dataset holds several rows per comment, a row-level
# split can place the same text in both train and test — confirm.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three splits in a single DatasetDict.
hate = datasets.DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

# Keep only the text and the target column, exposed under the name "label".
hate_small = hate.select_columns(["text", feature])
hate_small = hate_small.rename_column(feature, "label")

# Cast the label column to ClassLabel on every split (not only on 'train'),
# so that train, test and valid share the same label schema and dtype.
new_features = hate_small['train'].features.copy()
new_features["label"] = datasets.ClassLabel(num_classes=number_classes)
hate_small = hate_small.cast(new_features)

# Tokenize, drop the raw text, and use the column name "labels", which is the
# keyword the model is later called with.
tokenized_hate_small = hate_small.map(tokenize_function, batched=True)
tokenized_hate_small = tokenized_hate_small.remove_columns(["text"])
tokenized_hate_small = tokenized_hate_small.rename_column("label", "labels")

tokenized_hate_small.set_format("torch")

# Save tokenized datasets
tokenized_hate_small.save_to_disk('./datasets/tokenized_hate_small')

# Train the model

In [None]:
# Reload the tokenized splits written by the dataset-preparation cell.
tokenized_hate_small = datasets.load_from_disk('./datasets/tokenized_hate_small')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seed torch so DataLoader shuffling and any randomly initialised weights are
# the same on every run.
torch.manual_seed(42)

# Work on at most `max_examples` shuffled rows per split. The bound is clamped
# to the split size so a smaller split does not make select() fail.
max_examples = 10000
shuffled_train = tokenized_hate_small['train'].shuffle(seed=42)
shuffled_test = tokenized_hate_small['test'].shuffle(seed=42)
tokenized_hate_small_train = shuffled_train.select(range(min(max_examples, len(shuffled_train))))
tokenized_hate_small_test = shuffled_test.select(range(min(max_examples, len(shuffled_test))))

train_dataloader = DataLoader(tokenized_hate_small_train, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(tokenized_hate_small_test, batch_size=8)

model = AutoModelForSequenceClassification.from_pretrained(pre_trained_model, num_labels=number_classes)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear learning-rate schedule over all optimisation steps.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=1, num_training_steps=num_training_steps
)

model.to(device)

# --- Training -------------------------------------------------------------
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        batch["labels"] = batch["labels"].long()  # Convert labels to LongTensor
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
progress_bar.close()

# --- Evaluation -----------------------------------------------------------
metric = evaluate.load("accuracy")
progress_bar = tqdm(range(len(eval_dataloader)))
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    batch["labels"] = batch["labels"].long()  # Convert labels to LongTensor
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)
progress_bar.close()

# Keep and print the result: a bare `metric.compute()` in the middle of a cell
# is never displayed, because only a cell's last expression is echoed.
eval_results = metric.compute()
print(eval_results)

# Save the model
model.save_pretrained("./model")

# Use the model

In [None]:
# Classify one sentence typed by the user with the fine-tuned model.
text = input("Enter a sentence: ")

# Force inference mode here instead of relying on the state left behind by the
# training cell (an interrupted training run leaves the model in train mode,
# with dropout active).
model.eval()

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
inputs = inputs.to(device)

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# A single sentence was encoded, so there is exactly one row of logits; pick
# the highest-scoring class along the class dimension.
predicted_class_id = logits.argmax(dim=-1).item()
print("predicted_class_id: ", predicted_class_id)