In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import evaluate
from torch.utils.data import TensorDataset, DataLoader
import torch
import itertools
from tqdm import tqdm

The dataset comprises a collection of tweets, each annotated to indicate whether it includes harmful content. The label '1' signifies harmful content, while '0' denotes content that is not harmful. To proceed, download the dataset and employ it within your .ipynb (Jupyter Notebook) environment to train and refine your model.

https://assets.ctfassets.net/x78yjrjc11pq/2A603tt1UJpHU2mE0eHeRz/c7802d1e577eaac4621f358a2c180ca5/tweets_flagged_v2.csv

Here is a preview of the dataset structure:

The 'Text' column contains the tweet text.
The 'Target' column contains the label, where '1' corresponds to "harmful" and '0' corresponds to "not harmful".

In [3]:
# device = "mps"
device = "cuda"

In [4]:
df = pd.read_csv('tweets_flagged_v2.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,harmful,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [6]:
df.groupby('harmful').size()

harmful
0    32592
1    24153
dtype: int64

In [7]:
X = df[["tweet"]]
y = df["harmful"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)


In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
tokenized_train = tokenizer(list(X_train["tweet"]), padding=True, truncation=True, return_tensors="pt", max_length=256)
tokenized_eval = tokenizer(list(X_eval["tweet"]), padding=True, truncation=True, return_tensors="pt", max_length=256)
tokenized_test = tokenizer(list(X_test["tweet"]), padding=True, truncation=True, return_tensors="pt", max_length=256)

In [11]:
len(tokenized_test["input_ids"][1])

256

In [12]:
tokenized_train["label"] = list(y_train)
tokenized_eval["label"] = list(y_eval)
tokenized_test["label"] = list(y_test)

In [13]:
from datasets import Dataset
train = {"input_ids": tokenized_train["input_ids"],
         "attention_mask": tokenized_train["attention_mask"],
         "labels": tokenized_train["label"]}
train_dataset = Dataset.from_dict(train)

eval = {"input_ids": tokenized_eval["input_ids"],
         "attention_mask": tokenized_eval["attention_mask"],
         "labels": tokenized_eval["label"]}
eval_dataset = Dataset.from_dict(eval)

test = {"input_ids": tokenized_test["input_ids"],
         "attention_mask": tokenized_test["attention_mask"],
         "labels": tokenized_test["label"]}
test_dataset = Dataset.from_dict(test)

In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=256)

In [15]:


accuracy = evaluate.load("accuracy")
import numpy as np
from sklearn.metrics import confusion_matrix


def compute_metrics(eval_pred, calculate_confusion_matrix=False):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    if calculate_confusion_matrix:
        cm = confusion_matrix(labels, predictions)
        return {"accuracy": acc, "confusion_matrix": cm}
    return acc

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
id2label = {0: "NOT_HARMFUL", 1: "HARMFUL"}
label2id = {"NOT_HARMFUL": 0, "HARMFUL": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
! pip list | grep accelerate

accelerate                       0.30.1


In [22]:
for param in model.distilbert.parameters():
    param.requires_grad = False
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    push_to_hub=False,
    # use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Step,Training Loss,Validation Loss,Accuracy
500,0.2986,0.280884,0.88315
1000,0.2934,0.271318,0.886454
1500,0.2704,0.26438,0.889648
2000,0.2755,0.261053,0.891079
2500,0.2722,0.258807,0.890529
3000,0.2653,0.258922,0.89196
3500,0.275,0.256315,0.892731
4000,0.2699,0.255763,0.892841
4500,0.2716,0.255719,0.892621




TrainOutput(global_step=4540, training_loss=0.27664879492200944, metrics={'train_runtime': 1225.2523, 'train_samples_per_second': 59.279, 'train_steps_per_second': 3.705, 'total_flos': 4810686049591296.0, 'train_loss': 0.27664879492200944, 'epoch': 2.0})

In [23]:
model.save_pretrained('../../data/bert_model')

In [24]:

batch_size = 16
input_ids_batches = tokenized_test["input_ids"].split(batch_size)
attention_mask_batches = tokenized_test["attention_mask"].split(batch_size)
preds = []
for input_ids, attention_mask in tqdm(zip(input_ids_batches, attention_mask_batches)):
    pred = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
    preds.append(pred.logits.detach().cpu().numpy())


710it [01:22,  8.66it/s]


In [25]:
preds = np.concatenate(preds)

In [26]:
compute_metrics((preds,test_dataset["labels"]), calculate_confusion_matrix=True)

{'accuracy': {'accuracy': 0.8880077539871354},
 'confusion_matrix': array([[5872,  600],
        [ 671, 4206]])}