In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split
import evaluate
from torch.utils.data import TensorDataset, DataLoader
import torch
import itertools
from tqdm import tqdm

In [None]:
device = "mps"

In [3]:
df = pd.read_csv('../../data/tweets_flagged_v2.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,harmful,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [5]:
df.groupby('harmful').size()

harmful
0    32592
1    24153
dtype: int64

In [6]:
X = df[["tweet"]]
y = df["harmful"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)


In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")



In [32]:
tokenized_train = tokenizer(list(X_train["tweet"]), padding=True, truncation=True, return_tensors="pt", max_length=256)
tokenized_eval = tokenizer(list(X_eval["tweet"]), padding=True, truncation=True, return_tensors="pt", max_length=256)
tokenized_test = tokenizer(list(X_test["tweet"]), padding=True, truncation=True, return_tensors="pt", max_length=256)

In [10]:
len(tokenized_test["input_ids"][1])

41

In [11]:
tokenized_train["label"] = list(y_train)
tokenized_eval["label"] = list(y_eval)
tokenized_test["label"] = list(y_test)

In [12]:
from datasets import Dataset
train = {"input_ids": tokenized_train["input_ids"],
         "attention_mask": tokenized_train["attention_mask"],
         "labels": tokenized_train["label"]}
train_dataset = Dataset.from_dict(train)

eval = {"input_ids": tokenized_eval["input_ids"],
         "attention_mask": tokenized_eval["attention_mask"],
         "labels": tokenized_eval["label"]}
eval_dataset = Dataset.from_dict(eval)

test = {"input_ids": tokenized_test["input_ids"],
         "attention_mask": tokenized_test["attention_mask"],
         "labels": tokenized_test["label"]}
test_dataset = Dataset.from_dict(test)

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=256)

In [106]:


accuracy = evaluate.load("accuracy")
import numpy as np
from sklearn.metrics import confusion_matrix


def compute_metrics(eval_pred, calculate_confusion_matrix=False):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    if calculate_confusion_matrix:
        cm = confusion_matrix(labels, predictions)
        return {"accuracy": acc, "confusion_matrix": cm}
    return acc

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
id2label = {0: "NOT_HARMFUL", 1: "HARMFUL"}
label2id = {"NOT_HARMFUL": 0, "HARMFUL": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
for param in model.distilbert.parameters():
    param.requires_grad = False
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    # use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  1%|▏         | 267/18158 [6:58:51<467:46:32, 94.13s/it]


  3%|▎         | 501/18158 [00:38<33:11,  8.87it/s]

{'loss': 0.2862, 'grad_norm': 0.06944984197616577, 'learning_rate': 1.944927855490693e-05, 'epoch': 0.03}


  6%|▌         | 1001/18158 [01:17<20:59, 13.62it/s]

{'loss': 0.3339, 'grad_norm': 0.13775746524333954, 'learning_rate': 1.889855710981386e-05, 'epoch': 0.06}


  8%|▊         | 1501/18158 [01:57<20:25, 13.60it/s]

{'loss': 0.3411, 'grad_norm': 0.14217278361320496, 'learning_rate': 1.8347835664720784e-05, 'epoch': 0.08}


 11%|█         | 2001/18158 [02:36<20:41, 13.01it/s]

{'loss': 0.3419, 'grad_norm': 0.11929817497730255, 'learning_rate': 1.7797114219627713e-05, 'epoch': 0.11}


 14%|█▍        | 2501/18158 [03:15<20:03, 13.01it/s]

{'loss': 0.3074, 'grad_norm': 2.5213003158569336, 'learning_rate': 1.724639277453464e-05, 'epoch': 0.14}


 17%|█▋        | 3001/18158 [03:54<20:43, 12.18it/s]

{'loss': 0.3004, 'grad_norm': 0.054651208221912384, 'learning_rate': 1.669567132944157e-05, 'epoch': 0.17}


 19%|█▉        | 3501/18158 [04:31<17:35, 13.88it/s]

{'loss': 0.3089, 'grad_norm': 0.03259854391217232, 'learning_rate': 1.61449498843485e-05, 'epoch': 0.19}


 21%|██        | 3795/18158 [04:54<17:36, 13.59it/s]

KeyboardInterrupt: 

 21%|██        | 3795/18158 [05:12<17:36, 13.59it/s]

In [93]:
model.save_pretrained('../../data/bert_model')

In [91]:

batch_size = 16
input_ids_batches = tokenized_test["input_ids"].split(batch_size)
attention_mask_batches = tokenized_test["attention_mask"].split(batch_size)
preds = []
for input_ids, attention_mask in tqdm(zip(input_ids_batches, attention_mask_batches)):
    pred = model(input_ids=input_ids.to(device), attention_mask=attention_mask.to(device))
    preds.append(pred.logits.detach().cpu().numpy())


710it [00:49, 14.32it/s]


In [101]:
preds = np.concatenate(preds)

In [107]:
compute_metrics((preds,test_dataset["labels"]), calculate_confusion_matrix=True)

{'accuracy': {'accuracy': 0.9137368931183364},
 'confusion_matrix': array([[6268,  204],
        [ 775, 4102]])}