In [1]:
import kagglehub

# Download the Hateful Memes dataset via kagglehub and keep the returned
# location of the dataset files for the cells that follow.
DATASET_HANDLE = "parthplc/facebook-hateful-meme-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /Users/maxcap/.cache/kagglehub/datasets/parthplc/facebook-hateful-meme-dataset/versions/1


In [2]:
from datasets import load_dataset

# Each split is stored as its own JSON-lines file under <path>/data.
train_path, test_path, dev_path = (
    path + "/data/train.jsonl",
    path + "/data/test.jsonl",
    path + "/data/dev.jsonl",
)

# Map split names to their files and load them together with the "json" builder.
split_files = {"train": train_path, "test": test_path, "dev": dev_path}
dataset = load_dataset("json", data_files=split_files)


In [8]:
import torch
from transformers import DistilBertTokenizer

# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook's tokenizer.

    Intended for ``Dataset.map(..., batched=True)``, as used in this cell.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text to encode.

    Returns:
        The tokenizer output for ``examples["text"]``, with truncation enabled
        and ``padding="max_length"`` so every encoded row has the same length.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
    )
    return encoded

# Run the tokenizer over every split in batches; results are kept in a plain
# dict keyed by split name.
tokenized_dataset = {
    split_name: dataset[split_name].map(tokenize_function, batched=True)
    for split_name in ("train", "dev", "test")
}


In [19]:
from torch.utils.data import DataLoader

# The tokenized splits store `input_ids` / `attention_mask` as plain Python
# lists. PyTorch's default collate function would transpose such lists into a
# list of per-position tensors instead of one (batch, seq_len) tensor, so each
# split is exposed through a torch-formatted view first. `with_format` returns
# a new view and leaves `tokenized_dataset` itself untouched.
BATCH_SIZE = 32

# Only the training split is shuffled; dev/test keep their original order.
train_loader = DataLoader(
    tokenized_dataset["train"].with_format("torch"), shuffle=True, batch_size=BATCH_SIZE
)
dev_loader = DataLoader(
    tokenized_dataset["dev"].with_format("torch"), shuffle=False, batch_size=BATCH_SIZE
)
test_loader = DataLoader(
    tokenized_dataset["test"].with_format("torch"), shuffle=False, batch_size=BATCH_SIZE
)

In [9]:
from transformers import DistilBertForSequenceClassification
import torch

# Binary classifier (num_labels=2) on top of the pretrained DistilBERT encoder.
# The weights are loaded in the default float32 precision: fine-tuning a model
# whose parameters are themselves float16 makes the optimizer update
# half-precision weights directly, which is numerically unstable. If mixed
# precision is wanted, enable it through TrainingArguments (fp16/bf16) instead
# of casting the weights at load time.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments

# Fine-tuning configuration that is passed to the Trainer.
training_args = TrainingArguments(
    # Output location for the run.
    output_dir="./results",
    # Schedule: three epochs, evaluating once per epoch.
    num_train_epochs=3,
    eval_strategy="epoch",
    # Per-device batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation hyperparameters.
    learning_rate=2e-5,
    weight_decay=0.01,
    # fp16 mixed-precision training is switched off.
    fp16=False,
)


In [14]:
from transformers import Trainer, DataCollatorWithPadding

# Collator that assembles tokenized examples into padded batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The Trainer must be given the *tokenized* splits. The raw `dataset` only
# holds the original JSON fields and has no `input_ids` / `attention_mask`
# for the model to consume.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["dev"],
    data_collator=data_collator,
)
