In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

In [2]:
# Checkpoint identifier; later cells pass this string to `from_pretrained`.
# NOTE(review): the name `model` is rebound to the loaded model object in a
# later cell, so cells that expect the checkpoint string (e.g. the tokenizer
# load) cannot be re-run after that point without re-running this cell first.
model = "tomh/toxigen_roberta"

In [3]:
# Read the raw IMDB review CSV (path is relative to the working directory)
# and preview the first rows.
csv_path = "IMDB Dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Encode the sentiment string as a binary target: 1 for "positive", 0 for any
# other value. A vectorized comparison replaces the per-row Python lambda and
# yields the same values with an int64 dtype.
df["Category"] = df["sentiment"].eq("positive").astype("int64")
df.head()

Unnamed: 0,review,sentiment,Category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [5]:
# Sanity check on the frame's dimensions as (rows, columns).
df.shape

(50000, 3)

In [6]:
# Switch to the `text` / `label` column names that the tokenization and
# training cells below read.
column_renames = {"review": "text", "Category": "label"}
df = df.rename(columns=column_renames)

In [7]:
# Class balance check: number of rows per label value.
df["label"].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

In [8]:
# Fixed seed so the train/validation/test partition (and every result that
# depends on it) is the same on each Restart & Run All.
SEED = 42

# Hold out 20% of the rows for testing, then 20% of the remainder for
# validation (64% / 16% / 20% overall). Stratifying on the label keeps the
# class balance the same in all three splits.
train_val_df, X_test = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["label"]
)
X_train, X_val = train_test_split(
    train_val_df, test_size=0.2, random_state=SEED, stratify=train_val_df["label"]
)

In [9]:
# Preview the training split. A bare last expression uses the notebook's rich
# DataFrame display instead of the plain-text dump that print() produces.
X_train.head()

                                                    text sentiment  label
39724  Don't waste time reading my review. Go out and...  positive      1
28842  It does touch a few interesting points.. But! ...  negative      0
37823  In a future where an industrious travel agency...  negative      0
45542  I love this movie, though I don't like how the...  positive      1
21026  Othello, the classic Shakespearen story of lov...  positive      1


In [10]:
# Convert each pandas split into a `datasets.Dataset`.
# preserve_index=False: after the shuffled split the frames carry a
# non-default index, which would otherwise be stored as an extra
# `__index_level_0__` column in every dataset.
train_ds = Dataset.from_pandas(X_train, split="train", preserve_index=False)
val_ds = Dataset.from_pandas(X_val, split="validation", preserve_index=False)
test_ds = Dataset.from_pandas(X_test, split="test", preserve_index=False)

In [11]:
# Load the tokenizer that belongs to the checkpoint named in `model`
# (at this point `model` is still the checkpoint string).
tokenizer = AutoTokenizer.from_pretrained(model)

In [12]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings when
        called with ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [13]:
# Tokenize the three splits in batches, in train / validation / test order.
tokenized_train, tokenized_val, tokenized_test = (
    split_ds.map(preprocess_function, batched=True)
    for split_ds in (train_ds, val_ds, test_ds)
)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [14]:
# Batch collator built from the tokenizer. Tokenization above only truncates,
# so padding is left to this collator when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# Load the "accuracy" metric through the `evaluate` library; compute_metrics
# below calls its `.compute(...)` method.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` holds
            per-class scores along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        against ``labels``.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class for every example.
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [16]:
# Class index -> human-readable name; the reverse map is derived from it so the
# two can never drift apart.
id2label = {
    0: "negative",
    1: "positive",
}
label2id = {name: idx for idx, name in id2label.items()}

In [17]:
# Resolve the checkpoint id so this cell is safe to re-run. On the first run
# `model` is still the checkpoint string, but this cell rebinds `model` to the
# loaded network, so a second run would otherwise hand a model object to
# `from_pretrained`.
checkpoint = model if isinstance(model, str) else model.name_or_path

# Load the checkpoint with a sequence-classification head sized to the label
# map, and attach the sentiment label names to its config.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [18]:
training_args = TrainingArguments(
    output_dir="toxigen_output",
    # NOTE(review): 1e-4 looks high for fine-tuning a pretrained encoder — confirm.
    learning_rate=1e-4,
    # The recorded run of trainer.train() failed with a CUDA out-of-memory
    # error. Use a smaller micro-batch and accumulate gradients so the
    # effective train batch stays 4 * 4 = 16 per device while peak activation
    # memory drops. NOTE(review): confirm this fits the target GPU.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    # The string "none" disables logging integrations. Passing None does not:
    # it falls back to the library default of reporting to every installed
    # integration.
    report_to="none",
)

In [19]:
# Assemble the Trainer: training runs on the tokenized train split and
# evaluation on the tokenized validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [20]:
# Run fine-tuning with the arguments configured above.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory
# RuntimeError, so no training results were produced in the saved run.
trainer.train()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Run the trainer on the held-out tokenized test split and display the
# `metrics` attribute of the returned prediction output.
# NOTE(review): this cell has no execution count, i.e. it was not run in the
# saved notebook.
preds_output = trainer.predict(tokenized_test)
preds_output.metrics