## Importing Libraries

In [1]:
from datasets import *
from datasets import load_dataset

## Loading Dataset

In [2]:
# Read the local CSV through the generic "csv" loader; the rows are accessed
# through the resulting 'train' split in the cells that follow.
raw_datasets = load_dataset(
    "csv",
    data_files="Dynamically Generated Hate Dataset v0.2.3.csv",
)

In [3]:
# Show the column names and dtypes of the loaded 'train' split.
raw_datasets['train'].features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'acl.id': Value(dtype='string', id=None),
 'X1': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None),
 'type': Value(dtype='string', id=None),
 'target': Value(dtype='string', id=None),
 'level': Value(dtype='string', id=None),
 'split': Value(dtype='string', id=None),
 'round.base': Value(dtype='int64', id=None),
 'annotator': Value(dtype='string', id=None),
 'round': Value(dtype='string', id=None),
 'acl.id.matched': Value(dtype='string', id=None)}

## Preprocessing dataset for the input of the model

In [4]:
# Drop the leftover "Unnamed: 0" column. The membership check keeps this cell
# re-runnable: remove_columns raises if the column has already been removed.
if "Unnamed: 0" in raw_datasets['train'].column_names:
    raw_datasets['train'] = raw_datasets['train'].remove_columns(["Unnamed: 0"])

In [5]:
from sklearn.preprocessing import LabelEncoder

# Encode the string `label` column as integer class ids. LabelEncoder assigns
# ids in sorted order of the class names; inspect `le.classes_` for the mapping.
le = LabelEncoder()
y = le.fit_transform(raw_datasets['train']['label'])

# add_column refuses to add a column name that already exists, so only add
# 'labels' the first time this cell runs (keeps the cell re-runnable).
if 'labels' not in raw_datasets['train'].column_names:
    raw_datasets['train'] = raw_datasets['train'].add_column('labels', y)

## Transformer, tokenization and padding

In [6]:
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but are NOT
    padded here. Padding at this stage would pad every row of a map batch to
    the longest row in that batch and store the padding in the dataset;
    padding is instead applied per training batch by ``data_collator``.

    Args:
        example: A batch from ``Dataset.map(..., batched=True)``; must contain
            a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example["text"], truncation=True)


# batched=True passes many rows to the tokenizer per call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Dynamic padding: each training batch is padded to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Inspect the tokenized split: the original columns plus the tokenizer outputs.
tokenized_datasets['train']

Dataset({
    features: ['acl.id', 'X1', 'text', 'label', 'type', 'target', 'level', 'split', 'round.base', 'annotator', 'round', 'acl.id.matched', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 41144
})

##  Removing unnecessary columns

In [8]:
# Columns the model does not consume. Only those still present are dropped so
# that re-running this cell does not fail on already-removed columns.
unused_columns = ["text", "label", "target", "annotator", "acl.id", "X1", "type", "level", "round.base", "acl.id.matched", "round", "split"]
columns_to_drop = [
    name for name in unused_columns
    if name in tokenized_datasets["train"].column_names
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

# Return PyTorch tensors when rows/columns are accessed.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

## Splitting data into train, validation and test sets

In [9]:
from datasets import DatasetDict

# Fixed seed so the same rows land in each split on every run; without it the
# reported test metrics change from run to run.
SPLIT_SEED = 42

# Hold out 20% of the rows ...
train_testvalid = tokenized_datasets['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# ... then cut the hold-out in half: 10% test and 10% validation overall.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three splits into a single DatasetDict.
ds = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [10]:
# Display the assembled train/test/valid DatasetDict and each split's row count.
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 32915
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4115
    })
    valid: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4114
    })
})

## Loading the training arguments

In [11]:
import safetensors
from transformers import TrainingArguments

# Library defaults for every hyper-parameter; checkpoints go to "test-trainer".
training_args = TrainingArguments(output_dir="test-trainer")

## Using Classification model

In [12]:
from transformers import AutoModelForSequenceClassification

# Pretrained encoder from `checkpoint` plus a 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Confirm the training split that is handed to the Trainer.
ds['train']

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 32915
})

In [14]:
# Total number of rows before splitting, for comparison with the split sizes.
len(tokenized_datasets['train'])

41144

In [15]:
from transformers import Trainer

# Wire the model, arguments, data splits, collator and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds["train"],
    eval_dataset=ds["valid"],
    tokenizer=tokenizer,
)

## Fine-tuning the model. Training time: 15 hours 45 minutes; training loss of about 0.24 (24%) at step 12,000

In [16]:
# Run fine-tuning with the Trainer built above; the returned TrainOutput
# carries the final step count, average training loss and runtime metrics.
trainer.train() 

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.6269
1000,0.566
1500,0.5386
2000,0.508
2500,0.4942
3000,0.5003
3500,0.4862
4000,0.488
4500,0.428
5000,0.3871


Checkpoint destination directory test-trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test-trainer\checkpoint-3500 already exists and is non-empty.Saving will proceed

TrainOutput(global_step=12345, training_loss=0.39385226355143627, metrics={'train_runtime': 56759.3105, 'train_samples_per_second': 1.74, 'train_steps_per_second': 0.217, 'total_flos': 2.209970938669422e+16, 'train_loss': 0.39385226355143627, 'epoch': 3.0})

## Prediction on the student model

In [17]:
# Run the fine-tuned model over the held-out test split.
predictions = trainer.predict(ds["test"])
# Print the shapes of the raw model outputs and of the reference labels.
print(*(arr.shape for arr in (predictions.predictions, predictions.label_ids)))

(4115, 2) (4115,)


In [18]:
import numpy as np

# Predicted class id = index of the largest score along the last axis.
preds = predictions.predictions.argmax(axis=-1)

In [25]:
 # Compare predicted class ids with the reference labels. label_ids comes from
 # the same predict() call as the scores, so it is aligned with `preds` and is
 # already a NumPy array (no dependence on the dataset's torch format).
 print(preds, predictions.label_ids)

[1 1 1 ... 1 1 0] [0 1 1 ... 0 1 0]


In [29]:
# Reference labels for the test split. Taken from the predict() output so they
# are guaranteed to be in the same order as the predictions and already NumPy,
# rather than re-reading the column and relying on the torch format's .numpy().
y_test = predictions.label_ids

## Accuracy on the test set: about 80%

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=preds)
print(cm)
# Fraction of test examples whose predicted class matches the reference label.
accuracy_score(y_true=y_test, y_pred=preds)

[[1756  457]
 [ 349 1553]]


0.804131227217497