## Imports

In [4]:
!pip install transformers==4.1.1

Collecting transformers==4.1.1
  Downloading transformers-4.1.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.4 MB/s 
Collecting tokenizers==0.9.4
  Downloading tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 36.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.8 MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.46 tokenizers-0.9.4 transformers-4.1.1


In [5]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModel, BertForSequenceClassification
from sklearn.metrics import classification_report
import torch

# Prepare the data


* `X` = `tweet` (text)
* `y` = `hate_speech` (label)

In [7]:
df = pd.read_excel('labeled_data.xlsx')

df = df.dropna()
X = df['tweet'].tolist()
y = df['hate_speech'].to_list()

## Creating Test dataset (20%)
train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, random_state=0, test_size=.2)
## Create Valid dataset (20%)
train_texts, valid_texts, train_labels, valid_labels = train_test_split(train_texts, train_labels, random_state=0, test_size=.2)

## Loading model and tokenizer 

In [8]:
config = AutoConfig.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=8)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=200)
val_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=200)

## PyTorch dataset object generation

In [10]:
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        # initialization
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # slicing method X[index]
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_encodings, train_labels)
valid_dataset = ClassificationDataset(val_encodings, valid_labels)

## Training

In [12]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=36,  # batch size per device during training
    per_device_eval_batch_size=36,   # batch size for evaluation
    learning_rate= 5e-6,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    do_eval=True,
    evaluation_strategy = 'epoch'
    
    
)
# training_args.evaluation_strategy = EvaluationStrategy.EPOCH
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset

)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.699948,0.748676
2,0.716133,0.722951
3,0.62035,0.718604
4,0.606346,0.715863
5,0.566295,0.712309


TrainOutput(global_step=90, training_loss=0.6256002638075087)

## Predict and evaluate

In [13]:
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=50)
test_dataset = ClassificationDataset(test_encodings, test_labels)

In [14]:
pred = trainer.predict(test_dataset)
print(classification_report(pred.label_ids, pred.predictions.argmax(-1)))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91       160
           1       0.00      0.00      0.00        20
           2       0.00      0.00      0.00        12
           3       1.00      0.25      0.40         4

    accuracy                           0.82       196
   macro avg       0.46      0.31      0.33       196
weighted avg       0.70      0.82      0.75       196



  _warn_prf(average, modifier, msg_start, len(result))
