<a href="https://colab.research.google.com/github/mughees-asif/rewire-task/blob/master/rewire_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [5]:
!pip install transformers==4.1.1



In [6]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModel, BertForSequenceClassification, Trainer, TrainingArguments
import warnings
warnings.filterwarnings('ignore')

## Prepare the data


* `X` = `tweet` (text)
* `y` = `hate_speech` (label)

In [7]:
# Read the labelled tweets and discard every row that contains a missing value.
df = pd.read_excel('labeled_data.xlsx').dropna()

X = df['tweet'].to_list()        # model input: tweet text
y = df['hate_speech'].to_list()  # target: label column

# Hold out 20% of all rows as the test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Split 20% off the remaining training rows for validation
# (0.2 * 0.8, i.e. roughly 16% of all rows).
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=0
)

## Load model and tokenizer 

In [8]:
# The config, tokenizer and model are all loaded from the same pretrained checkpoint.
MODEL_NAME = "bert-base-uncased"
# Number of output classes requested for the classification head.
# NOTE(review): confirm 8 matches the value range of the `hate_speech` label column.
NUM_LABELS = 8

config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# Tokenise the training and validation texts with identical settings:
# sequences are truncated at 200 tokens and padded within each call.
encode_kwargs = dict(truncation=True, padding=True, max_length=200)

train_encodings = tokenizer(train_texts, **encode_kwargs)
val_encodings = tokenizer(valid_texts, **encode_kwargs)

## PyTorch dataset object generation

In [10]:
class ClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: mapping from feature name to an indexable collection that
            holds one entry per example (only ``.items()`` and per-example
            indexing are used).
        labels: indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of labels defines the number of examples.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors; the label is stored under ``'labels'``."""
        example = {}
        for name, column in self.encodings.items():
            example[name] = torch.tensor(column[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap the tokenised splits together with their labels so they can be indexed per example.
train_dataset = ClassificationDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = ClassificationDataset(encodings=val_encodings, labels=valid_labels)

## Training

In [11]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # optimisation settings
    num_train_epochs=5,
    learning_rate=5e-6,
    per_device_train_batch_size=36,
    per_device_eval_batch_size=36,
    # evaluate on the validation set once per epoch
    do_eval=True,
    evaluation_strategy='epoch',
)

# The Trainer ties the model to the training and validation datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Kept as the last expression so the cell displays the value returned by train().
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.033936,1.511473
2,1.414906,1.258412
3,1.146113,1.094544
4,1.029573,1.014588
5,0.945959,0.989581


TrainOutput(global_step=90, training_loss=1.2768288400438097)

## Predict and evaluate

In [12]:
# Tokenise the held-out test texts with the same settings used for the
# training/validation splits (max_length=200). Truncating test inputs at a
# different length would evaluate the model on differently pre-processed text.
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=200)
test_dataset = ClassificationDataset(test_encodings, test_labels)

In [13]:
# Run the fine-tuned model over the test set.
pred = trainer.predict(test_dataset)

# The predicted class is the index of the largest score along the last axis.
y_true = pred.label_ids
y_pred = pred.predictions.argmax(-1)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.82      1.00      0.90       160
           1       0.00      0.00      0.00        20
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00         4

    accuracy                           0.82       196
   macro avg       0.20      0.25      0.22       196
weighted avg       0.67      0.82      0.73       196

