### Dependencies

In [2]:
# !pip install datasets wandb pytorch_lightning

In [3]:
# utils 
import os
import torch


# data
from datasets import load_dataset

from transformers import BertTokenizerFast, AutoConfig, AutoTokenizer, AutoModel
from transformers import RobertaConfig, RobertaTokenizerFast, RobertaModel
from transformers import XLNetTokenizerFast, XLNetConfig, XLNetForSequenceClassification, XLNetModel

# model
import torch.nn as nn
import torch.nn.functional as F

# training and evaluation
import wandb
import pytorch_lightning as pl
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from tqdm import tqdm



In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Custom Dataset

In [5]:
class HateSpeechDataset(torch.utils.data.Dataset):
    
    def __init__(self, text, label, tokenizer, max_len=200):
        self.text = text 
        self.label = label
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_dict = {
            "none":0,
            "racism":1,
            "sexism":2
        }
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = self.text[index]
        label = self.label_dict[self.label[index]]
        
        encoding = self.tokenizer.encode_plus(
            text=text,
            truncation=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_overflowing_tokens=True,
            return_attention_mask=True,
            padding='max_length'
        )
        return {
            "input_ids":torch.tensor(encoding['input_ids']).squeeze(),
            "attention_mask":torch.tensor(encoding['attention_mask']).squeeze(),
            "label":torch.tensor([label], dtype=torch.long)
        }
        

In [6]:
def data_loader(tokenizer, batch_size, root):
    
    train = load_dataset("csv", data_files=root+"train.csv")
    test = load_dataset("csv", data_files=root+"test.csv")
    
    train_dataset = HateSpeechDataset(text=train['train']['Tweets'], label=train['train']['Label'], tokenizer=tokenizer)
    test_dataset = HateSpeechDataset(text=test['train']['Tweets'], label=train['train']['Label'], tokenizer=tokenizer)
    test_dataset, val_dataset = torch.utils.data.random_split(dataset=test_dataset, lengths=[int(len(test_dataset)*0.50), len(test_dataset)-int(len(test_dataset)*0.50)])
    
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, num_workers=4, batch_size=batch_size)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, shuffle=False, num_workers=4, batch_size=batch_size)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, shuffle=False, num_workers=4, batch_size=batch_size)
    
    return train_loader, val_loader, test_loader
    

### Models

### 1. GRU

In [8]:
class GRUClassfier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, padding_idx, hidden_size=768, num_layers=1, dropout=0.10, num_classes=3):
        super(GRUClassfier, self).__init__()
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=padding_idx)
        # gru module
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
#             dropout=dropout,
            bidirectional=True
        )
        
        # full connected layer as classifier
        self.fc = nn.Sequential(*[
            nn.Linear(in_features=2*num_layers*hidden_size, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        ])
        
    def forward(self, x, hidden=None):
        
        batch_size = x.shape[0]
        
        # get the embedding
        embedded = self.embedding(x)
        
        # pass the embedding and initial hidden states to GRU (cell state will be same as hidden states) 
        _, outputs = self.gru(embedded, hidden)
        
        # outputs.shape -> [2*num_layers, batch_size, hidden_size] convert it into batch_first format
        outputs = outputs.permute(1, 0, 2)
        outputs = outputs.reshape(batch_size, -1)
#         print(outputs.shape)
        
        # last hidden states of the BidirectionalGRU will be passed to classifier will returns logits 
        logits = self.fc(outputs)
        return logits

### 2. RoBERTa 

In [None]:
class RobertaClassifier(nn.Module):
    
    def __init__(self, model_name, num_classes=3):
        super(BertClassifier, self).__init__()
        
        self.config = RobertaConfig.from_pretrained(pretrained_model_name_or_path=model_name)
        self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path=model_name, config=self.config)
        
        # full connected layer as classifier
        self.fc = nn.Sequential(*[
            nn.Linear(in_features=self.config.hidden_size, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        ])
        
    def forward(self, input_ids, attention_mask=None):
        _, pooler = self.model(input_ids, attention_mask)
        logits = self.fc(pooler)
        return logits


### 3. BERTTweet

In [7]:
class BertClassifier(nn.Module):
    
    def __init__(self, model_name, num_classes=3):
        super(BertClassifier, self).__init__()
        
        self.config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name)
        self.model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name, config=self.config)
        
        # full connected layer as classifier
        self.fc = nn.Sequential(*[
            nn.Linear(in_features=self.config.hidden_size, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        ])
        
    def forward(self, input_ids, attention_mask=None):
        _, pooler = self.model(input_ids, attention_mask)
        logits = self.fc(pooler)
        return logits


### 4. XLNet

In [7]:
class XLNetClassifier(nn.Module):
    
    def __init__(self, model_name, num_classes=3):
        super(XLNetClassifier, self).__init__()
        
        # xlnet base as feature extractor or  as contextualized embedding layer 
        self.config = XLNetConfig.from_pretrained(pretrained_model_name_or_path=model_name)
        self.base = XLNetModel.from_pretrained(pretrained_model_name_or_path=model_name, config=self.config)
        
        # gru for processing the contextualized embedding into recurrent fashion
        self.gru = nn.GRU(
            input_size=self.config.d_model,
            hidden_size=self.config.d_model,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        
        # full connected layer as classifier
        self.fc = nn.Sequential(*[
            nn.Linear(in_features=2*self.config.d_model, out_features=256),
            nn.ReLU(),
            nn.Linear(in_features=256, out_features=num_classes)
        ])
        
    def forward(self, input_ids, attention_mask=None, hidden=None):
        batch_size = input_ids.shape[0]
        
        outputs = self.base(input_ids, attention_mask)
        _, outputs = self.gru(outputs[0], hidden)
        
         # outputs.shape -> [2*num_layers, batch_size, hidden_size] convert it into batch_first format
        outputs = outputs.permute(1, 0, 2)
        outputs = outputs.reshape(batch_size, -1)
        
        logits = self.fc(outputs)
        return logits

## Training 

In [8]:
class LightningModel(pl.LightningModule):
    
    def __init__(self, model, config):
        super(LightningModel, self).__init__()
        
        self.model = model
        self.config = config
        
    def forward(self, input_ids, attention_mask):
        logits  = self.model(input_ids, attention_mask)
        return logits
    
    def configure_optimizers(self):
        return torch.optim.Adam(params=self.parameters(), lr=config['lr'])
    
    def train_dataloader(self):
        return train_loader
    
    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['label'].squeeze()
        logits = self(input_ids, attention_mask)
        loss = F.cross_entropy(logits, targets)
        acc = accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu())
        f1 = f1_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        wandb.log({"loss":loss, "accuraccy":acc, "f1_score":f1})
        return {"loss":loss, "accuraccy":acc, "f1_score":f1}
    
    def val_dataloader(self):
        return val_loader
    
    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['label'].squeeze()
        logits = self(input_ids, attention_mask)
        loss = F.cross_entropy(logits, targets)
        acc = accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu())
        f1 = f1_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        return {"val_loss":loss, "val_accuracy":torch.tensor([acc]), "val_f1":torch.tensor([f1])}
    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['val_accuracy'] for x in outputs]).mean()
        avg_f1 = torch.stack([x['val_f1'] for x in outputs]).mean()
        wandb.log({"val_loss":avg_loss, "val_accuracy":avg_acc, "val_f1":avg_f1})
        return {"val_loss":avg_loss, "val_accuracy":avg_acc, "val_f1":avg_f1}
    
    def test_dataloader(self):
        return test_loader
    
    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['label'].squeeze()
        logits = self(input_ids, attention_mask)
        loss = F.cross_entropy(logits, targets)
        acc = accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu())
        f1 = f1_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        precision = precision_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        recall = recall_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        return {"test_loss":loss, "test_precision":torch.tensor([precision]), "test_recall":torch.tensor([recall]), "test_accuracy":torch.tensor([acc]), "test_f1":torch.tensor([f1])}
    
    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['test_accuracy'] for x in outputs]).mean()
        avg_f1 = torch.stack([x['test_f1'] for x in outputs]).mean()
        avg_precision = torch.stack([x['test_precision'] for x in outputs]).mean()
        avg_recall = torch.stack([x['test_recall'] for x in outputs]).mean()
        return {"test_loss":avg_loss, "test_precision":avg_precision, "test_recall":avg_recall, "test_acc":avg_acc, "test_f1":avg_f1}

In [9]:
!mkdir ../working/models

In [10]:
config = {
    "root":"../input/hatespeechdataset/dataset/",
    "save_dir":"../working/models/",
    
    "project":"hate-speech-detection",
    "run_name":"xlnet-gru",
    
    "model_name":"xlnet-base-cased",
    "batch_size":8,
    "lr":1e-5,
    
    "monitor":"val_accuracy",
    "min_delta":0.005,
    
    "filepath":"../working/models/{epoch}-{val_accuracy:4f}",
    "precision":16,
    "epochs":10,
    
}

In [11]:
tokenizer = XLNetTokenizerFast.from_pretrained(pretrained_model_name_or_path=config["model_name"])
train_loader, val_loader, test_loader = data_loader(tokenizer=tokenizer, batch_size=config["batch_size"], root=config["root"])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=927.0, style=ProgressStyle(description_…




Using custom data configuration default


Downloading and preparing dataset csv/default-d7301296b8e5b97f (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-d7301296b8e5b97f/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-d7301296b8e5b97f/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4. Subsequent calls will reuse this data.


Using custom data configuration default


Downloading and preparing dataset csv/default-3dc5d768737e23ab (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-3dc5d768737e23ab/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-3dc5d768737e23ab/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4. Subsequent calls will reuse this data.


In [12]:
xlnet = XLNetClassifier(model_name=config["model_name"])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…






HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




In [13]:
model = LightningModel(model=xlnet, config=config)

In [14]:
logger = WandbLogger(
    name=config["run_name"],
    save_dir=config["save_dir"],
    project=config["project"],
    log_model=True,
)
early_stopping = EarlyStopping(
    monitor=config["monitor"],
    min_delta=config["min_delta"],
)
checkpoints = ModelCheckpoint(
    filepath=config["filepath"],
    monitor=config["monitor"],
    save_top_k=1
)

In [15]:
trainer = pl.Trainer(
    logger=logger,
    gpus=[0],
    checkpoint_callback=checkpoints,
    default_root_dir="../working/models/",
    max_epochs=config["epochs"],
    precision=config["precision"]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


In [16]:
trainer.fit(model)

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: wandb version 0.10.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



  | Name  | Type            | Params
------------------------------------------
0 | model | XLNetClassifier | 124 M 


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






1

In [17]:
trainer.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

  _warn_prf(average, modifier, msg_start, len(result))


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.7488, dtype=torch.float64),
 'test_f1': tensor(0.8461, dtype=torch.float64),
 'test_loss': tensor(1.2160, device='cuda:0'),
 'test_precision': tensor(1., dtype=torch.float64),
 'test_recall': tensor(0.7488, dtype=torch.float64)}
--------------------------------------------------------------------------------





[{'test_loss': 1.2159990072250366,
  'test_precision': 1.0,
  'test_recall': 0.7487562189054726,
  'test_acc': 0.7487562189054726,
  'test_f1': 0.8461492073432374}]

#### Load from Checkpoint and Test

In [18]:
print(os.listdir("../working/models/"))

['epoch=4-val_accuracy=0.769900.ckpt', 'wandb']


In [40]:
l  = torch.load(f="../working/models/epoch=4-val_accuracy=0.769900.ckpt")

In [41]:
model.load_state_dict(l['state_dict'])

<All keys matched successfully>

In [44]:
def test_fn(model, test_loader, val_loader):
    loss = []
    acc = []
    precision = []
    recall = []
    f1 = []
    model.eval()
    for batch in tqdm(test_loader):
        input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['label'].squeeze()
        logits = model(input_ids.to(device), attention_mask.to(device))
        l = F.cross_entropy(logits.to(device), targets.to(device))
        a = accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu())
        f = f1_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        p = precision_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        r = recall_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        loss.append(l.item())
        acc.append(a)
        f1.append(f)
        precision.append(p)
        recall.append(r)
    for batch in tqdm(val_loader):
        input_ids, attention_mask, targets = batch['input_ids'], batch['attention_mask'], batch['label'].squeeze()
        logits = model(input_ids.to(device), attention_mask.to(device))
        l = F.cross_entropy(logits.to(device), targets.to(device))
        a = accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu())
        f = f1_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        p = precision_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        r = recall_score(targets.cpu(), logits.argmax(dim=1).cpu(), average="weighted")
        loss.append(l.item())
        acc.append(a)
        f1.append(f)
        precision.append(p)
        recall.append(r)
        
    return {
        "accuracy":sum(acc)/len(acc),
        "precision":sum(precision)/len(precision),
        "recall":sum(recall)/len(recall),
        "f1":sum(f1)/len(f1),
        "loss":sum(loss)/len(loss)
    }
        

In [45]:
print(test_fn(model, test_loader, val_loader))

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 101/101 [00:11<00:00,  8.53it/s]
100%|██████████| 101/101 [00:11<00:00,  8.59it/s]

{'accuracy': 0.850763201320132, 'precision': 1.0, 'recall': 0.850763201320132, 'f1': 0.9162689068395092, 'loss': 0.2890760146892897}



