In [1]:
! pip install transformers



In [0]:
import os
from typing import Tuple, List
from functools import partial

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertPreTrainedModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import itertools

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
url = '/content/gdrive/My Drive/jigsaw-toxic-comment-classification-challenge/'

df = pd.read_csv(os.path.join(url,'train.csv'))

The model taking too much time hence taking a fraction


In [0]:
train_df = df[:1200]

In [0]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda:0')

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [0]:
train_df, val_df = train_test_split(train_df, test_size=0.2)

In [0]:
class ToxicDataset(Dataset):
    
    def __init__(self, tokenizer: BertTokenizer, dataframe: pd.DataFrame, lazy: bool = False):
        self.tokenizer = tokenizer
        self.pad_idx = tokenizer.pad_token_id
        self.lazy = lazy
        if not self.lazy:
            self.X = []
            self.Y = []
            for i, (row) in tqdm(dataframe.iterrows()):
                x, y = self.row_to_tensor(self.tokenizer, row)
                self.X.append(x)
                self.Y.append(y)
        else:
            self.df = dataframe        
    
    @staticmethod
    def row_to_tensor(tokenizer: BertTokenizer, row: pd.Series) -> Tuple[torch.LongTensor, torch.LongTensor]:
        tokens = tokenizer.encode(row["comment_text"], add_special_tokens=True)
        if len(tokens) > 512:
            tokens = tokens[:511] + [tokens[-1]]
        x = torch.LongTensor(tokens)
        y = torch.FloatTensor(row[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]])
        return x, y
        
    
    def __len__(self):
        if self.lazy:
            return len(self.df)
        else:
            return len(self.X)

    def __getitem__(self, index: int) -> Tuple[torch.LongTensor, torch.LongTensor]:
        if not self.lazy:
            return self.X[index], self.Y[index]
        else:
            return self.row_to_tensor(self.tokenizer, self.df.iloc[index])
            

def collate_fn(batch: List[Tuple[torch.LongTensor, torch.LongTensor]], device: torch.device) \
        -> Tuple[torch.LongTensor, torch.LongTensor]:
    x, y = list(zip(*batch))
    x = pad_sequence(x, batch_first=True, padding_value=0)
    y = torch.stack(y)
    return x.to(device), y.to(device)

train_dataset = ToxicDataset(tokenizer, train_df, lazy=True)
dev_dataset = ToxicDataset(tokenizer, val_df, lazy=True)
collate_fn = partial(collate_fn, device=device)
BATCH_SIZE = 10
train_sampler = RandomSampler(train_dataset)
dev_sampler = RandomSampler(dev_dataset)
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
dev_iterator = DataLoader(dev_dataset, batch_size=BATCH_SIZE, sampler=dev_sampler, collate_fn=collate_fn)

In [0]:
class BertClassifier(nn.Module):
    
    def __init__(self, bert: BertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = nn.Linear(bert.config.hidden_size, num_classes)
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,labels=None):
        outputs = self.bert(input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids,
                               position_ids=position_ids,
                               head_mask=head_mask)
        cls_output = outputs[1] # batch, hidden
        cls_output = self.classifier(cls_output) # batch, 6
        cls_output = torch.sigmoid(cls_output)
        criterion = nn.BCELoss()
        loss = 0
        if labels is not None:
            loss = criterion(cls_output, labels)
        return loss, cls_output

model = BertClassifier(BertModel.from_pretrained('bert-base-uncased'), 6).to(device)

In [0]:
def train(model, iterator, optimizer, scheduler):
    model.train()
    total_loss = 0
    for x, y in tqdm(iterator,position=0, leave=True):
        optimizer.zero_grad()
        mask = (x != 0).float()
        loss, outputs = model(x, attention_mask=mask, labels=y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print("Train loss:", total_loss / len(iterator))

def evaluate(model, iterator):
    model.eval()
    pred = []
    true = []
    with torch.no_grad():
        total_loss = 0
        for x, y in tqdm(iterator,position=0, leave=True):
            mask = (x != 0).float()
            loss, outputs = model(x, attention_mask=mask, labels=y)
            total_loss += loss
            true += y.cpu().numpy().tolist()
            pred += outputs.cpu().numpy().tolist()
    true = np.array(true)
    pred = np.array(pred)
    for i, name in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
        print(name, "roc_auc" ,roc_auc_score(true[:, i], pred[:, i]))
    print("Evaluate loss:",  total_loss / len(iterator))

In [0]:

optimizer = AdamW(model.parameters(),lr = 2e-5,eps = 1e-8)

epochs = 2
total_steps = len(train_iterator) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)


In [22]:
for i in range(epochs):
  train(model, train_iterator, optimizer, scheduler)
  evaluate(model, dev_iterator)

  0%|          | 0/96 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (852 > 512). Running this sequence through the model will result in indexing errors
  1%|          | 1/96 [00:01<02:47,  1.76s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (648 > 512). Running this sequence through the model will result in indexing errors
  9%|▉         | 9/96 [00:09<01:33,  1.08s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
 12%|█▎        | 12/96 [00:11<01:13,  1.15it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (931 > 512). Running this sequence through the model will result in indexing errors
 14%|█▎        | 13/96 [00:13<01:32,  1.12s/it]Token indices sequence length is longer than the

Train loss: 0.2979417548825343


 12%|█▎        | 3/24 [00:00<00:04,  4.49it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1019 > 512). Running this sequence through the model will result in indexing errors
 29%|██▉       | 7/24 [00:02<00:04,  3.58it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors
 46%|████▌     | 11/24 [00:03<00:04,  2.94it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
 67%|██████▋   | 16/24 [00:05<00:02,  2.91it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (674 > 512). Running this sequence through the model will result in indexing errors
 75%|███████▌  | 18/24 [00:06<00:02,  2.89it/s]Token indices sequence length is longe

toxic roc_auc 0.9131631631631631
severe_toxic roc_auc 1.0
obscene roc_auc 0.915954415954416
threat roc_auc 0.8326359832635983
insult roc_auc 0.8874643874643875
identity_hate roc_auc 0.9705882352941176
Evaluate loss: tensor(0.1195, device='cuda:0')


  4%|▍         | 4/96 [00:02<01:11,  1.29it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (648 > 512). Running this sequence through the model will result in indexing errors
  7%|▋         | 7/96 [00:05<01:16,  1.16it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1227 > 512). Running this sequence through the model will result in indexing errors
 10%|█         | 10/96 [00:08<01:12,  1.18it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will result in indexing errors
 11%|█▏        | 11/96 [00:10<01:33,  1.10s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (950 > 512). Running this sequence through the model will result in indexing errors
 18%|█▊        | 17/96 [00:15<00:53,  1.48it/s]Token indices sequence length is longe

Train loss: 0.12685720385828367


  4%|▍         | 1/24 [00:00<00:08,  2.68it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1019 > 512). Running this sequence through the model will result in indexing errors
 21%|██        | 5/24 [00:02<00:08,  2.37it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors
 58%|█████▊    | 14/24 [00:05<00:02,  3.57it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
 75%|███████▌  | 18/24 [00:06<00:01,  3.04it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1127 > 512). Running this sequence through the model will result in indexing errors
 83%|████████▎ | 20/24 [00:07<00:01,  2.75it/s]Token indices sequence length is long

toxic roc_auc 0.9582082082082082
severe_toxic roc_auc 0.99581589958159
obscene roc_auc 0.9736467236467237
threat roc_auc 0.9414225941422594
insult roc_auc 0.9537037037037037
identity_hate roc_auc 1.0
Evaluate loss: tensor(0.0929, device='cuda:0')



