In [128]:
# !pip3 install huggingface
# !pip3 install huggingface_hub
# !pip3 install datasets
# !pip3 install kaggle
# !pip3 install pandas
# !pip3 install matplotlib
# !pip3 install transformers

# Kaggle 

In [129]:
from zipfile import ZipFile
import os
import shutil
from zipfile import ZipFile

from kaggle.api.kaggle_api_extended import KaggleApi

# Local layout: the downloaded archive is kept in DATA_DIR and its contents
# are extracted into JIGSAW_DIR.
DATA_DIR = './_data'
JIGSAW_DIR = DATA_DIR+'/jigsaw'

JIGSAW_TRAIN_PATH = JIGSAW_DIR+'/train.csv'
JIGSAW_TEST_X_PATH = JIGSAW_DIR+'/test.csv'
JIGSAW_TEST_Y_PATH = JIGSAW_DIR+'/test_labels.csv'

k_api = KaggleApi()
k_api.authenticate()
os.makedirs(JIGSAW_DIR, exist_ok = True)

zip_file = DATA_DIR+'/jigsaw-toxic-comment-classification-challenge.zip'
if not os.path.exists(zip_file):
    # Download the Jigsaw toxic-comment dataset archive straight into DATA_DIR.
    # Without `path=` the archive is written to Kaggle's default download
    # directory, so it would not be found at `zip_file` afterwards.
    k_api.dataset_download_files(
        dataset='julian3833/jigsaw-toxic-comment-classification-challenge',
        path=DATA_DIR,
        unzip=False
    )

# Extract whenever the training CSV is missing, not only right after a fresh
# download, so an interrupted or cleaned-up extraction is repaired on re-run.
if not os.path.exists(JIGSAW_TRAIN_PATH):
    with ZipFile(zip_file) as zf:
        zf.extractall(JIGSAW_DIR)


In [130]:
import pandas as pd

# Load the Jigsaw training CSV into memory. Later cells slice and split this
# frame and read its 'comment_text' column plus the label columns from it.
df  = pd.read_csv(JIGSAW_TRAIN_PATH)

In [131]:
from typing import Any
from transformers import RobertaTokenizer
from transformers import RobertaModel

from torch import nn

class Encoder(nn.Module):
    """Frozen `roberta-base` feature extractor mapping texts to one vector each.

    The backbone is used purely for inference: its parameters are frozen and it
    is kept in eval mode, so the returned embeddings carry no autograd graph
    and can be fed to a separately trained classification head.

    The embedding is the final hidden state of the first token (`<s>`) rather
    than `pooler_output`: loading `roberta-base` into `RobertaModel` reports
    the pooler weights as newly (randomly) initialised, so `pooler_output`
    would be a different random projection on every fresh load.

    Instances are invoked through the standard `nn.Module.__call__`
    (`encoder(texts)`), which dispatches to `forward`.
    """

    def __init__(self) -> None:
        super().__init__()

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaModel.from_pretrained('roberta-base')
        # Freeze the backbone: with no trainable parameters the forward pass
        # builds no autograd graph, so embeddings are plain tensors.
        self.model.requires_grad_(False)
        self.model.eval()

    def forward(self, texts):
        """Encode a batch of strings.

        Args:
            texts: a string or list of strings accepted by the tokenizer.

        Returns:
            Tensor with one embedding row per input text.
        """
        tokens = self.tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True)
        # Keep the token tensors on the same device as the backbone.
        tokens = tokens.to(self.model.device)
        hidden_states = self.model(**tokens)['last_hidden_state']
        # Position 0 is the `<s>` token, used as the sentence representation.
        return hidden_states[:, 0]

In [132]:
import torch.nn as nn

class MultiLabelClassificationHead(nn.Module):
    """Two-layer head producing an independent probability per label.

    Pipeline: Linear(hidden, hidden) -> Tanh -> Dropout(0.1) ->
    Linear(hidden, nlabels) -> Sigmoid.

    The Tanh between the two linear layers matters: without a non-linearity
    the two layers compose into a single linear map and the hidden layer adds
    no modelling capacity. The activation has no parameters, so the
    `state_dict` keys (`input.*`, `output.*`) are unchanged.

    Args:
        hidden_size: width of the incoming embedding and of the hidden layer.
        nlabels: number of labels, i.e. the width of the output.
    """

    def __init__(self, hidden_size, nlabels) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = nlabels

        self.input = nn.Linear(
            in_features=self.hidden_size,
            out_features=self.hidden_size, bias=True)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1, inplace=False)
        self.output = nn.Linear(
            in_features=self.hidden_size,
            out_features=self.output_size, bias=True)
        self.output_probs = nn.Sigmoid()

    def forward(self, x):
        """Map embeddings `(..., hidden_size)` to probabilities `(..., nlabels)`."""
        x = self.activation(self.input(x))
        x = self.dropout(x)
        return self.output_probs(self.output(x))

In [133]:
from torch.utils.data import Dataset
import torch
import numpy as np

class ToxicDataset(Dataset):
    """Dataset yielding `(embedding, label_vector)` pairs from a comment frame.

    Args:
        dataframe: frame with a 'comment_text' column and one column per label.
        encoder: callable taking a list of strings and returning one embedding
            per string (indexable along the first axis).
        labels: list of label column names, in the order they should appear in
            the target vector.
    """

    def __init__(self, dataframe, encoder, labels) -> None:
        super(ToxicDataset, self).__init__()
        self.dataframe = dataframe
        self.encoder = encoder
        self.labels = labels

    def __getitem__(self, index) -> Any:
        """Return the embedding and float32 label tensor for positional row `index`."""
        row = self.dataframe.iloc[index]

        # The encoder is only a feature extractor here: run it without gradient
        # tracking and detach the result so samples do not drag the encoder's
        # autograd graph into the DataLoader and the training step.
        with torch.no_grad():
            embedding = self.encoder([row['comment_text']])[0]
        embedding = embedding.detach()

        labels = row[self.labels].to_numpy()

        return embedding, torch.from_numpy(labels.astype(np.float32))

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.dataframe)

In [134]:
from torch.utils.data import DataLoader
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Label columns, in the order used for the target vector.
labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

encoder_model = Encoder()
# 768 is passed as the head's hidden_size; it has to equal the width of the
# embeddings produced by the encoder.
classify_model = MultiLabelClassificationHead(768, len(labels))
classify_model = classify_model.to(device)

BATCH_SIZE = 16
def to_dataloader(dataframe, shuffle=False):
    """Wrap a comment frame in a `ToxicDataset` and batch it.

    Args:
        dataframe: frame handed to `ToxicDataset` together with the shared
            `encoder_model` and `labels`.
        shuffle: forwarded to `DataLoader`; pass True for training data so the
            batches are re-drawn every epoch. Defaults to False, which keeps
            the frame's row order.

    Returns:
        A `DataLoader` yielding batches of `BATCH_SIZE` samples.
    """
    ds = ToxicDataset(dataframe, encoder_model, labels)
    return DataLoader(ds, batch_size=BATCH_SIZE, shuffle=shuffle)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [135]:
import os

# Root folder for saved weights, with a sub-folder for the Jigsaw models.
MODELS_DIR = './_models'
JIGSAW_MODEL_DIR = MODELS_DIR + '/jigsaw'
# Create the folder (and its parent) up front; a no-op when it already exists.
os.makedirs(name=JIGSAW_MODEL_DIR, exist_ok=True)


def save_model(model, name) -> None:
    """Persist `model`'s state dict as the file `name` inside JIGSAW_MODEL_DIR."""
    weights = model.state_dict()
    target_path = '/'.join((JIGSAW_MODEL_DIR, name))
    torch.save(weights, target_path)


def load_model(model, name) -> Any:
    """Load weights saved under `name` in JIGSAW_MODEL_DIR into `model`, if present.

    Args:
        model: module whose `load_state_dict` receives the stored weights.
        name: checkpoint file name inside JIGSAW_MODEL_DIR.

    Returns:
        The same `model` object. When no checkpoint file exists the model is
        returned unchanged (best-effort load).
    """
    checkpoint_path = JIGSAW_MODEL_DIR+'/'+name
    if os.path.exists(checkpoint_path):
        # Deserialize onto the CPU so a checkpoint written on a GPU machine can
        # still be read on a CPU-only one; load_state_dict then copies the
        # values into the model's own parameters.
        state_dict = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(state_dict)
    return model

In [None]:
import torch

def train_epoch(model, dataloader, lossFn, optimizer):
    """Run one optimisation pass over `dataloader` and return the mean batch loss.

    Args:
        model: module to train; switched to training mode here.
        dataloader: iterable of `(input, target)` tensor pairs supporting `len()`.
        lossFn: callable `(predictions, targets) -> scalar loss tensor`.
        optimizer: optimizer stepped once per batch.

    Returns:
        Sum of the per-batch loss values divided by `len(dataloader)`.
    """
    model.train()
    running_loss = 0

    for features, targets in dataloader:
        optimizer.zero_grad()

        # Batches are moved to the module-level `device` before the forward pass.
        features, targets = features.to(device), targets.to(device)

        batch_loss = lossFn(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.detach().item()

    return running_loss/len(dataloader)

def val_epoch(model, dataloader, lossFn):
    """Evaluate `model` on `dataloader` without gradients; return the mean batch loss.

    Args:
        model: module to evaluate; switched to eval mode here.
        dataloader: iterable of `(input, target)` tensor pairs supporting `len()`.
        lossFn: callable `(predictions, targets) -> scalar loss tensor`.

    Returns:
        Sum of the per-batch loss values divided by `len(dataloader)`.
    """
    # Batches must live on the same device as the model, otherwise the forward
    # pass fails when the model is on CUDA. The device is taken from the model's
    # own parameters; a parameter-less model leaves the batches where they are.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    epoch_loss = 0
    model.eval()
    with torch.no_grad():
        for input_tensor, target_tensor in dataloader:
            if model_device is not None:
                input_tensor = input_tensor.to(model_device)
                target_tensor = target_tensor.to(model_device)

            probs = model(input_tensor)
            loss = lossFn(probs, target_tensor)
            epoch_loss += loss.item()

    epoch_mean_loss = epoch_loss/len(dataloader)
    return epoch_mean_loss



In [137]:
# Smoke test: a single training pass over the first two rows, to check that
# the data pipeline, head, loss and optimizer fit together.
sample_df = df.iloc[:2]
dl = to_dataloader(sample_df)

loss_fn = torch.nn.MSELoss()
optim = torch.optim.Adam(params=classify_model.parameters(), lr=1e-3)

loss = train_epoch(classify_model, dl, loss_fn, optim)
print(loss)

0.24643439054489136


In [138]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The seed is fixed so the split
# (and therefore the reported losses) can be reproduced on a re-run.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_dl = to_dataloader(train_df)
val_dl = to_dataloader(val_df)

# NOTE(review): the head emits sigmoid probabilities, for which
# torch.nn.BCELoss is the conventional choice; MSE is kept to match the
# experiment recorded in this notebook.
loss_fn = torch.nn.MSELoss()
optim = torch.optim.Adam(classify_model.parameters(), lr=0.001)
epochs = 100

# Per-epoch mean losses, kept for later inspection.
train_losses = []
val_losses = []
for e in range(1, epochs+1):
    train_loss = train_epoch(classify_model, train_dl, loss_fn, optim)
    val_loss = val_epoch(classify_model, val_dl, loss_fn)
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Report and checkpoint every fifth epoch.
    if e % 5 == 0:
        print("epoch-{}; losses - train {}, val {}".format(
            e, round(train_loss, 4),
            round(val_loss, 4)))
        # The epoch number is an int, so it is formatted into the file name;
        # concatenating it to a str with `+` raises TypeError.
        save_model(
            classify_model,
            "{}-{}".format(e, type(classify_model).__name__))