## Loading libraries and dataset

In [20]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

import os
import pandas as pd
import numpy as np
import random
import re
import itertools
import argparse
import gc
from tqdm import tqdm
import ast

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from pathlib import Path

import pickle
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
import warnings
from torch.optim import Adam, SGD, AdamW
warnings.filterwarnings("ignore")

In [21]:
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [22]:
# Location of the preprocessed training CSV, relative to the notebook.
data_dir = './dataset/'
df = pd.read_csv(os.path.join(data_dir, 'processed_train.csv'))
df.shape

(159571, 2)

* Use 2000 samples to make sure the code runs

In [23]:
# Draw a small subset so the whole pipeline can be smoke-tested quickly.
# `random_state` makes the subset identical across kernel restarts, and `min`
# avoids the ValueError that DataFrame.sample raises when asked for more rows
# than the frame contains.
dfs = df.sample(n=min(2000, len(df)), random_state=42).reset_index(drop=True)
dfs.head(5)

Unnamed: 0,comment_text,labels
0,"No, There is a larger opposition than that. Al...","[0, 0, 0, 0, 0, 0]"
1,"Material to be included should be relevant, no...","[0, 0, 0, 0, 0, 0]"
2,There is nothing a priori that prevents an adm...,"[0, 0, 0, 0, 0, 0]"
3,Source\nSource: . (talk),"[0, 0, 0, 0, 0, 0]"
4,"""\n\nUser:Iblardi wrote: """"Yet in the above li...","[0, 0, 0, 0, 0, 0]"


In [24]:
# Inspect the column dtypes of the sampled frame before building the dataset.
dfs.dtypes

comment_text    object
labels          object
dtype: object

In [25]:
# Peek at the first label entry: it is stored as the *string* form of a list,
# so it has to go through `ast.literal_eval` before it can become a tensor.
dfs.loc[0, 'labels']

'[0, 0, 0, 0, 0, 0]'

Split into train and validation sets (validation is currently disabled, so every sampled row is used for training)

In [26]:
# frac=1 draws every row of `dfs` in shuffled order (fixed seed), so nothing is
# held out; the validation split on the next line is disabled.
train_df = dfs.sample(frac=1, random_state=42)
# val_df = dfs.drop(train_df.index).reset_index(drop=True)

## Creating Dataset, DataLoader

In [27]:
class MyDataset(Dataset):
    """Map-style dataset pairing tokenized comments with multi-label targets.

    Args:
        df: frame with a ``comment_text`` column and a ``labels`` column. Each
            label entry may be a list-like of numbers or its string form
            (e.g. ``'[0, 1, 0, 0, 0, 0]'``, which is what a CSV round-trip yields).
        tokenizer: object callable as ``tokenizer(text, ...)`` that returns an
            encoding exposing ``input_ids`` and ``attention_mask`` tensors.
        max_len: every sample is padded/truncated to exactly this many tokens.
    """

    def __init__(self, df, tokenizer: "transformers.PreTrainedTokenizerBase", max_len = 128):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.text = self.df.comment_text.values
        self.label = self.df.labels.values

    def __len__(self):
        """Number of samples (rows of the source frame)."""
        return len(self.text)

    @staticmethod
    def _parse_label(raw):
        """Return the label as a plain sequence, parsing the string form if needed."""
        return ast.literal_eval(raw) if isinstance(raw, str) else raw

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (each with the leading
            batch axis of size 1 removed) and ``label`` as a float tensor.
        """
        encoded = self.tokenizer(
            self.text[idx],
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis of size 1. Drop only
            # that axis so the DataLoader can stack samples; a bare squeeze()
            # would also remove the sequence axis when max_len == 1.
            'input_ids': encoded.input_ids.squeeze(0),
            'attention_mask': encoded.attention_mask.squeeze(0),
            # Float targets, as required by binary_cross_entropy_with_logits.
            'label': torch.tensor(self._parse_label(self.label[idx]), dtype=torch.float),
        }

In [28]:
tokenizer = AutoTokenizer.from_pretrained('./bert')
train = MyDataset(train_df, tokenizer, 200)
# val = MyDataset(val_df, tokenizer, 200)

In [29]:
train_dataloader = DataLoader(train, batch_size=8, shuffle=True)
# val_dataloader = DataLoader(val, batch_size=8, shuffle=False)

## Model

In [30]:
# Fall back to the CPU when CUDA is unavailable ("gpu" is not a valid torch
# device string and makes `.to(DEVICE)` raise).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [31]:
class MyBert(nn.Module):
    """Pretrained encoder with a dropout + linear head for multi-label classification.

    Args:
        model_path: local directory or hub id handed to
            ``AutoModel.from_pretrained``; ``None`` falls back to
            ``'bert-base-uncased'``.
        num_labels: number of output logits (one per label).
    """

    def __init__(self, model_path = None, num_labels = 6):
        super().__init__()
        if model_path is None:
            model_path = 'bert-base-uncased'
        self.bert = AutoModel.from_pretrained(model_path)
        self.drop_out = nn.Dropout(0.1)
        # Size the head from the checkpoint's own config instead of assuming 768,
        # so other encoder sizes load without a shape mismatch.
        self.lin = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, ids, mask):
        """Return raw (pre-sigmoid) logits, one per label.

        Args:
            ids: token id tensor, passed to the encoder as ``input_ids``.
            mask: attention mask tensor, passed as ``attention_mask``.
        """
        # Ask for the pooled output by name; positional `[-1]` indexing picks a
        # different tensor when hidden states or attentions are also returned.
        outputs = self.bert(input_ids=ids, attention_mask=mask, return_dict=True)
        pooled = self.drop_out(outputs.pooler_output)
        return self.lin(pooled)
model = MyBert(model_path='./bert').to(DEVICE)

In [32]:
# Multi-label objective: BCE applied directly to the model's raw logits.
loss_fn = F.binary_cross_entropy_with_logits
# AdamW over every parameter of the model (encoder and classification head).
optimizer = AdamW(model.parameters(), lr=3e-5)

## Training

In [33]:
from utils import AverageMeter

In [34]:
def train(epochs = 1):
    """Fine-tune the notebook-level ``model`` for ``epochs`` passes over the data.

    Uses the globals ``model``, ``optimizer``, ``loss_fn``, ``train_dataloader``
    and ``DEVICE``. The progress bar shows ``AverageMeter.avg`` of the batch
    losses for the current epoch (presumably a sample-weighted mean; see
    ``utils.AverageMeter``).

    Args:
        epochs: number of full passes over ``train_dataloader``.
    """
    for epoch in range(epochs):
        model.train()
        tk0 = tqdm(train_dataloader, desc = 'Training ...')
        losses = AverageMeter()
        for batch in tk0:
            ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)
            label = batch['label'].to(DEVICE)

            optimizer.zero_grad()
            output = model(ids, mask)
            loss = loss_fn(output, label)
            loss.backward()
            optimizer.step()

            # Record a plain float: passing the tensor itself would keep each
            # batch's autograd graph alive inside the meter and print
            # `tensor(...)` in the progress bar.
            losses.update(loss.item(), ids.shape[0])
            tk0.set_postfix(epoch = epoch, loss = losses.avg)
train(5)


        

Training ...: 100%|█| 250/250 [00:45<00:00,  5.50it/s, epoch=0, loss=tensor(0.14
Training ...: 100%|█| 250/250 [00:45<00:00,  5.48it/s, epoch=1, loss=tensor(0.06
Training ...: 100%|█| 250/250 [00:46<00:00,  5.35it/s, epoch=2, loss=tensor(0.04
Training ...: 100%|█| 250/250 [00:46<00:00,  5.33it/s, epoch=3, loss=tensor(0.03
Training ...: 100%|█| 250/250 [00:47<00:00,  5.31it/s, epoch=4, loss=tensor(0.02


## Inference

In [35]:
# all_test_pred = []

# def test(epoch):
#     model.eval()
    
#     with torch.inference_mode():
    
#         for _, data in tqdm(enumerate(val_dataloader, 0)):

#             ids = data['input_ids'].to(DEVICE)
#             mask = data['attention_mask'].to(DEVICE)
#             outputs = model(ids, mask)
#             probas = torch.sigmoid(outputs)

#             import pdb; pdb.set_trace()
#             all_test_pred.append(probas)
            
#     return probas
# probas = test(model)