## Loading libraries and dataset

In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

import os
import pandas as pd
import numpy as np
import random
import re
import itertools
import argparse
import gc
from tqdm import tqdm
import ast

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from pathlib import Path

import pickle
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
import warnings
from torch.optim import Adam, SGD, AdamW
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Folder holding the preprocessed data, relative to the notebook.
data_dir = './dataset/'
csv_name = 'processed_train.csv'
df = pd.read_csv(data_dir + csv_name)
df.shape

(159571, 2)

* Use 2000 samples to make sure my code runs

In [4]:
# Draw a small subset so the whole pipeline can be smoke-tested quickly.
# A fixed `random_state` keeps the subset identical across kernel restarts,
# and `min` avoids the ValueError `sample` raises when the frame has fewer
# than 2000 rows.
dfs = df.sample(n=min(2000, len(df)), random_state=42).reset_index(drop=True)
dfs.head(5)

Unnamed: 0,comment_text,labels
0,How surprising \n\nHow surprising that the art...,"[0, 0, 0, 0, 0, 0]"
1,Sounds like perpetual motion to me - the price...,"[0, 0, 0, 0, 0, 0]"
2,Your claim that the article supports your cita...,"[0, 0, 0, 0, 0, 0]"
3,Cholas and Vijayanagar empires are worth the m...,"[0, 0, 0, 0, 0, 0]"
4,"hi, what do you know about the possible neande...","[0, 0, 0, 0, 0, 0]"


In [5]:
# Show the dtype of every column in the sampled frame.
dfs.dtypes

comment_text    object
labels          object
dtype: object

In [6]:
# Look at one raw label: it is stored as a string, not as a list, so it is
# converted with `ast.literal_eval` further down.
dfs.loc[0, 'labels']

'[0, 0, 0, 0, 0, 0]'

Split into train and validation sets

In [7]:
# 80/20 split: draw the training rows with a fixed seed, then keep every
# remaining row (re-indexed from 0) for validation.
train_df = dfs.sample(frac=0.8, random_state=42)
val_df = dfs[~dfs.index.isin(train_df.index)].reset_index(drop=True)

## Creating Dataset, DataLoader

In [8]:
class MyDataset(Dataset):
    """Map-style dataset yielding tokenized comments and their label vectors.

    Args:
        df: frame with a ``comment_text`` column and a ``labels`` column. Each
            label is either a list of numbers or its string form such as
            ``'[0, 0, 1, 0, 0, 0]'``.
        tokenizer: tokenizer exposing ``encode_plus`` whose result carries
            ``input_ids`` and ``attention_mask`` tensors with a leading batch
            dimension of size 1.
        max_len: length every sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer: "transformers.PreTrainedTokenizerBase", max_len = 128):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Cache the columns as arrays so __getitem__ indexes by position,
        # independent of the frame's index.
        self.text = self.df.comment_text.values
        self.label = self.df.labels.values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label`` for row ``idx``."""
        encoded = self.tokenizer.encode_plus(
            text = self.text[idx],
            text_pair = None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length = self.max_len,
            return_tensors='pt'
        )

        # Labels read from CSV arrive as strings; parse them safely. Values
        # that are already sequences are used as-is.
        raw_label = self.label[idx]
        target = ast.literal_eval(raw_label) if isinstance(raw_label, str) else raw_label

        return {
            # Drop only the leading batch dimension added by return_tensors='pt'
            # so the DataLoader can stack items. A bare squeeze() would also
            # collapse a sequence dimension of size 1.
            'input_ids' : encoded.input_ids.squeeze(0),
            'attention_mask' : encoded.attention_mask.squeeze(0),
            'label' : torch.tensor(target, dtype = torch.float)
        }

In [9]:
# Load the tokenizer from the local checkpoint folder and wrap both splits;
# every comment is padded/truncated to 200 tokens.
tokenizer = AutoTokenizer.from_pretrained('./bert')
train, val = (MyDataset(frame, tokenizer, 200) for frame in (train_df, val_df))

In [10]:
# Batches of 8. Only the training data is shuffled; validation keeps the
# dataset order so predictions line up with the rows of `val_df`.
train_dataloader = DataLoader(dataset=train, batch_size=8, shuffle=True)
val_dataloader = DataLoader(dataset=val, batch_size=8, shuffle=False)

## Model

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# ("gpu" is not a valid torch device string and makes `.to(DEVICE)` raise.)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
class MyBert(nn.Module):
    """Pretrained encoder with dropout and a linear multi-label head.

    Args:
        model_path: checkpoint name or local directory passed to
            ``AutoModel.from_pretrained``. ``None`` selects ``'bert-base-uncased'``.
        num_labels: number of output logits per example.
    """

    def __init__(self, model_path = None, num_labels = 6):
        super().__init__()
        if model_path is None:
            model_path = 'bert-base-uncased'
        self.bert = AutoModel.from_pretrained(model_path)
        self.drop_out = nn.Dropout(0.1)
        # Size the head from the loaded encoder rather than assuming 768.
        self.lin = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, ids, mask):
        """Return raw logits (no sigmoid) for token ids ``ids`` and attention mask ``mask``."""
        # Select the pooled output by name: positional [-1] points at a
        # different tensor once hidden states or attentions are also returned.
        encoded = self.bert(input_ids=ids, attention_mask=mask, return_dict=True)
        pooler = self.drop_out(encoded.pooler_output)
        return self.lin(pooler)
# Build the classifier from the local checkpoint, then move it to the target device.
model = MyBert(model_path='./bert')
model = model.to(DEVICE)

In [13]:
# AdamW over every model parameter. The loss takes raw logits and applies the
# sigmoid itself, which is why the model has no final activation.
optimizer = AdamW(model.parameters(), lr=3e-5)
loss_fn = F.binary_cross_entropy_with_logits

## Training

In [14]:
from utils import AverageMeter

In [16]:
def train(epochs = 1):
    """Fine-tune the global ``model`` on ``train_dataloader``.

    Relies on the module-level ``model``, ``optimizer``, ``loss_fn``,
    ``DEVICE`` and ``train_dataloader``. The progress bar shows ``losses.avg``
    from ``utils.AverageMeter`` (presumably the running mean loss of the
    current epoch; confirm against ``utils``).

    Args:
        epochs: number of full passes over ``train_dataloader``.
    """
    for epoch in range(epochs):
        model.train()
        tk0 = tqdm(train_dataloader, desc = 'Training ...')
        losses = AverageMeter()
        for batch in tk0:
            ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)
            label = batch['label'].to(DEVICE)

            optimizer.zero_grad()
            output = model(ids, mask)
            loss = loss_fn(output, label)
            loss.backward()
            optimizer.step()

            # Track a plain float, not the loss tensor: accumulating the
            # tensor keeps autograd history alive across iterations and makes
            # the progress bar print `tensor(...)` instead of a number.
            losses.update(loss.item(), ids.shape[0])
            tk0.set_postfix(epoch = epoch, loss = losses.avg)
# Fine-tune for five passes over the training loader.
train(epochs=5)


        

Training ...: 100%|█| 200/200 [00:36<00:00,  5.50it/s, epoch=0, loss=tensor(0.04
Training ...: 100%|█| 200/200 [00:37<00:00,  5.37it/s, epoch=1, loss=tensor(0.03
Training ...: 100%|█| 200/200 [00:36<00:00,  5.42it/s, epoch=2, loss=tensor(0.02
Training ...: 100%|█| 200/200 [00:36<00:00,  5.42it/s, epoch=3, loss=tensor(0.02
Training ...: 100%|█| 200/200 [00:37<00:00,  5.39it/s, epoch=4, loss=tensor(0.01


In [None]:
all_test_pred = []

def test(epoch):
    model.eval()
    
    with torch.inference_mode():
    
        for _, data in tqdm(enumerate(val_dataloader, 0)):

            ids = data['input_ids'].to(DEVICE)
            mask = data['attention_mask'].to(DEVICE)
            outputs = model(ids, mask)
            probas = torch.sigmoid(outputs)

            import pdb; pdb.set_trace()
            all_test_pred.append(probas)
            
    return probas
# `test` never reads its `epoch` argument, so pass a placeholder rather than
# the model object, which made the call look as if the model were an input.
probas = test(None)

0it [00:00, ?it/s]

> [0;32m/tmp/ipykernel_18134/1787273927.py[0m(16)[0;36mtest[0;34m()[0m
[0;32m     14 [0;31m[0;34m[0m[0m
[0m[0;32m     15 [0;31m            [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 16 [0;31m            [0mall_test_pred[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mprobas[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     17 [0;31m[0;34m[0m[0m
[0m[0;32m     18 [0;31m    [0;32mreturn[0m [0mprobas[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  probas


tensor([[1.8960e-03, 9.9010e-04, 1.0140e-03, 1.0224e-03, 9.2756e-04, 1.4257e-03],
        [9.8673e-01, 1.4279e-01, 8.5408e-01, 2.7308e-02, 9.5657e-01, 7.6640e-02],
        [1.8749e-03, 9.9681e-04, 1.0238e-03, 1.0639e-03, 9.1174e-04, 1.4644e-03],
        [2.0996e-03, 1.0024e-03, 1.0946e-03, 9.7992e-04, 9.8670e-04, 1.4091e-03],
        [1.8507e-03, 1.0170e-03, 1.0260e-03, 1.0109e-03, 9.2857e-04, 1.4463e-03],
        [2.2345e-03, 9.9958e-04, 1.0309e-03, 1.0249e-03, 9.8413e-04, 1.4357e-03],
        [1.8208e-03, 1.0390e-03, 1.0214e-03, 1.0299e-03, 9.7679e-04, 1.4476e-03],
        [1.8394e-03, 1.0317e-03, 1.0269e-03, 9.9866e-04, 9.6371e-04, 1.4379e-03]],
       device='cuda:0')


ipdb>  probas.shape


torch.Size([8, 6])


ipdb>  probas[0]


tensor([0.0019, 0.0010, 0.0010, 0.0010, 0.0009, 0.0014], device='cuda:0')
