In [2]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os
import re
import json
import glob
from copy import deepcopy
from collections import defaultdict
from functools import partial
from imblearn.under_sampling import RandomUnderSampler

import pandas as pd
import numpy as np

from nltk import sent_tokenize

import matplotlib.pyplot as plt
import seaborn as sns

import unidecode

from tqdm.notebook import tqdm
import string

from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torchcrf import CRF

from sklearn.model_selection import train_test_split

%matplotlib inline

device='cuda' if torch.cuda.is_available() else 'cpu'
model_checkpoint='bert-base-uncased'
tokenizer=AutoTokenizer.from_pretrained(model_checkpoint)

def clean_text(txt):
    return [re.sub('[^A-Za-z0-9]+', ' ', t.lower()) for t in txt]

torch.manual_seed(1)

I0614 21:16:16.626769  9896 filelock.py:274] Lock 1832085578248 acquired on C:\Users\mt601/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock
Downloading: 100%|██████████| 481/481 [00:00<00:00, 241kB/s]
I0614 21:16:16.973569  9896 filelock.py:318] Lock 1832085578248 released on C:\Users\mt601/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock
I0614 21:16:17.313405  9896 filelock.py:274] Lock 1832085578920 acquired on C:\Users\mt601/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 3.61MB/s]
I0614 21:16:17.910042  9896 filelock.py:318] Lock 1832085578920 released on C:\Users\mt601/.cache\hug

<torch._C.Generator at 0x1aaff993330>

In [2]:
# train_df=pd.read_csv('../input/show-me-the-data-preparer/processed_train_df.csv')
# sample_sub = pd.read_csv('../input/show-me-the-data-preparer/processed_test_df.csv')

train_df=pd.read_csv('data/processed/processed_train_df.csv').reset_index()
sample_sub = pd.read_csv('data/processed/processed_test_df.csv')

train_df.drop_duplicates(inplace=True)
# train_df.dropna(inplace=True)
train_df.label=train_df.label.fillna('')
labels=[int(len(label)>0) for label in train_df.label]

rus=RandomUnderSampler()
X_res, y_res = rus.fit_resample(train_df.iloc[:,:2], labels)
train_df=pd.merge(train_df,X_res.drop('text',axis=1),on='index').set_index('index')

train_df=train_df[[len(str(t)) > 50 for t in train_df.text]]
train_df['cls_label']=[int(len(t)>0) for t in train_df.label]

train_df.reset_index(drop=True,inplace=True)

X_train, X_val, y_train, y_val = train_test_split(train_df.text, train_df.label, random_state=1821)
y_train_cls=train_df.cls_label[X_train.index]
y_val_cls=train_df.cls_label[X_val.index]

In [3]:
START_TAG=tokenizer.cls_token
STOP_TAG=tokenizer.sep_token
PAD_TAG=tokenizer.pad_token

tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4, PAD_TAG:-1}

class my_model(nn.Module):
    def __init__(self,backbone,tag_to_ix,cls_loss=True):
        super(my_model,self).__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        # feature extraction
        self.backbone=backbone
        self.hidden_dim=backbone(**tokenizer('test',return_tensors='pt'))[0].shape[-1]
        # Maps the output of the backbone into tag space.
        self.hidden2tag = nn.Linear(self.hidden_dim, self.tagset_size)
        self.aux_fc = nn.Linear(self.hidden_dim,1) if cls_loss else None
        # CRF
        self.crf = CRF(self.tagset_size, batch_first=True)
        # loss func
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.cls_loss=cls_loss

    def forward(self, inputs, labels=None, cls_labels=None):
        # Get the emission scores from the backbone
        outputs = self.backbone(**inputs).last_hidden_state
        emission = self.hidden2tag(outputs)
        
        # Return result
        if labels is not None:
            crf_loss = -self.crf(nn.functional.log_softmax(emission,2), labels, mask=inputs['attention_mask'].bool(), reduction='mean')
            if self.cls_loss:
                cls_output = self.aux_fc(outputs[:,0,:])
                cls_loss = self.loss_fn(cls_output,cls_labels)
                loss = crf_loss+cls_loss
                return loss
            else:
                return crf_loss
        else:
            prediction = self.crf.decode(emission,mask=inputs['attention_mask'].bool())
            return prediction

def gen_label(text,label):
    encoded_text=[tokenizer.cls_token] + tokenizer.tokenize(text) + [tokenizer.sep_token]
    result=[tokenizer.cls_token] + ['O']*len(tokenizer.tokenize(text)) + [tokenizer.sep_token]
    for label in label:
        if label=='':
            continue
        encoded_label=tokenizer.tokenize(label)
        for i,token in enumerate(encoded_text):
            if token==encoded_label[0] and encoded_text[i:i+len(encoded_label)]==encoded_label:
                result[i]='B'
                result[i+1:i+len(encoded_label)]=['I']*(len(encoded_label)-1)
    return [tag_to_ix[i] for i in result]

def gen_label_batch(texts,labels):
    tags=[gen_label(*inputs)[:max_len] for inputs in zip(texts,labels)]
    max_length=max([len(tag) for tag in tags])
    if tokenizer.padding_side=='right':
        return torch.tensor([tag+[tag_to_ix[PAD_TAG]]*(max_length-len(tag)) for tag in tags], dtype=torch.long, device=device).view(len(texts),-1)
    else:
        return torch.tensor([[tag_to_ix[PAD_TAG]]*(max_length-len(tag))+tag for tag in tags], dtype=torch.long, device=device).view(len(texts),-1)

In [4]:
class my_ds(torch.utils.data.Dataset):
    def __init__(self,text,label,cls_label):
        super().__init__()
        self.text=text
        self.label=label
        self.cls_label=cls_label
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self,idx):
        return self.text.iloc[idx], self.label.iloc[idx], self.cls_label.iloc[idx]

In [5]:
train_ds=my_ds(X_train,y_train,y_train_cls)
val_ds=my_ds(X_val,y_val,y_val_cls)

In [6]:
batch_size=4
max_len=128

train_loader=torch.utils.data.DataLoader(train_ds,shuffle=True,batch_size=batch_size)
val_loader=torch.utils.data.DataLoader(val_ds,shuffle=False,batch_size=batch_size)

In [7]:
from transformers import get_scheduler, AdamW

def train_fn(train_loader,model,optimizer,lr_scheduler):
    model.train()
    train_loss=0
    train_epoch=tqdm(train_loader, total = len(train_loader), leave=False)

    for index, batch in enumerate(train_epoch):
        # gen input
        text=list(batch[0])
        inputs=tokenizer(text,return_tensors='pt',padding=True,truncation=True,max_length=max_len)
        inputs={k:v.to(device) for k,v in inputs.items()}
        # gen label
        data_names=[label.strip().split('|') for label in batch[1]]
        tags=gen_label_batch(text,data_names)
        # cls label
        cls_labels=batch[2].view(-1,1).float().to(device)
        
        # get loss
        loss = model(inputs, tags, cls_labels)
        # optimizing
        loss.backward()
        optimizer.step()   
        lr_scheduler.step()
        optimizer.zero_grad()
             
        train_loss += loss.detach()

        if index % 10 == 0:
            train_epoch.set_description('Step:{} | Loss:{:.3f}'.format(index,loss.item()))
    return train_loss.mean()

def eval_fn(val_loader,model):
    model.eval()
    eval_loss=0
    
    with torch.no_grad():
        for index, batch in enumerate(tqdm(val_loader, total = len(val_loader), leave=False)):
            # gen input
            text=list(batch[0])
            inputs=tokenizer(text,return_tensors='pt',padding=True,truncation=True,max_length=max_len)
            inputs={k:v.to(device) for k,v in inputs.items()}
            # gen label
            data_names=[label.strip().split('|') for label in batch[1]]
            tags=gen_label_batch(text,data_names)
            # cls label
            cls_labels=batch[2].view(-1,1).float().to(device)
            
            # get loss
            loss = model(inputs, tags, cls_labels)
            
            eval_loss += loss.detach()
            
    return eval_loss.mean()

def train_engine(model, epoch, train_loader, val_loader):
    # model = torch.nn.DataParallel(model)
    optimizer = AdamW(model.parameters(), lr=1e-4)
    
    num_training_steps = epoch * len(train_loader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=num_training_steps
    )
    
    best_eval_loss = np.inf
    for i in tqdm(range(epoch)):
        train_loss = train_fn(train_loader,model,optimizer,lr_scheduler)
        eval_loss = eval_fn(val_loader, model)
        
        print(f"Epoch {i} , Train loss: {train_loss}, Eval loss: {eval_loss}")

        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss           
            
            print("Saving the model")
            torch.save(model.state_dict(), f'model_checkpoint/{model_checkpoint}.bin')
            
#     return model, eval_predictions, true_labels 


In [8]:
backbone=AutoModel.from_pretrained(model_checkpoint)
model=my_model(backbone,tag_to_ix).to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
train_engine(model,10,train_loader,val_loader)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/17146 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (942 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/5716 [00:00<?, ?it/s]

Epoch 0 , Train loss: 90757.8359375, Eval loss: 14170.77734375
Saving the model


  0%|          | 0/17146 [00:00<?, ?it/s]

  0%|          | 0/5716 [00:00<?, ?it/s]

Epoch 1 , Train loss: 25551.23828125, Eval loss: 7176.58251953125
Saving the model


  0%|          | 0/17146 [00:00<?, ?it/s]

  0%|          | 0/5716 [00:00<?, ?it/s]

Epoch 2 , Train loss: 16535.87890625, Eval loss: 5907.7978515625
Saving the model


  0%|          | 0/17146 [00:00<?, ?it/s]

  0%|          | 0/5716 [00:00<?, ?it/s]

Epoch 3 , Train loss: 14898.8056640625, Eval loss: 6139.64892578125


  0%|          | 0/17146 [00:00<?, ?it/s]

  0%|          | 0/5716 [00:00<?, ?it/s]

Epoch 4 , Train loss: 14412.4169921875, Eval loss: 6054.23095703125


  0%|          | 0/17146 [00:00<?, ?it/s]

  0%|          | 0/5716 [00:00<?, ?it/s]

Epoch 5 , Train loss: 14121.0439453125, Eval loss: 6393.51904296875


  0%|          | 0/17146 [00:00<?, ?it/s]