In [1]:
import os
import gc
import random
from math import ceil, log, sqrt
from collections import deque, defaultdict, Counter
from time import time
from datetime import timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader,\
                             WeightedRandomSampler
import transformers
from transformers import RobertaTokenizer,\
                         RobertaModel,\
                         AdamW, get_linear_schedule_with_warmup

In [2]:
seed = 42
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

np.set_printoptions(precision=4, suppress=True)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
df['excerpt'] = df['excerpt'].str.replace('\n', ' ')
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [5]:
def create_fold(df, num_splits, column):
    
    df = df.sample(frac=1).reset_index(drop=True)
    num_bins = int(ceil(log(len(df), 2) + 1))
    df['bin'] = pd.cut(df[column], num_bins, labels=False)
    
    skf = StratifiedKFold(n_splits=num_splits)
    df['fold'] = -1
    
    for fold, (_, idxs) in enumerate(skf.split(df, df['bin'].values)):
        df.loc[idxs, 'fold'] = fold
    # df.drop('bin', axis=1, inplace=True)
    
    return df


num_folds = 5
df = create_fold(df, num_folds, 'target')

In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class CLRPDataset(Dataset):
    
    def __init__(self, excerpts, targets):
        
        self.excerpts = excerpts
        self.targets = targets
        
    def __getitem__(self, idx):
        
        excerpt = self.excerpts[idx]
        encode_excerpt = tokenizer.encode_plus(excerpt,
                                               padding=False,
                                               return_attention_mask=True)
        
        ids = torch.tensor(encode_excerpt['input_ids'], 
                           dtype=torch.long)
        mask = torch.tensor(encode_excerpt['attention_mask'], 
                            dtype=torch.long)
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        
        return {'ids': ids, 'mask': mask, 'target': target}
    
    def __len__(self):
        return len(self.excerpts)

class CLRPCollate:
    
    def __init__(self):
        pass
    
    def __call__(self, batch):
        
        res = {'ids': [], 'mask': [], 
               'target': [], 'bin': []}
        
        max_len = 0
        
        for example in batch:
            res['ids'].append(example['ids'].numpy())
            res['mask'].append(example['mask'].numpy())
            res['target'].append(example['target'])
            max_len = max(max_len, len(res['ids'][-1]))
        
        for idx in range(len(batch)):
            ones = np.ones(max_len - len(res['ids'][idx]))
            res['ids'][idx] = np.concatenate([res['ids'][idx], ones])
            ones = np.ones(max_len - len(res['mask'][idx]))
            res['mask'][idx] = np.concatenate([res['mask'][idx], ones])
            
        res['ids'] = torch.tensor(res['ids'], dtype=torch.long)
        res['mask'] = torch.tensor(res['mask'], dtype=torch.long)
        res['target'] = torch.tensor(res['target'], dtype=torch.float)
        
        return res
        

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
class Averager:
    
    def __init__(self, maxlen=16):
        self.values = deque(maxlen=maxlen)
        self.total = 0.
        self.count = 0
        return
    
    def update(self, value):
        self.values.append(value)
        self.total += value
        self.count += 1
        return
    
    @property
    def avg(self):
        return np.mean(self.values)
    
    @property
    def global_avg(self):
        return self.total / self.count
    
    def __str__(self):
        return f'{self.avg:.3f} ({self.global_avg:.3f})'

class Traininglog:
    
    def __init__(self, sep=' '):
        self.metrics = defaultdict(Averager)
        self.sep = sep
        return
    
    def update(self, **kwargs):
        for key, value in kwargs.items():
            assert isinstance(value, (float, int))
            self.metrics[key].update(value)
        return
    
    def __str__(self):
        loss_str = []
        for key, value in self.metrics.items():
            loss_str.append(f'{key}: {value}')
        return self.sep.join(loss_str)
    
    def run_iterations(self, iterable, print_freq, header=''):
        step = 0
        start = time()
        log_msg = self.sep.join([header, '[{0:02d}/{1}]', 'eta: {eta}',
                                 '{metrics}','time: {time}', 'data: {data}'])
        data_time = Averager()
        iter_time = Averager()
        end = time()
        
        for obj in iterable:
            data_time.update(time() - end)
            yield obj
            iter_time.update(time() - end)
            if (step + 1) % print_freq == 0 or step == len(iterable) - 1:
                eta_secs = iter_time.global_avg * (len(iterable) - 1)
                eta_str = f'{timedelta(seconds=int(eta_secs))}'
                print(log_msg.format(step, len(iterable), eta=eta_str,
                                     metrics=self, time=iter_time, 
                                     data=data_time))
            end = time()
            step += 1
            
        total_time = time() - start
        total_time = f'{timedelta(seconds=int(total_time))}'
        print(f'Total time: {total_time}')
        return

In [8]:
class AttentionHead(nn.Module):
    
    def __init__(self, embed_dim, hidden_dim):
        super(AttentionHead, self).__init__()
        self.W_qk = nn.Linear(embed_dim, hidden_dim)
        self.W_v = nn.Linear(hidden_dim, 1)
    
    def forward(self, hidden_states):
        alpha = self.W_v(torch.tanh(self.W_qk(hidden_states)))
        score = torch.softmax(alpha, dim=1)
        context = score * hidden_states
        return torch.sum(context, dim=1)
        
class CLRPModel(nn.Module):
    
    def __init__(self):
        super(CLRPModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.attention_head = AttentionHead(768, 512)
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Linear(768, 1)
    
    def forward(self, input_ids, attention_mask):
        hidden_states = self.roberta(input_ids, attention_mask).last_hidden_state
        return self.regressor(self.dropout(self.attention_head(hidden_states)))

In [9]:
###############################################################################

def loss_func(output, target):
    return torch.sqrt(nn.MSELoss()(output, target))


def validation(val_dataloader, model):
    
    log = Traininglog()
    header = 'Validation'
    run_target = []
    run_pred = []
    
    with torch.no_grad():
        model.eval()
        
        for batch in log.run_iterations(val_dataloader, 50, header=header):

            ids = batch['ids'].to(device, non_blocking=True)
            mask = batch['mask'].to(device, non_blocking=True)
            end = time()
            output = model(ids, mask)
            inference_time = time() - end
            output = output.squeeze(-1)
            run_pred.append(output.to('cpu'))
            run_target.append(batch['target'])
            log.update(inference_time=inference_time)
            
            del mask, ids
            
        run_target = np.concatenate(run_target)
        run_pred = np.concatenate(run_pred)
    return np.sqrt(mean_squared_error(run_target, run_pred))


def train_eval_one_epoch(fold, epoch, train_dataloader, val_dataloader, 
                         eval_steps, min_loss, model, optimizer, lr_scheduler):
    
    model.train()
    
    log = Traininglog()
    header = f'Epoch[{epoch}]'
    
    steps = 0
    min_loss = min_loss
    
    for batch in log.run_iterations(train_dataloader, eval_steps, header=header):
        
        optimizer.zero_grad()
        
        ids = batch['ids'].to(device, non_blocking=True)
        mask = batch['mask'].to(device, non_blocking=True)
        target = batch['target'].to(device, non_blocking=True)
        
        output = model(ids, mask)
        output = output.squeeze(-1)
        
        loss = loss_func(output, target)
                
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        log.update(train_loss=loss.item())
        del loss, output, target, mask, ids
        
        steps += 1
        if (steps + 1) % eval_steps == 0 or steps == len(train_dataloader):
            val_loss = validation(val_dataloader, model)
            print(f'val_loss: {val_loss}')
            if val_loss < min_loss:
                print(f'min_loss decreased from {min_loss} to {val_loss}.')
                print(f'saving model...')
                torch.save(model.state_dict(), f'roberta-baseline-fold{fold}.pth')
                min_loss = val_loss
            
    return min_loss
        

In [10]:
batch_size = 16

for fold in range(num_folds):
    
    train_df = df[(df.fold != fold)].reset_index(drop=True)
    val_df = df[(df.fold == fold)].reset_index(drop=True)
    
    counter = dict(Counter(train_df['bin']))
    weights = [counter[i] for i in train_df['bin']]
    
    train_dataset = CLRPDataset(train_df['excerpt'], train_df['target'])
    val_dataset = CLRPDataset(val_df['excerpt'], val_df['target'])
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                  sampler=WeightedRandomSampler(weights, 
                                                                len(train_df)),
                                  collate_fn = CLRPCollate(),
                                  num_workers=4, pin_memory=True, drop_last=True)
    
    val_dataloader = DataLoader(val_dataset, batch_size=2 * batch_size,
                            collate_fn = CLRPCollate(),
                            shuffle=False,num_workers=4, pin_memory=True)
    
    # model definition
    
    model = CLRPModel()
    model.to(device)
    
    lr = 2e-5
    epochs = 3
    eval_steps = 10
    
    optimizer = AdamW(model.parameters(), lr, betas=(0.9, 0.999), 
                      weight_decay=1e-2)
    train_steps = len(train_dataset) // batch_size * epochs
    num_steps = int(train_steps * 0.1)
    lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, 
                                                   train_steps)
    min_loss = 1.0
    print('#' * 30, f'fold-{fold}', '#' * 30)
    for epoch in range(epochs):
        min_loss = train_eval_one_epoch(fold, epoch, train_dataloader, val_dataloader,\
                                        eval_steps, min_loss, model, optimizer, lr_scheduler)
    print(f'fold-{fold}: min_loss={min_loss}')
    
    del model, optimizer, lr_scheduler
    gc.collect()
    torch.cuda.empty_cache()

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

############################## fold-0 ##############################
Validation [17/18] eta: 0:00:05 inference_time: 0.028 (0.032) time: 0.274 (0.309) data: 0.000 (0.035)
Total time: 0:00:05
val_loss: 1.5560849905014038
Epoch[0] [09/141] eta: 0:02:38 train_loss: 1.434 (1.434) time: 1.131 (1.131) data: 0.048 (0.048)
Validation [17/18] eta: 0:00:05 inference_time: 0.024 (0.032) time: 0.274 (0.308) data: 0.001 (0.034)
Total time: 0:00:05
val_loss: 1.0819593667984009
Epoch[0] [19/141] eta: 0:02:28 train_loss: 1.137 (1.244) time: 1.130 (1.064) data: 0.000 (0.024)
Validation [17/18] eta: 0:00:05 inference_time: 0.019 (0.023) time: 0.273 (0.300) data: 0.000 (0.026)
Total time: 0:00:05
val_loss: 1.0015506744384766
Epoch[0] [29/141] eta: 0:02:24 train_loss: 0.864 (1.107) time: 1.121 (1.033) data: 0.000 (0.016)
Validation [17/18] eta: 0:00:05 inference_time: 0.020 (0.024) time: 0.273 (0.313) data: 0.000 (0.039)
Total time: 0:00:05
val_loss: 0.9558019042015076
min_loss decreased from 1.0 to 0.955