# Libraries

In [39]:
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.logging import TensorBoardLogger
import os
from pytorch_lightning.callbacks import ModelCheckpoint
import pytorch_lightning as pl

In [40]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import RobertaModel, RobertaConfig

warnings.filterwarnings('ignore')

# Seed

In [41]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed = 42
seed_everything(seed)

# Data Loader

> download the pretrained weights of roberta model

**Run only once**

In [42]:
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# model = AutoModelForQuestionAnswering.from_pretrained("roberta-base")

# tokenizer.save_pretrained('roberta-base')
# model.save_pretrained('roberta-base')

In [43]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file='roberta-base/vocab.json', 
            merges_file='roberta-base/merges.txt', 
            lowercase=True,
            add_prefix_space=True)

    def __getitem__(self, index):
        data = {}
        row = self.df.iloc[index]
        
        ids, masks, tweet, offsets = self.get_input_data(row)
        data['ids'] = ids
        data['masks'] = masks
        data['tweet'] = tweet
        data['offsets'] = offsets
        
        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx
        
        return data

    def __len__(self):
        return len(self.df)
    
    def get_input_data(self, row):
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids
        ids = [0] + sentiment_id + [2, 2] + encoding.ids + [2]
        offsets = [(0, 0)] * 4 + encoding.offsets + [(0, 0)]
                
        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [1] * pad_len
            offsets += [(0, 0)] * pad_len
        
        ids = torch.tensor(ids)
        masks = torch.where(ids != 1, torch.tensor(1), torch.tensor(0))
        offsets = torch.tensor(offsets)
        
        return ids, masks, tweet, offsets
        
    def get_target_idx(self, row, tweet, offsets):
        selected_text = " " +  " ".join(row.selected_text.lower().split())

        len_st = len(selected_text) - 1
        idx0 = None
        idx1 = None

        for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
            if " " + tweet[ind: ind+len_st] == selected_text:
                idx0 = ind
                idx1 = ind + len_st - 1
                break

        char_targets = [0] * len(tweet)
        if idx0 != None and idx1 != None:
            for ct in range(idx0, idx1 + 1):
                char_targets[ct] = 1

        target_idx = []
        for j, (offset1, offset2) in enumerate(offsets):
            if sum(char_targets[offset1: offset2]) > 0:
                target_idx.append(j)

        start_idx = target_idx[0]
        end_idx = target_idx[-1]
        
        return start_idx, end_idx
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    train_loader = torch.utils.data.DataLoader(
        TweetDataset(train_df), 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=2,
        drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        TweetDataset(val_df), 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=2)

    dataloaders_dict = {"train": train_loader, "val": val_loader}

    return dataloaders_dict

def get_test_loader(df, batch_size=32):
    loader = torch.utils.data.DataLoader(
        TweetDataset(df), 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=2)    
    return loader

## Dataloaders

In [44]:
train_df = pd.read_csv('data/train.csv')

In [45]:
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)


In [46]:
batch_size = 64
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [47]:
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df.sentiment), start=1): 
    dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx, batch_size)
    break

## Loss Function

In [48]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    ce_loss = nn.CrossEntropyLoss()
    start_loss = ce_loss(start_logits, start_positions)
    end_loss = ce_loss(end_logits, end_positions)    
    total_loss = start_loss + end_loss
    return total_loss

# Model

In [49]:
class TweetModel(pl.LightningModule):
    def __init__(self, lr):
        super(TweetModel, self).__init__()
        self.lr = lr
        config = RobertaConfig.from_pretrained(
            'roberta-base/config.json', output_hidden_states=True)    
        self.roberta = RobertaModel.from_pretrained(
            'roberta-base/pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)
        self.criterion = loss_fn

    def forward(self, input_ids, attention_mask):
        _, _, hs = self.roberta(input_ids, attention_mask)
         
        x = torch.stack([hs[-1], hs[-2], hs[-3], hs[-4]])
        x = torch.mean(x, 0)
        x = self.dropout(x)
        x = self.fc(x)
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
                
        return start_logits, end_logits
    
    def training_step(self, batch, batch_idx):
        ids = batch['ids'].cuda()
        masks = batch['masks'].cuda()
        tweet = batch['tweet']
        offsets = batch['offsets'].cpu().detach().numpy()
        start_idx = batch['start_idx'].cuda()
        end_idx = batch['end_idx'].cuda()
        
        start_logits, end_logits = self.forward(ids, masks)
        loss = self.criterion(start_logits, end_logits, start_idx, end_idx)
        
        tensorboard_logs = {'train_loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}
    
    def configure_optimizers(self):
        return torch.optim.AdamW([p for p in self.parameters() if p.requires_grad],  
                                 lr=self.lr, betas=(0.9, 0.999))

    
    def validation_step(self, batch, batch_idx):
        ids = batch['ids'].cuda()
        masks = batch['masks'].cuda()
        tweet = batch['tweet']
        offsets = batch['offsets'].cpu().detach().numpy()
        start_idx = batch['start_idx'].cuda()
        end_idx = batch['end_idx'].cuda()
        start_logits, end_logits = self.forward(ids, masks)
        loss = self.criterion(start_logits, end_logits, start_idx, end_idx)
        
        start_idx = start_idx.cpu().detach().numpy()
        end_idx = end_idx.cpu().detach().numpy()
        start_logits = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
        end_logits = torch.softmax(end_logits, dim=1).cpu().detach().numpy()
        
        batch_jaccard = 0
        for i in range(len(ids)):                        
            jaccard_score = compute_jaccard_score(
                tweet[i],
                start_idx[i],
                end_idx[i],
                start_logits[i], 
                end_logits[i], 
                offsets[i])
            batch_jaccard += jaccard_score
        
        batch_len = len(ids)
        
        return {'val_loss':loss, 'val_jaccard':batch_jaccard,
               'batch_len':batch_len}
    
    def validation_epoch_end(self, outputs):
        total_len = sum([batch['batch_len'] for batch in outputs])
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        total_jaccard = sum([x['val_jaccard'] for x in outputs])
        
        total_jaccard = torch.tensor(total_jaccard / total_len)
        total_jaccard = torch.tensor([total_jaccard])
        
        tensorboard_logs = {'val_loss': avg_loss, 'val_jaccard': total_jaccard}
        
        return {'val_loss': avg_loss, 'log': tensorboard_logs,
               'progress_bar': tensorboard_logs}
    
    def train_dataloader(self):
        return dataloaders_dict['train']
    
    def val_dataloader(self):
        return dataloaders_dict['val']
    
    def test_step(self, batch, batch_id):
        ids     = batch['ids'].cuda()
        masks   = batch['masks'].cuda()
        tweet   = batch['tweet'].cuda()
        offsets = batch['offsets'].cpu().detach().numpy()
        
        output = self.forward(ids, masks)
        start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
        end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        
        return {'start_pred':start_pred, 'end_pred': end_pred}

# Evaluation Function

In [50]:
def get_selected_text(text, start_idx, end_idx, offsets):
    selected_text = ""
    for ix in range(start_idx, end_idx + 1):
        selected_text += text[offsets[ix][0]: offsets[ix][1]]
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            selected_text += " "  # add spaces for words in b/w
    return selected_text

def jaccard(str1, str2):
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    ''' out model outputs 2 vectors
    start logits -> probability that a token is the start
    end logits   -> probability that a token is the ending'''
    start_pred = np.argmax(start_logits)  # start pred
    end_pred = np.argmax(end_logits)      # end pred
    if start_pred > end_pred:
        pred = text
    else:
        pred = get_selected_text(text, start_pred, end_pred, offsets)
        
    true = get_selected_text(text, start_idx, end_idx, offsets)
    
    return jaccard(true, pred)

# Training

In [51]:
def get_total_params(model):
    s = 0
    for param in model.parameters():
        s += param.numel()
    return s

def get_trainable_params(model):
    s = 0
    for param in model.parameters():
        if param.requires_grad:
            s += param.numel()
    return s

def print_trainable_params(model):
    
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

def finetune_model(model, last_n):
    '''freezes all the layers in the model except last_n number of layers
    
    Input:
    
    model  :  pytorch model
    last_n    :  number of last layers to unfreeze
    '''
    
    total_layers = len(list(model.parameters()))
    
    for enum, param in enumerate(model.parameters()):
        param.requires_grad = False
        if enum + last_n >= total_layers:
            param.requires_grad = True
    

In [None]:

tm = TweetModel(lr=3e-3)

finetune_model(tm, 25)

print_trainable_params(tm)

get_trainable_params(tm)

name = 'roberta-finetune-last25layer'

logger = TensorBoardLogger(
                save_dir='ts-logs',
                name = name
            )

early_stopping = EarlyStopping('val_loss',patience=5)

checkpoint_callback = ModelCheckpoint(filepath=f'saved_models/{name}')


trainer = pl.Trainer(min_epochs=10,
                     logger=logger,
                     early_stop_callback=early_stopping,
                     checkpoint_callback=checkpoint_callback)

tm.cuda()
trainer.fit(tm)

GPU available: True, used: False
No environment variable for node rank defined. Set as 0.

    | Name                                                | Type              | Params
--------------------------------------------------------------------------------------
0   | roberta                                             | RobertaModel      | 124 M 
1   | roberta.embeddings                                  | RobertaEmbeddings | 39 M  
2   | roberta.embeddings.word_embeddings                  | Embedding         | 38 M  
3   | roberta.embeddings.position_embeddings              | Embedding         | 394 K 
4   | roberta.embeddings.token_type_embeddings            | Embedding         | 768   
5   | roberta.embeddings.LayerNorm                        | LayerNorm         | 1 K   
6   | roberta.embeddings.dropout                          | Dropout           | 0     
7   | roberta.encoder                                     | BertEncoder       | 85 M  
8   | roberta.encoder.layer            

roberta.encoder.layer.10.intermediate.dense.bias
roberta.encoder.layer.10.output.dense.weight
roberta.encoder.layer.10.output.dense.bias
roberta.encoder.layer.10.output.LayerNorm.weight
roberta.encoder.layer.10.output.LayerNorm.bias
roberta.encoder.layer.11.attention.self.query.weight
roberta.encoder.layer.11.attention.self.query.bias
roberta.encoder.layer.11.attention.self.key.weight
roberta.encoder.layer.11.attention.self.key.bias
roberta.encoder.layer.11.attention.self.value.weight
roberta.encoder.layer.11.attention.self.value.bias
roberta.encoder.layer.11.attention.output.dense.weight
roberta.encoder.layer.11.attention.output.dense.bias
roberta.encoder.layer.11.attention.output.LayerNorm.weight
roberta.encoder.layer.11.attention.output.LayerNorm.bias
roberta.encoder.layer.11.intermediate.dense.weight
roberta.encoder.layer.11.intermediate.dense.bias
roberta.encoder.layer.11.output.dense.weight
roberta.encoder.layer.11.output.dense.bias
roberta.encoder.layer.11.output.LayerNorm.weigh

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


    | Name                                                | Type              | Params
--------------------------------------------------------------------------------------
0   | roberta                                             | RobertaModel      | 124 M 
1   | roberta.embeddings                                  | RobertaEmbeddings | 39 M  
2   | roberta.embeddings.word_embeddings                  | Embedding         | 38 M  
3   | roberta.embeddings.position_embeddings              | Embedding         | 394 K 
4   | roberta.embeddings.token_type_embeddings            | Embedding         | 768   
5   | roberta.embeddings.LayerNorm                        | LayerNorm         | 1 K   
6   | roberta.embeddings.dropout                          | Dropout           | 0     
7   | roberta.encoder                                     | BertEncoder       | 85 M  
8   | roberta.encoder.layer                               | ModuleList        | 85 M  
9   | roberta.encoder.layer.0             

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Detected KeyboardInterrupt, attempting graceful shutdown...





Traceback (most recent call last):
  File "/data/nithish/anaconda3/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/data/nithish/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/data/nithish/anaconda3/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/data/nithish/anaconda3/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


1

In [26]:
dataset = dataloaders_dict['train']

In [46]:
num_epochs = 6
batch_size = 64
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [47]:
%%time


train_df = pd.read_csv('data/train.csv')
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df.sentiment), start=1): 
    if fold==1:
        print(f'Fold: {fold}')

        model = TweetModel()
        optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
        criterion = loss_fn    
        dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx, batch_size)

        train_model(
            model, 
            dataloaders_dict,
            criterion, 
            optimizer, 
            num_epochs,
            f'roberta_fold{fold}.pth')

Fold: 4
Epoch 1/6 | train | Loss: 2.2849 | Jaccard: 0.6532
Epoch 1/6 |  val  | Loss: 1.5838 | Jaccard: 0.7264
Epoch 2/6 | train | Loss: 1.6371 | Jaccard: 0.7124
Epoch 2/6 |  val  | Loss: 1.5323 | Jaccard: 0.7196
Epoch 3/6 | train | Loss: 1.4999 | Jaccard: 0.7298
Epoch 3/6 |  val  | Loss: 1.5044 | Jaccard: 0.7290
Epoch 4/6 | train | Loss: 1.3718 | Jaccard: 0.7471
Epoch 4/6 |  val  | Loss: 1.5855 | Jaccard: 0.7230
Epoch 5/6 | train | Loss: 1.2490 | Jaccard: 0.7646
Epoch 5/6 |  val  | Loss: 1.6754 | Jaccard: 0.7265
Epoch 6/6 | train | Loss: 1.1175 | Jaccard: 0.7840
Epoch 6/6 |  val  | Loss: 1.8250 | Jaccard: 0.7147
Fold: 5


KeyboardInterrupt: 

# Inference

In [None]:
%%time

test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)
predictions = []
models = []
for fold in range(skf.n_splits):
    model = TweetModel()
    model.cuda()
    model.load_state_dict(torch.load(f'roberta_fold{fold+1}.pth'))
    model.eval()
    models.append(model)

for data in test_loader:
    ids = data['ids'].cuda()
    masks = data['masks'].cuda()
    tweet = data['tweet']
    offsets = data['offsets'].numpy()

    start_logits = []
    end_logits = []
    for model in models:
        with torch.no_grad():
            output = model(ids, masks)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)
    for i in range(len(ids)):    
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        if start_pred > end_pred:
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        predictions.append(pred)

# Submission

In [None]:
sub_df = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
sub_df['selected_text'] = predictions
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('!!!!', '!') if len(x.split())==1 else x)
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('..', '.') if len(x.split())==1 else x)
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('...', '.') if len(x.split())==1 else x)
sub_df.to_csv('submission.csv', index=False)
sub_df.head()