In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.9 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 62.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 62.9 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [2]:
import numpy as np
import pandas as pd

from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoConfig

import matplotlib.pyplot as plt 
import os
import random

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import AdamW

from tqdm.notebook import tqdm
import gc
gc.enable()

import warnings
warnings.simplefilter('ignore')

In [3]:
config = {
    'train_batch_size': 16,
    'valid_batch_size': 32,
    'max_len': 314,
    'nfolds': 5,
    'seed': 42,
}

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'{device} is used')

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONASSEED'] = str(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.detarministic = True 
    torch.backends.cudnn.benchmark = True 

seed_everything(seed=42)

cuda is used


In [5]:
train_data = pd.read_csv("/content/train.csv")
#test_data = pd.read_csv("/content/test.csv")
#sample_data = pd.read_csv("/content/sample_submission.csv")

In [6]:
idf = [len(x.split()) for x in train_data.excerpt]
print (max(idf), min(idf))

205 135


In [7]:
# k-fold
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:, 'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)

train_data['kfold'] = -1
kfold = StratifiedKFold(n_splits=config['nfolds'],
                        shuffle=True,
                        random_state=config['seed'])
for k, (train_idx, valid_idx) in enumerate(kfold.split(X=train_data, y=train_data.bins)):
    train_data.loc[valid_idx, 'kfold'] = k


In [8]:
class clrp(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
#config_model = AutoConfig.from_pretrained("../input/huggingface-deberta-variants/deberta-base/deberta-base")
#model = AutoModel.from_pretrained("../input/huggingface-deberta-variants/deberta-base/deberta-base", config=config_model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=52.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=474.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898825.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [10]:
class LitModel(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained("microsoft/deberta-base")
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.roberta = AutoModel.from_pretrained("microsoft/deberta-base", config=config)  
            
        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        self.regressor = nn.Sequential(                        
            nn.Linear(768, 1)                        
        )
        

    def forward(self, input_ids, attention_mask, token_type_ids):
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                     token_type_ids= token_type_ids)        

        # There are a total of 13 layers of hidden states.
        # 1 for the embedding layer, and 12 for the 12 Roberta layers.
        # We take the hidden states from the last Roberta layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # The number of cells is MAX_LEN.
        # The size of the hidden state of each cell is 768 (for roberta-base).
        # In order to condense hidden states of all cells to a context vector,
        # we compute a weighted average of the hidden states of all cells.
        # We compute the weight of each cell, using the attention neural network.
        weights = self.attention(last_layer_hidden_states)
                
        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
        # Now we compute context_vector as the weighted average.
        # context_vector.shape is BATCH_SIZE x 768
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        
        # Now we reduce the context vector to the prediction score.
        return self.regressor(context_vector)
model_lit = LitModel()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=558582766.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'config']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def weight(dim_in, dim_out, factorize_k = None):
    if factorize_k is None:
        return nn.Linear(dim_in, dim_out, bias = False)

    assert factorize_k < dim_in and factorize_k < dim_out, 'k must be of relative lower rank'

    return nn.Sequential(
        nn.Linear(dim_in, factorize_k, bias = False),
        nn.Linear(factorize_k, dim_out, bias = False)
    )

class Mogrifier(nn.Module):
    def __init__(self, dim, iters = 5, factorize_k = None):
        super().__init__()
        self.dim = dim
        self.iters = iters

        self.Q = weight(dim, dim, factorize_k)
        self.R = weight(dim, dim, factorize_k) if iters > 1 else None

    def forward(self, x, h):
        shape = x.shape
        *_, dim = shape
        assert dim == self.dim, f'mogrifier accepts a dimension of {self.dim}'

        x, h = map(lambda t: t.reshape(-1, dim), (x, h))

        for ind in range(self.iters):
            if (ind % 2) == 0:
                x = 2 * self.Q(h).sigmoid() * x
            else:
                h = 2 * self.R(x).sigmoid() * h

        x, h = map(lambda t: t.reshape(*shape), (x, h))
        return x, h
class CLRPMogLSTM(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained("microsoft/deberta-base")
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.transformers = AutoModel.from_pretrained("microsoft/deberta-base", config=config)  

        self.mog_lstm_1 = Mogrifier(
                          dim = 768,
                          iters = 5,          # number of iterations, defaults to 5 as paper recommended for LSTM
                          factorize_k = 16    # factorize weight matrices into (dim x k) and (k x dim), if specified
                      )
        self.dropout = nn.Dropout(0.2)
            
        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        self.regressor = nn.Sequential(                        
            nn.Linear(768, 1)                        
        )
        

    def forward(self, input_ids, attention_mask, token_type_ids):
        transformers_outputs = self.transformers(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                     token_type_ids= token_type_ids)        

        # There are a total of 13 layers of hidden states.
        # 1 for the embedding layer, and 12 for the 12 Roberta layers.
        # We take the hidden states from the last Roberta layer.
        last_layer_hidden_states = transformers_outputs.last_hidden_state

        #mog lstm
        h = torch.zeros(last_layer_hidden_states.shape[0], last_layer_hidden_states.shape[1], last_layer_hidden_states.shape[2])
        mog_1, z = self.mog_lstm_1(last_layer_hidden_states, h)
        mog_1 = self.dropout(mog_1)

        # The number of cells is MAX_LEN.
        # The size of the hidden state of each cell is 768 (for roberta-base).
        # In order to condense hidden states of all cells to a context vector,
        # we compute a weighted average of the hidden states of all cells.
        # We compute the weight of each cell, using the attention neural network.
        weights = self.attention(mog_1)
                
        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
        # Now we compute context_vector as the weighted average.
        # context_vector.shape is BATCH_SIZE x 768
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        
        # Now we reduce the context vector to the prediction score.
        return self.regressor(context_vector)
model_lit = CLRPMogLSTM()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=558582766.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'config']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
#model_auto = AutoModelForSequenceClassification.from_pretrained("../input/huggingface-deberta-variants/deberta-base/deberta-base", num_labels=1)

In [12]:
#for param in model_lit.base_model.parameters():
    #param.requires_grad = False

In [13]:
#for param in model.roberta.parameters():
    #param.requires_grad = False

In [11]:
class clrp(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [12]:
p_fold = 0
p_train = train_data.query(f'kfold != {p_fold}').reset_index(drop=True)
p_valid = train_data.query(f'kfold == {p_fold}').reset_index(drop=True)
tokenizer_train = tokenizer.batch_encode_plus(p_train.excerpt.to_list(),
                                          add_special_tokens=True,
                                          max_length=205,
                                          pad_to_max_length=True,
                                          truncation=True,
                                          return_attention_mask=True)
tokenizer_val = tokenizer.batch_encode_plus(p_valid.excerpt.to_list(),
                                          add_special_tokens=True,
                                          max_length=205,
                                          pad_to_max_length=True,
                                          truncation=True,
                                          return_attention_mask=True)

In [13]:
train_dataset = clrp(tokenizer_train, p_train.target.to_list())
val_dataset = clrp(tokenizer_val, p_valid.target.to_list())

In [14]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,
    save_total_limit=1,
    do_train=True,
    do_eval=False,
    do_predict=True,
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=250,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    #learning_rate=5e-6,
    seed=99,
    lr_scheduler_type="cosine"
)

In [15]:
class RMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps
        
    def forward(self,yhat,y):
        loss = torch.sqrt(self.mse(yhat,y) + self.eps)
        return loss
rmse_loss = RMSELoss()

In [16]:
class RMSE_Trainner(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs
        #loss = torch.sqrt(nn.functional.mse_loss(logits,labels.unsqueeze(1)))
        loss = rmse_loss(logits, labels.unsqueeze(1))
        return (loss, outputs) if return_outputs else loss

In [17]:
"""def compute_metrics(eval_pred):
    logits, labels = eval_pred
    rmse = torch.sqrt(nn.functional.mse_loss(logits,labels))
    return rmse"""

'def compute_metrics(eval_pred):\n    logits, labels = eval_pred\n    rmse = torch.sqrt(nn.functional.mse_loss(logits,labels))\n    return rmse'

In [18]:
trainer = RMSE_Trainner(
    model=model_lit,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset      # evaluation dataset
    #compute_metrics=compute_metrics
)

In [19]:
gc.collect()
trainer.train()
del trainer

***** Running training *****
  Num examples = 2267
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 710


RuntimeError: ignored

In [None]:
1e-5 == 1e-05

True

In [None]:
class clrp_test(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        #item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)
tokenizer_test = tokenizer.batch_encode_plus(test_data.excerpt.to_list(),
                                          max_length=205,
                                          pad_to_max_length=True,
                                          truncation=True)
test_datasets = clrp_test(tokenizer_test)

In [None]:
def predict(model_path, data):
    model_lit_infer = LitModel()
    model_lit_infer.load_state_dict(torch.load(model_path)) 
    model_lit_infer.to(device)
    result = []
    trainer_inter_arg  = TrainingArguments(
    output_dir='./results/pred',          # output directory
    do_train=False,
    do_eval=False,
    do_predict=True
    )
    trainer_infer = Trainer(
        model = model_lit_infer,
        args=trainer_inter_arg
    )
    out_pred, _, __ = trainer_infer.predict(data)
    del trainer_infer
    return out_pred.squeeze(1)

In [None]:
len(val_dataset)

567

In [None]:
out_pred, _, __ = trainer.predict(val_dataset)

In [None]:
len(out_pred)

531

In [None]:
sample_data.target = (out_pred)
sample_data.to_csv('submission.csv',index=False)

ValueError: Length of values (6) does not match length of index (7)

In [None]:
sample_data