In [1]:
import gc
import numpy as np
import pandas as pd

import torch 
from torch.utils.data import TensorDataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

# Prefer the GPU, but fall back to the CPU so the notebook still runs where CUDA is unavailable.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
import pandas as pd
import numpy as np
import os
import gc
from tqdm import tqdm

from torch.utils.data import Dataset, DataLoader
import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast

from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

import os
import random
import pickle
from collections import OrderedDict, defaultdict

# Pairs Model

In [3]:
def get_model_preds(model_paths, input_ids, attention_masks):
    """Run every checkpoint in ``model_paths`` over the tokenized inputs.

    Args:
        model_paths: Sequence of checkpoint locations accepted by
            ``AutoModelForSequenceClassification.from_pretrained``. A path may
            be listed more than once (e.g. to weight it in an ensemble); each
            occurrence produces its own entry in the result, but the checkpoint
            is loaded and evaluated only once.
        input_ids: Tensor of token ids, indexed by sample along dim 0.
        attention_masks: Tensor of attention masks with the same first dim.

    Returns:
        ``np.array`` built from one list of flattened logits per entry of
        ``model_paths``, in the same order as ``model_paths``.
    """
    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, batch_size=128, pin_memory=True)

    # Predictions keyed by checkpoint path, so duplicated paths are not re-run.
    preds_by_path = {}
    preds_list = []

    for model_path in model_paths:

        if model_path not in preds_by_path:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
                num_labels=1,
                output_attentions=False,
                output_hidden_states=False,
            ).to(device)
            model.eval()

            preds = []
            with torch.no_grad():
                for batch in tqdm(dataloader):
                    # Single expression: no GPU tensor stays bound to a local,
                    # so empty_cache() below can actually release the memory.
                    preds.extend(
                        model(batch[0].to(device), batch[1].to(device))
                        .logits.detach().cpu().numpy().ravel().tolist()
                    )

            # Free the model before the next checkpoint is loaded.
            del model
            torch.cuda.empty_cache()
            gc.collect()

            preds_by_path[model_path] = preds

        preds_list.append(preds_by_path[model_path])

    return np.array(preds_list)


def auto_tokenize(data, tokenizer_dir, num_tokens):
    """Tokenize ``data`` to fixed-length PyTorch tensors.

    Args:
        data: Object exposing ``to_list()`` (the notebook passes a pandas
            Series of excerpts).
        tokenizer_dir: Location handed to ``AutoTokenizer.from_pretrained``.
        num_tokens: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)`` taken from the tokenizer output.
    """
    texts = data.to_list()

    # Pad and truncate every text to exactly `num_tokens`.
    encoding = AutoTokenizer.from_pretrained(tokenizer_dir)(
        texts,
        padding='max_length',
        truncation=True,
        max_length=num_tokens,
        return_tensors='pt',
    )

    return encoding['input_ids'], encoding['attention_mask']


In [4]:
# --- Dataset roots ----------------------------------------------------------
BASE_PATH = '../input'
BASE2_PATH = f'{BASE_PATH}/d/eduardopeynetti'
DATA_PATH = f'{BASE_PATH}/commonlitreadabilityprize'
KFOLD1_PATH = f'{BASE_PATH}/baseline-kfold1'
KFOLD2_PATH = f'{BASE_PATH}/baseline-kfold2'
KFOLD_RESIDUAL_PATH = f'{BASE_PATH}/roberta-kfold-residual'
KFOLD_RESIDUAL_300_PATH = f'{BASE_PATH}/residual-300-tokens'
ELECTRA_RESIDUAL_PATH = f'{BASE_PATH}/electra-kfold-residual'
DEBERTA_KFOLD1_PATH = f'{BASE_PATH}/deberta-kfold1'
DEBERTA_KFOLD2_PATH = f'{BASE2_PATH}/deberta-kfold2'
DEBERTA_RESIDUAL_300_PATH = f'{BASE_PATH}/deberta-kfold-residual'

HUGGING_PATH = f'{BASE_PATH}/commonlit-huggingface'

# --- Competition CSV files --------------------------------------------------
TRAIN_PATH = f'{DATA_PATH}/train.csv'
TEST_PATH = f'{DATA_PATH}/test.csv'
SAMPLE_PATH = f'{DATA_PATH}/sample_submission.csv'

# --- Tokenizer directories --------------------------------------------------
DEBERTA_PATH = f'{HUGGING_PATH}/deberta-tokenizer'
ELECTRA_PATH = f'{HUGGING_PATH}/electra_tokenizer'
ROBERTA_PATH = f'{HUGGING_PATH}/roberta-tokenizer'

# --- Fold 1 checkpoints (trailing numbers are the author's recorded scores) --
MODEL0_KFOLD1_PATH = f'{KFOLD1_PATH}/baseline_kfold1_fold0'    #0.476
MODEL2_KFOLD1_PATH = f'{KFOLD1_PATH}/baseline_kfold1_fold2'    #0.475
MODEL3_KFOLD1_PATH = f'{KFOLD1_PATH}/baseline_kfold1_fold3'    #0.469
MODEL4_KFOLD1_PATH = f'{KFOLD1_PATH}/baseline_kfold1_fold4'    #0.473

MODEL_EXTRA_KFOLD1_PATH = f'{KFOLD1_PATH}/pairs_large_fold3'   #0.474

# --- Fold 2 checkpoints -----------------------------------------------------
MODEL2_KFOLD2_PATH = f'{KFOLD2_PATH}/baseline_kfold2_fold2'    #0.472
MODEL4_KFOLD2_PATH = f'{KFOLD2_PATH}/baseline_kfold2_fold4'    #0.472

# --- Residual checkpoints ---------------------------------------------------
RESIDUAL0_PATH = f'{KFOLD_RESIDUAL_PATH}/model1_fold0'         #0.472
RESIDUAL4_300_PATH = f'{KFOLD_RESIDUAL_300_PATH}/fold4model4'  #0.472

RESIDUAL_REINIT_PATH = f'{BASE_PATH}/pairs-reinit/kfold1_fold3'

# --- Electra checkpoints ----------------------------------------------------
ELECTRA0_PATH = f'{ELECTRA_RESIDUAL_PATH}/fold0model3'
ELECTRA3_PATH = f'{ELECTRA_RESIDUAL_PATH}/fold3model3'
ELECTRA4_PATH = f'{ELECTRA_RESIDUAL_PATH}/electra_mlm_fold4'

# --- Deberta checkpoints ----------------------------------------------------
DEBERTA0_KFOLD1_PATH = f'{DEBERTA_KFOLD1_PATH}/deberta_kfold1_fold0'
DEBERTA3_KFOLD1_PATH = f'{DEBERTA_KFOLD1_PATH}/deberta_kfold1_fold3'
DEBERTA2_KFOLD2_PATH = f'{DEBERTA_KFOLD2_PATH}/deberta_kfold2_fold2'

DEBERTA1_RESIDUAL_PATH = f'{DEBERTA_RESIDUAL_300_PATH}/deberta_fold1_model2'


# --- Ensemble membership ----------------------------------------------------
# Some checkpoints are listed twice; each entry contributes one row to the
# averaged ensemble, so a repeated entry counts double.
# NOTE(review): presumably deliberate weighting -- confirm with the author.
BASELINE_MODEL_PATHS = [
    MODEL0_KFOLD1_PATH,
    MODEL2_KFOLD1_PATH,
    MODEL3_KFOLD1_PATH,
    MODEL3_KFOLD1_PATH,
    MODEL4_KFOLD1_PATH,
    MODEL_EXTRA_KFOLD1_PATH,
    MODEL2_KFOLD2_PATH,
    MODEL4_KFOLD2_PATH,
]

BEST_RESIDUAL_PATHS = [RESIDUAL0_PATH] * 2
BEST_RESIDUAL_300_PATHS = [RESIDUAL4_300_PATH] * 2

DEBERTA_OLD_PATHS = [DEBERTA0_KFOLD1_PATH, DEBERTA3_KFOLD1_PATH, DEBERTA2_KFOLD2_PATH]
DEBERTA_RESIDUAL_300_PATHS = [DEBERTA1_RESIDUAL_PATH]

ELECTRA_RESIDUAL_300_PATHS = [ELECTRA0_PATH, ELECTRA3_PATH, ELECTRA4_PATH]


In [5]:
# Load the test excerpts and the submission template they will be written into.
test = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMPLE_PATH)
test_x = test['excerpt']


In [6]:
# Tokenize the test excerpts once per (tokenizer, sequence length) combination.
roberta_input_ids, roberta_attention_masks = auto_tokenize(test_x, ROBERTA_PATH, 256)
deberta_input_ids, deberta_attention_masks = auto_tokenize(test_x, DEBERTA_PATH, 256)
roberta_300_input_ids, roberta_300_attention_masks = auto_tokenize(test_x, ROBERTA_PATH, 300)
electra_300_input_ids, electra_300_attention_masks = auto_tokenize(test_x, ELECTRA_PATH, 300)
deberta_300_input_ids, deberta_300_attention_masks = auto_tokenize(test_x, DEBERTA_PATH, 300)

# One prediction row per checkpoint, grouped by model family.
roberta_baseline_preds = get_model_preds(
    BASELINE_MODEL_PATHS, roberta_input_ids, roberta_attention_masks)

roberta_residual_preds = get_model_preds(
    BEST_RESIDUAL_PATHS, roberta_input_ids, roberta_attention_masks)

roberta_residual_300_preds = get_model_preds(
    BEST_RESIDUAL_300_PATHS, roberta_300_input_ids, roberta_300_attention_masks)

deberta_old_preds = get_model_preds(
    DEBERTA_OLD_PATHS, deberta_input_ids, deberta_attention_masks)

electra_300_preds = get_model_preds(
    ELECTRA_RESIDUAL_300_PATHS, electra_300_input_ids, electra_300_attention_masks)

# The three Electra checkpoints are first averaged together; that average then
# enters the ensemble as two rows.
electra_preds = [electra_300_preds.mean(axis=0)] * 2

# Stack all rows and take the unweighted mean over rows for each sample.
sample['target'] = np.concatenate(
    [
        roberta_baseline_preds,
        roberta_residual_preds,
        roberta_residual_300_preds,
        deberta_old_preds,
        electra_preds,
    ]
).mean(axis=0)

sample.to_csv("submission.csv", index=False)
pairs_pred_df = sample
pairs_pred_df

100%|██████████| 1/1 [00:01<00:00,  1.16s/it]
100%|██████████| 1/1 [00:00<00:00,  5.35it/s]
100%|██████████| 1/1 [00:00<00:00,  5.46it/s]
100%|██████████| 1/1 [00:00<00:00,  5.70it/s]
100%|██████████| 1/1 [00:00<00:00,  5.71it/s]
100%|██████████| 1/1 [00:00<00:00,  5.34it/s]
100%|██████████| 1/1 [00:00<00:00,  5.70it/s]
100%|██████████| 1/1 [00:00<00:00,  5.24it/s]
100%|██████████| 1/1 [00:00<00:00,  5.44it/s]
100%|██████████| 1/1 [00:00<00:00,  5.70it/s]
100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
100%|██████████| 1/1 [00:00<00:00,  3.29it/s]
100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:00<00:00,  3.79it/s]


Unnamed: 0,id,target
0,c0f722661,-0.291971
1,f0953f0a5,-0.526356
2,0df072751,-0.597451
3,04caf4e0c,-2.294861
4,0e63f8bea,-1.865334
5,12537fe78,-1.184783
6,965e592c0,0.254294


# Regression approach

In [7]:
# Release cached CUDA memory and run the garbage collector before the next stage.
torch.cuda.empty_cache()
gc.collect()

314

In [8]:
# Column names of the competition CSV files.
ID_COL = 'id'
TARGET_COL = 'target'
TEXT_COL = 'excerpt'
# Prefer the GPU, but fall back to the CPU so the notebook still runs where CUDA is unavailable.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

preds = 0
num_folds = 5
random_state = 1234


#Create models dir in folder

class BERTDataset(Dataset):
    """Dataset that tokenizes one text per item for the regression models.

    Behaviour is driven by the module-level ``args`` object (``max_len``,
    ``lower``, ``custom_head``, ``dynamic_padding``), which must be defined
    before the dataset is constructed or indexed.

    Args:
        review: Indexable collection of texts.
        model_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        target: Indexable collection of targets; only read when ``is_test``
            is false.
        is_test: When true, items carry no ``'targets'`` entry.
    """

    def __init__(self, review, model_name, target=None, is_test=False):
        self.review = review
        self.target = target
        self.is_test = is_test
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        self.max_len = args.max_len
        self.model_name = model_name

    def __len__(self):
        return len(self.review)

    def __getitem__(self, idx):
        """Return a dict of ``ids``, ``mask``, ``token_type_ids`` (and ``targets``)."""
        review = str(self.review[idx])
        if args.lower:
            review = review.lower()
        # Whitespace is collapsed for every model except the custom-head
        # RoBERTa variant, which receives the text unchanged.
        if not (args.custom_head and 'roberta' in self.model_name):
            review = ' '.join(review.split())

        # The encoding stays local: the previous `global inputs` leaked it into
        # module scope, which is shared mutable state.
        if args.dynamic_padding:
            # No truncation or padding here; padding is left to the collate function.
            encoded = self.tokenizer.encode_plus(
                text=review,
                truncation=False,
                add_special_tokens=True,
                padding=False,
                return_attention_mask=True,
                return_token_type_ids=True
            )
        else:
            # Fixed-length encoding: truncate / pad to `max_len`.
            encoded = self.tokenizer.encode_plus(
                text=review,
                truncation=True,
                add_special_tokens=True,
                max_length=self.max_len,
                padding='max_length',
                return_attention_mask=True,
                return_token_type_ids=True
            )

        item = {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
        }
        if not self.is_test:
            item['targets'] = torch.tensor(self.target[idx], dtype=torch.float)
        return item

class BERTModel(nn.Module):
    """Transformer backbone plus a regression head, configured by the global ``args``.

    The module-level ``args`` object selects the backbone type, the pooling
    strategy and the head; it must be defined before construction and before
    every forward pass. Attribute names (``roberta``, ``bert``, ``regressor``,
    ``attention``, ``fc``, ``whole_head`` ...) are the state-dict keys of the
    saved checkpoints and therefore must not be renamed.

    Args:
        model_name: Name or path passed to the ``transformers`` loaders.
    """

    def __init__(self, model_name):
        super(BERTModel, self).__init__()
        self.config = transformers.AutoConfig.from_pretrained(model_name)
        self.model_name = model_name
        self.rd_feature_len = 0

        if args.custom_head:
            if 'roberta' in model_name:
                # NOTE(review): this updated config is not passed to
                # from_pretrained below, so it does not affect the backbone.
                self.config.update({"output_hidden_states": True,
                                    "hidden_dropout_prob": 0.0,
                                    "layer_norm_eps": 1e-7})
                self.roberta = transformers.AutoModel.from_pretrained(model_name, output_hidden_states=True)
                self.regressor = nn.Sequential(
                    nn.Linear(768, 1)
                )
            else:
                self.bert = transformers.AutoModel.from_pretrained(model_name, output_hidden_states=True)
                self.regressor = nn.Sequential(
                    OrderedDict([
                        ('dropout0', nn.Dropout(args.use_dropout)),
                        ('fc', nn.Linear(args.fc_size, 1))
                    ])
                )

            # Scores every token; softmax over the sequence dimension (dim=1).
            self.attention = nn.Sequential(
                nn.Linear(args.fc_size, 512),
                nn.Tanh(),
                nn.Linear(512, 1),
                nn.Softmax(dim=1)
            )
        else:
            if args.automodel_seq:
                self.bert = transformers.AutoModelForSequenceClassification.from_pretrained(
                    model_name, num_labels=1, output_hidden_states=False, output_attentions=False)
            else:
                self.bert = transformers.AutoModel.from_pretrained(model_name, output_hidden_states=True)

        # NOTE(review): layer_norm only exists for 'distil' models, yet forward()
        # uses it whenever args.use_rd_features is set -- confirm this is intended.
        if 'distil' in model_name:
            self.layer_norm = nn.LayerNorm(args.hidden_size)

        if args.use_dropout:
            if args.multisample_dropout:
                self.dropouts = nn.ModuleList([
                    nn.Dropout(args.use_dropout) for _ in range(5)
                ])
            else:
                self.dropouts = nn.ModuleList([nn.Dropout(args.use_dropout)])

        # Head selection; must mirror the dispatch at the end of forward().
        if args.use_single_fc:
            self.fc = nn.Linear(args.fc_size + self.rd_feature_len, 1)
        elif args.custom_head:
            print('Using custom head')
        elif args.automodel_seq:
            pass
        else:
            self.whole_head = nn.Sequential(OrderedDict([
                ('dropout0', nn.Dropout(args.use_dropout)),
                ('l1', nn.Linear(args.fc_size + self.rd_feature_len, 256)),
                ('act1', nn.GELU()),
                ('dropout1', nn.Dropout(args.use_dropout)),
                ('l2', nn.Linear(256, 1))
            ]))

    def _init_weights(self, module):
        """Initialise ``module`` using ``self.config.initializer_range``.

        Not called anywhere in this class.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask, rd_features=None, token_type_ids=None):
        """Encode the batch, pool according to ``args`` and apply the head.

        Args:
            ids: Token-id tensor passed to the backbone.
            mask: Attention mask passed to the backbone (also used for masked
                mean pooling).
            rd_features: Extra features concatenated to the pooled vector when
                ``args.use_rd_features`` is set.
            token_type_ids: Optional; omitted from the backbone call when None.

        Returns:
            Head output with up to two trailing size-1 dimensions squeezed away.
        """
        # Select the backbone once. Previously the `token_type_ids is None`
        # path always used `self.bert`, which does not exist for the
        # custom-head RoBERTa variant and raised AttributeError.
        if args.custom_head and 'roberta' in self.model_name:
            backbone = self.roberta
        else:
            backbone = self.bert

        if token_type_ids is not None:
            output = backbone(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=True)
        else:
            output = backbone(ids, attention_mask=mask, return_dict=True)

        # ---- Pooling ---------------------------------------------------------
        if args.use_hidden:
            if args.use_hidden == 'last':
                # Last hidden states
                if args.custom_head and 'bart' in self.model_name:
                    output = output['decoder_hidden_states'][-1]
                else:
                    output = output['hidden_states'][-1]
                # The custom head pools with attention later, so keep the sequence.
                if not args.custom_head:
                    output = output.mean(1)
                if args.use_rd_features:
                    output = torch.cat((output, rd_features), 1)
                    output = self.layer_norm(output)

            elif args.use_hidden == 'mean_max':
                output = output['last_hidden_state']
                average_pool = torch.mean(output, 1)
                max_pool, _ = torch.max(output, 1)
                output = torch.cat((average_pool, max_pool), 1)
                if args.use_rd_features:
                    output = torch.cat((output, rd_features), 1)
                    output = self.layer_norm(output)

            elif args.use_hidden == 'mean':
                # Masked mean over the concatenation of the last four layers.
                hs = output['hidden_states']
                seq_output = torch.cat([hs[-1], hs[-2], hs[-3], hs[-4]], dim=-1)
                input_mask_expanded = mask.unsqueeze(-1).expand(seq_output.size()).float()
                sum_embeddings = torch.sum(seq_output * input_mask_expanded, 1)
                sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
                output = sum_embeddings / sum_mask
                if args.use_rd_features:
                    output = torch.cat((output, rd_features), 1)
                    output = self.layer_norm(output)
        # Pooler
        elif args.use_pooler:
            output = output['pooler_output']
            if args.use_rd_features:
                output = torch.cat((output, rd_features), 1)
                output = self.layer_norm(output)
        # Masked mean of the last layer
        elif args.use_last_mean:
            output = output['last_hidden_state']
            input_mask_expanded = mask.unsqueeze(-1).expand(output.size()).float()
            sum_embeddings = torch.sum(output * input_mask_expanded, 1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            output = sum_embeddings / sum_mask
            if args.use_rd_features:
                output = torch.cat((output, rd_features), 1)
                output = self.layer_norm(output)
        elif args.automodel_seq:
            output = output['logits']
        # CLS
        else:
            # First token of the last layer
            output = output['last_hidden_state']
            output = output[:, 0, :]
            if args.use_rd_features:
                output = torch.cat((output, rd_features), 1)
                output = self.layer_norm(output)

        # ---- Head ------------------------------------------------------------
        if args.use_single_fc:
            output = self.fc(output)
        elif args.custom_head:
            # Attention-weighted sum over the sequence, then the regressor.
            weights = self.attention(output)
            output = torch.sum(weights * output, dim=1)
            output = self.regressor(output)
        elif args.automodel_seq:
            pass
        else:
            output = self.whole_head(output)
        output = output.squeeze(-1).squeeze(-1)
        return output
    
    
# %% [code] {"execution":{"iopub.status.busy":"2021-06-22T06:53:21.338286Z","iopub.execute_input":"2021-06-22T06:53:21.338705Z","iopub.status.idle":"2021-06-22T06:53:21.350545Z","shell.execute_reply.started":"2021-06-22T06:53:21.338669Z","shell.execute_reply":"2021-06-22T06:53:21.349744Z"}}

class CLMCollate:
    """Collate function that pads (and truncates) a batch to a common length.

    With ``args.dynamic_padding`` the target length is the longest ``ids``
    sequence in the batch, otherwise it is ``args.max_len``. ``args`` is the
    module-level configuration object.

    Args:
        pad_token_id: Value used to pad ``ids``. Defaults to 1, the value that
            was previously hard-coded.
    """

    def __init__(self, pad_token_id=1):
        self.seq_dic = defaultdict(int)  ## used to track max_length
        self.batch_record = defaultdict(list)  # per-batch list of raw sequence lengths
        self.bn = 0  # running batch counter, used as key in the two dicts above
        self.pad_token_id = pad_token_id

    @staticmethod
    def _pad(sequences, length, fill):
        """Return a (len(sequences), length) long tensor; rows are right-padded with ``fill`` or cut."""
        padded = torch.full((len(sequences), length), fill, dtype=torch.long)
        for row, seq in enumerate(sequences):
            values = torch.as_tensor(seq).detach()
            keep = min(len(values), length)
            padded[row, :keep] = values[:keep]
        return padded

    def __call__(self, batch):
        """Merge a list of item dicts into one dict of batched tensors."""
        out = {'ids': [],
               'mask': [],
               'token_type_ids': [],
               'targets': [],
               'errors': [],
               'rd_features': [],
               'bins': []
               }

        for item in batch:
            for key, value in item.items():
                out[key].append(value)

        if args.dynamic_padding:
            max_pad = max((len(seq) for seq in out['ids']), default=0)
        else:
            max_pad = args.max_len

        self.batch_record[str(self.bn)] = [len(x) for x in out['ids']]
        self.seq_dic[str(self.bn)] = max_pad
        self.bn += 1

        # The previous implementation sliced the tuple handed to np.hstack
        # instead of the stacked result, so sequences longer than `max_pad`
        # were never truncated and produced ragged batches.
        out['ids'] = self._pad(out['ids'], max_pad, self.pad_token_id)
        out['mask'] = self._pad(out['mask'], max_pad, 0)
        out['token_type_ids'] = self._pad(out['token_type_ids'], max_pad, 0)

        # Keys missing from the items stay empty and become empty tensors.
        out['targets'] = torch.tensor(out['targets'], dtype=torch.float)
        out['errors'] = torch.tensor(out['errors'], dtype=torch.float)
        out['rd_features'] = torch.tensor(out['rd_features'], dtype=torch.float)

        return out
    
    
def get_bert_predictions(test_data, model_name, model_path):
    """Predict a target for every row of ``test_data`` with one saved checkpoint.

    Relies on the module-level ``args``, ``Config``, ``DEVICE`` and ``TEXT_COL``.

    Args:
        test_data: Mapping / frame whose ``TEXT_COL`` entry exposes ``.values``
            with the texts to score.
        model_name: Backbone name or path used to build ``BERTModel`` and the
            tokenizer inside ``BERTDataset``.
        model_path: File containing the state dict to load (strictly) into the model.

    Returns:
        List with one prediction per text, in input order.
    """
    print('Getting BERT Embeddings')

    BertModel = BERTModel(model_name=model_name)
    BertModel.to(DEVICE)
    # map_location keeps loading working when the checkpoint was saved from a
    # different device than the one in use now.
    BertModel.load_state_dict(torch.load(model_path, map_location=DEVICE), strict=True)

    test_set = BERTDataset(
        review=test_data[TEXT_COL].values,
        target=None,
        model_name=model_name,
        is_test=True
    )

    # With dynamic padding the batch is padded by CLMCollate; otherwise the
    # DataLoader's default collation is used (collate_fn=None).
    collate_fn = CLMCollate() if args.dynamic_padding else None
    test_data_loader = DataLoader(
        test_set,
        batch_size=Config.VALID_BS,
        collate_fn=collate_fn,
        shuffle=False,
        num_workers=8
    )

    prog_bar = tqdm(enumerate(test_data_loader), total=len(test_data_loader))
    BertModel.eval()
    all_predictions = []
    with torch.no_grad():
        for idx, inputs in prog_bar:
            ids = inputs['ids'].to(DEVICE, dtype=torch.long)
            mask = inputs['mask'].to(DEVICE, dtype=torch.long)
            ttis = inputs['token_type_ids'].to(DEVICE, dtype=torch.long)
            # These model families are called without token_type_ids.
            if 'distil' in model_name or 'bart' in model_name:
                ttis = None
            outputs = BertModel(ids=ids, mask=mask, token_type_ids=ttis)
            # A batch of one sample is squeezed to a 0-d tensor by the model;
            # flatten so extend() always receives a 1-d array.
            all_predictions.extend(outputs.detach().cpu().numpy().reshape(-1))

    return all_predictions

# Test excerpts that the regression checkpoints below are run on.
df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
# Collects one prediction column per checkpoint ('fold0', 'fold1', ...).
pred_df = pd.DataFrame()

class Config:
    # Run-level settings. In the visible part of this notebook only VALID_BS is
    # read (as the DataLoader batch size in get_bert_predictions); the other
    # fields are not referenced here.
    seed = 1234
    NB_EPOCHS = 10
    LR = 4e-5
    N_SPLITS = 5
    TRAIN_BS = 32
    VALID_BS = 64  # inference batch size
    DBERT_MODELS = ['distilbert', 'xlnet', 't5']
    FILE_NAME = '../input/train.csv'
    # Instantiated once, when the class body is executed.
    scaler = GradScaler()

# %% [markdown]



## LB 0.464-Roberta-large


In [9]:
class BERTModelConfig():
    """Settings read through the global ``args`` by BERTDataset / BERTModel.

    With every pooling flag off and ``custom_head`` / ``automodel_seq`` false,
    BERTModel feeds the first token of ``last_hidden_state`` to its
    ``whole_head`` MLP.
    """
    lower = False                # lowercase the text before tokenizing
    use_dropout = 0.1            # dropout probability used in the heads
    pretrained_model = False
    use_hidden = False           # False, or one of 'last' / 'mean_max' / 'mean'
    hidden_size = 1024           # LayerNorm width (only built for 'distil' models)
    max_len = 250                # tokenizer max_length when padding is not dynamic
    fc_size = 1024               # input width of the regression head
    use_single_fc = False
    use_pooler = False
    use_last_mean = False
    multisample_dropout = False
    use_rd_features = False
    use_hidden_4 = False
    custom_head = False
    dynamic_padding = False
    automodel_seq = False

args = BERTModelConfig()

# One prediction column per roberta-large fold checkpoint: 'fold0' .. 'fold4'.
for fold in range(5):
    pred_df[f'fold{fold}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/roberta-large',
        model_path=f'../input/roberta-large-0708/roberta_l_0708/bert_model_fold{fold}.bin',
    )
    # Release GPU memory before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.99it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.99it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.89it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.78it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  2.01it/s]


1297

## LB 0.465 - Funnel-transformer-large

In [10]:
class BERTModelConfig():
    """Settings read through the global ``args`` by BERTDataset / BERTModel.

    With every pooling flag off and ``custom_head`` / ``automodel_seq`` false,
    BERTModel feeds the first token of ``last_hidden_state`` to its
    ``whole_head`` MLP.
    """
    lower = False                # lowercase the text before tokenizing
    use_dropout = 0.1            # dropout probability used in the heads
    pretrained_model = False
    use_hidden = False           # False, or one of 'last' / 'mean_max' / 'mean'
    hidden_size = 1024           # LayerNorm width (only built for 'distil' models)
    max_len = 250                # tokenizer max_length when padding is not dynamic
    fc_size = 1024               # input width of the regression head
    use_single_fc = False
    use_pooler = False
    use_last_mean = False
    multisample_dropout = False
    use_rd_features = False
    custom_head = False
    dynamic_padding = False
    automodel_seq = False

args = BERTModelConfig()

# Funnel-large fold checkpoints 0..4 fill the columns 'fold5' .. 'fold9'.
for fold in range(5):
    pred_df[f'fold{5 + fold}'] = get_bert_predictions(
        df,
        model_name='../input/funneltransformerlarge/funnel-transformer-large',
        model_path=f'../input/funnel-large-0703/funnel_l_0702/bert_model_fold{fold}.bin',
    )
    # Release GPU memory before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.61it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.78it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.92it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.69it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.83it/s]


1569

## LB 0.466 - Deberta-large


In [11]:
class BERTModelConfig():
    """Settings read through the global ``args`` by BERTDataset / BERTModel.

    With every pooling flag off and ``custom_head`` / ``automodel_seq`` false,
    BERTModel feeds the first token of ``last_hidden_state`` to its
    ``whole_head`` MLP.
    """
    lower = False                # lowercase the text before tokenizing
    use_dropout = 0.1            # dropout probability used in the heads
    pretrained_model = False
    use_hidden = False           # False, or one of 'last' / 'mean_max' / 'mean'
    hidden_size = 1024           # LayerNorm width (only built for 'distil' models)
    max_len = 250                # tokenizer max_length when padding is not dynamic
    fc_size = 1024               # input width of the regression head
    use_single_fc = False
    use_pooler = False
    use_last_mean = False
    multisample_dropout = False
    use_rd_features = False
    custom_head = False
    dynamic_padding = False
    automodel_seq = False

args = BERTModelConfig()

# Deberta-large fold checkpoints 0..4 fill the columns 'fold10' .. 'fold14'.
for fold in range(5):
    pred_df[f'fold{10 + fold}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/deberta-large',
        model_path=f'../input/deberta-large-0627/deberta_l_0627/bert_model_fold{fold}.bin',
    )
    # Release GPU memory before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.43it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.48it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.33it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.47it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


6532

## LB 0.467 - Roberta-base


In [12]:
class BERTModelConfig():
    """Settings read through the global ``args`` by BERTDataset / BERTModel.

    ``custom_head=True`` with ``use_hidden='last'`` makes BERTModel take the
    last entry of ``hidden_states``, pool it with its attention block and apply
    the linear regressor. For a model name containing 'roberta', BERTDataset
    also leaves the text's whitespace untouched.
    """
    lower = False                # lowercase the text before tokenizing
    use_dropout = 0.1            # dropout probability used in the heads
    pretrained_model = False
    use_hidden = 'last'          # pool from the last hidden-state layer
    hidden_size = 1024           # LayerNorm width (only built for 'distil' models)
    max_len = 248                # tokenizer max_length when padding is not dynamic
    fc_size = 768                # input width of the attention block
    use_single_fc = False
    use_pooler = False
    use_last_mean = False
    multisample_dropout = False
    use_rd_features = False
    custom_head = True
    dynamic_padding = False
    automodel_seq = False

args = BERTModelConfig()

# Roberta-base checkpoints model_1 .. model_5 fill the columns 'fold15' .. 'fold19'.
for offset, model_no in enumerate(range(1, 6)):
    pred_df[f'fold{15 + offset}'] = get_bert_predictions(
        df,
        model_name='../input/roberta-base',
        model_path=f'../input/robertapublb0467/model_{model_no}.pth',
    )
    # Release GPU memory before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()


Getting BERT Embeddings
Using custom head


100%|██████████| 1/1 [00:00<00:00,  2.54it/s]


Getting BERT Embeddings
Using custom head


100%|██████████| 1/1 [00:00<00:00,  2.24it/s]


Getting BERT Embeddings
Using custom head


100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


Getting BERT Embeddings
Using custom head


100%|██████████| 1/1 [00:00<00:00,  2.58it/s]


Getting BERT Embeddings
Using custom head


100%|██████████| 1/1 [00:00<00:00,  2.50it/s]


500

## LB 0.468 - electra-large-discriminator


In [13]:

class BERTModelConfig:
    """Settings for the electra-large-discriminator section of the notebook.

    Every option is a plain class attribute. An instance is bound to the
    module-level name ``args`` right after this definition; how each option is
    consumed is not visible in this cell (presumably by the model-loading code
    behind ``get_bert_predictions`` -- TODO confirm).
    """

    # Numeric settings.
    hidden_size = 1024
    fc_size = 1024
    max_len = 250
    use_dropout = 0.1  # a float despite the `use_` prefix

    # Boolean switches; all of them are off in this configuration.
    lower = False
    pretrained_model = False
    use_hidden = False
    use_single_fc = False
    use_pooler = False
    use_last_mean = False
    multisample_dropout = False
    use_rd_features = False
    custom_head = False
    dynamic_padding = False
    automodel_seq = False

# Rebind the module-level `args` to the config defined above.
# NOTE(review): get_bert_predictions presumably reads `args` as a global --
# confirm against its definition.
args = BERTModelConfig()

# Score the five electra-large-discriminator checkpoints
# (bert_model_fold0.bin .. bert_model_fold4.bin) and store each prediction
# vector in pred_df columns fold20 .. fold24.
ELECTRA_LARGE_DIR = '../input/torch-bert-large-models/electra-large-discriminator'
ELECTRA_CKPT_DIR = '../input/electra-large-0630/electra_large_0630'

for fold_no, ckpt_no in zip(range(20, 25), range(5)):
    pred_df[f'fold{fold_no}'] = get_bert_predictions(
        df,
        model_name=ELECTRA_LARGE_DIR,
        model_path=f'{ELECTRA_CKPT_DIR}/bert_model_fold{ckpt_no}.bin',
    )
    # Free cached GPU memory and run the garbage collector before the next
    # checkpoint is loaded. Doing this inside the loop also keeps the integer
    # returned by gc.collect() out of the cell output.
    torch.cuda.empty_cache()
    gc.collect()

Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.83it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  2.00it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.78it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.74it/s]


Getting BERT Embeddings


100%|██████████| 1/1 [00:00<00:00,  1.89it/s]


1297

## Weighted Ensemble

# Weighted blend of two model groups: the row-wise mean of each group's fold
# columns is combined with weights 0.6 (group 1) and 0.4 (group 2).
group1_cols = [f'fold{i}' for i in range(15)]      # fold0 .. fold14
group2_cols = [f'fold{i}' for i in range(15, 20)]  # fold15 .. fold19
# NOTE(review): fold20 .. fold24 belong to neither group, so this blend does
# not use them -- confirm that is intended.

regression_pred_group1_df = pd.DataFrame().assign(
    target=pred_df[group1_cols].mean(axis=1).values.tolist()
)
regression_pred_group2_df = pd.DataFrame().assign(
    target=pred_df[group2_cols].mean(axis=1).values.tolist()
)

regression_pred_df = pd.DataFrame().assign(
    target=regression_pred_group1_df['target'] * 0.6
    + regression_pred_group2_df['target'] * 0.4
)
regression_pred_df

## Simple mean

In [14]:
# Equal-weight ensemble: row-wise mean over every column of pred_df.
# This rebinds regression_pred_df, replacing any earlier value of that name.
regression_pred_df = pd.DataFrame().assign(
    target=pred_df.mean(axis=1).values.tolist()
)
print(regression_pred_df.shape, regression_pred_df, sep='\n')


(7, 1)
     target
0 -0.484693
1 -0.556209
2 -0.494541
3 -2.272735
4 -1.838250
5 -1.297987
6  0.220606


In [15]:
# Display the pairs-model predictions (built outside this section) so they can
# be compared with the regression mean before the two are blended.
pairs_pred_df 


Unnamed: 0,id,target
0,c0f722661,-0.291971
1,f0953f0a5,-0.526356
2,0df072751,-0.597451
3,04caf4e0c,-2.294861
4,0e63f8bea,-1.865334
5,12537fe78,-1.184783
6,965e592c0,0.254294


In [16]:
# Final submission: equal-weight blend of the pairs approach and the
# regression approach, written to submission.csv.
PAIRS_WEIGHT = 0.5
REGRESSION_WEIGHT = 0.5

sub_df = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# The arithmetic and the column assignment below align rows by index label,
# not by `id`. A row-count mismatch would silently drop rows or introduce NaN,
# so fail loudly instead.
expected_rows = len(sub_df)
if len(pairs_pred_df) != expected_rows or len(regression_pred_df) != expected_rows:
    raise ValueError(
        'Prediction row counts do not match sample_submission: '
        f'submission={expected_rows}, pairs={len(pairs_pred_df)}, '
        f'regression={len(regression_pred_df)}'
    )

# Mean of pairs and regression approach
sub_df['target'] = (pairs_pred_df['target'] * PAIRS_WEIGHT) + (regression_pred_df['target'] * REGRESSION_WEIGHT)

# NaN here means the frames' indexes did not line up (or a model produced NaN);
# do not write an unusable submission file.
if sub_df['target'].isna().any():
    raise ValueError('Blended target contains NaN; check index alignment of the prediction frames')

sub_df.to_csv('submission.csv', index=False)
print(sub_df.head())

          id    target
0  c0f722661 -0.388332
1  f0953f0a5 -0.541283
2  0df072751 -0.545996
3  04caf4e0c -2.283798
4  0e63f8bea -1.851792
