In [1]:
import gc
import numpy as np
import pandas as pd

import torch 
from torch.utils.data import TensorDataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

# Device used by get_model_preds for models and batches. Falls back to CPU so
# the notebook still runs (slowly) on a kernel without a GPU instead of failing
# on the first `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
import os
import gc

from torch.utils.data import Dataset, DataLoader
import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast

from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

import argparse
import os
import random
import pickle
from collections import OrderedDict, defaultdict

## Pairs Approach

In [3]:
def get_model_preds(model_paths, input_ids, attention_masks):
    """Run every checkpoint in ``model_paths`` over the tokenised inputs.

    Parameters
    ----------
    model_paths : sequence of str
        Directories loadable with ``AutoModelForSequenceClassification``.
        A path may appear more than once (the ensemble lists do this to give
        a checkpoint more weight); it is loaded and evaluated only once and
        its predictions are repeated in the output.
    input_ids, attention_masks : torch.Tensor
        Tensors with the same first dimension, as returned by ``auto_tokenize``.

    Returns
    -------
    np.ndarray
        One row of flattened logits per entry of ``model_paths``, in the same
        order as ``model_paths``.
    """
    dataset = TensorDataset(input_ids, attention_masks)
    # Pinned memory only helps host->GPU copies; skip it when no GPU is present.
    dataloader = DataLoader(dataset, batch_size=128, pin_memory=torch.cuda.is_available())

    preds_by_path = {}

    for model_path in model_paths:
        if model_path in preds_by_path:
            # Same checkpoint listed again: reuse its predictions instead of
            # loading and running the model a second time.
            continue

        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            num_labels=1,
            output_attentions=False,
            output_hidden_states=False,
        ).to(device)
        model.eval()

        preds = []
        with torch.no_grad():
            for batch_ids, batch_mask in tqdm(dataloader):
                output = model(input_ids=batch_ids.to(device),
                               attention_mask=batch_mask.to(device))
                preds.extend(output.logits.detach().cpu().numpy().ravel().tolist())

        # Drop the model first, collect garbage, then release the cached GPU
        # blocks so the next checkpoint starts from a clean allocator.
        del model
        gc.collect()
        torch.cuda.empty_cache()

        preds_by_path[model_path] = preds

    return np.array([preds_by_path[model_path] for model_path in model_paths])


def auto_tokenize(data, tokenizer_dir, num_tokens):
    """Tokenise a collection of texts to fixed-length PyTorch tensors.

    ``data`` must expose ``.to_list()`` (the notebook passes a pandas Series).
    The tokenizer stored in ``tokenizer_dir`` pads and truncates every text to
    exactly ``num_tokens`` tokens.

    Returns the pair ``(input_ids, attention_mask)`` taken from the encoding.
    """
    texts = data.to_list()
    encode = AutoTokenizer.from_pretrained(tokenizer_dir)

    encoding = encode(
        texts,
        padding='max_length',
        truncation=True,
        max_length=num_tokens,
        return_tensors='pt',
    )

    return encoding['input_ids'], encoding['attention_mask']


In [4]:
# ---------------------------------------------------------------------------
# Dataset roots. Every path below is built by string concatenation from these.
# ---------------------------------------------------------------------------
BASE_PATH = '../input'
BASE2_PATH = '../input/d/eduardopeynetti'
DATA_PATH = '../input/commonlitreadabilityprize'
KFOLD1_PATH = BASE_PATH + '/baseline-kfold1'
KFOLD2_PATH = BASE_PATH + '/baseline-kfold2'
KFOLD_RESIDUAL_PATH = BASE_PATH + '/roberta-kfold-residual'
KFOLD_RESIDUAL_300_PATH = BASE_PATH + '/residual-300-tokens'
ROBERTA_REDUCED_PATH = BASE_PATH + '/roberta-reduced'
ELECTRA_RESIDUAL_PATH = BASE_PATH + '/electra-kfold-residual'
DEBERTA_RESIDUAL_PATH = BASE_PATH + '/deberta-kfold-residual'
DEBERTA_KFOLD1_PATH = BASE_PATH + '/deberta-kfold1'
DEBERTA_KFOLD2_PATH = BASE_PATH + '/deberta-kfold2'
PAIRS_REINIT_PATH = BASE_PATH + '/pairs-reinit'

# Directory holding the saved tokenizers.
HUGGING_PATH = BASE2_PATH + '/commonlit-huggingface'

# Competition CSV files.
TRAIN_PATH = DATA_PATH + '/' + 'train.csv'
TEST_PATH = DATA_PATH + '/' + 'test.csv'
SAMPLE_PATH = DATA_PATH + '/' + 'sample_submission.csv'

# Tokenizer directories passed to auto_tokenize.
# Note the electra directory uses an underscore, the other two a hyphen.
DEBERTA_PATH = HUGGING_PATH + '/' +'deberta-tokenizer'
ELECTRA_PATH = HUGGING_PATH + '/' +'electra_tokenizer'
ROBERTA_PATH = HUGGING_PATH + '/' +'roberta-tokenizer'

# Fold 1
# The trailing numbers on checkpoint lines are presumably per-fold validation
# scores (lower looks better) -- TODO confirm against the training notebooks.

MODEL0_KFOLD1_PATH = KFOLD1_PATH + '/baseline_kfold1_fold0' #0.476
MODEL1_KFOLD1_PATH = KFOLD1_PATH + '/baseline_kfold1_fold1' #0.485
MODEL2_KFOLD1_PATH = KFOLD1_PATH + '/baseline_kfold1_fold2' #0.475
MODEL3_KFOLD1_PATH = KFOLD1_PATH + '/baseline_kfold1_fold3' #0.469
MODEL4_KFOLD1_PATH = KFOLD1_PATH + '/baseline_kfold1_fold4' #0.473

MODEL_EXTRA_KFOLD1_PATH = KFOLD1_PATH + '/pairs_large_fold3'     #0.474

# Not referenced by the inference cell visible below; kept as a catalogue.
ROBERTA_KFOLD1_PATHS = [MODEL0_KFOLD1_PATH, MODEL1_KFOLD1_PATH, MODEL2_KFOLD1_PATH,
                        MODEL3_KFOLD1_PATH, MODEL4_KFOLD1_PATH, MODEL_EXTRA_KFOLD1_PATH]

# Fold 2

MODEL0_KFOLD2_PATH = KFOLD2_PATH + '/baseline_kfold2_fold0'       #0.478
MODEL1_KFOLD2_PATH = KFOLD2_PATH + '/baseline_kfold2_fold1'       #0.486
MODEL2_KFOLD2_PATH = KFOLD2_PATH + '/baseline_kfold2_fold2'       #0.472
MODEL3_KFOLD2_PATH = KFOLD2_PATH + '/baseline_kfold2_fold3'       #0.486
MODEL4_KFOLD2_PATH = KFOLD2_PATH + '/baseline_kfold2_fold4'       #0.472

# Not referenced by the inference cell visible below; kept as a catalogue.
ROBERTA_KFOLD2_PATHS = [MODEL0_KFOLD2_PATH, MODEL1_KFOLD2_PATH, MODEL2_KFOLD2_PATH,
                        MODEL3_KFOLD2_PATH, MODEL4_KFOLD2_PATH]

# Residual
# Directory names are irregular ('model1_fold0' vs 'fold1model3'); they must
# match the uploaded dataset exactly, so do not normalise them.

RESIDUAL0_PATH = KFOLD_RESIDUAL_PATH + '/model1_fold0'   #0.472
RESIDUAL1_PATH = KFOLD_RESIDUAL_PATH + '/fold1model3'    #0.478
RESIDUAL2_PATH = KFOLD_RESIDUAL_PATH + '/model1_fold2'   #0.482
RESIDUAL3_PATH = KFOLD_RESIDUAL_PATH + '/model1_fold3'   #0.488
RESIDUAL4_PATH = KFOLD_RESIDUAL_PATH + '/model0_fold4'

ROBERTA_RESIDUAL_PATHS = [RESIDUAL0_PATH, RESIDUAL1_PATH, RESIDUAL2_PATH,
                          RESIDUAL3_PATH, RESIDUAL4_PATH]

# Residual checkpoints from the 'residual-300-tokens' dataset; the inference
# cell feeds them inputs tokenised to 300 tokens.
RESIDUAL0_300_PATH = KFOLD_RESIDUAL_300_PATH + '/fold0model3'
RESIDUAL1_300_PATH = KFOLD_RESIDUAL_300_PATH + '/fold1model1'
RESIDUAL2_300_PATH = KFOLD_RESIDUAL_300_PATH + '/fold2model4'
RESIDUAL3_300_PATH = KFOLD_RESIDUAL_300_PATH + '/fold3model1'
RESIDUAL4_300_PATH = KFOLD_RESIDUAL_300_PATH + '/fold4model4'  # 0.472

ROBERTA_RESIDUAL_300_PATHS = [RESIDUAL0_300_PATH, RESIDUAL1_300_PATH, RESIDUAL2_300_PATH,
                              RESIDUAL3_300_PATH, RESIDUAL4_300_PATH]

# Reduced

REDUCED0_PATH = ROBERTA_REDUCED_PATH + '/fold0model4'
REDUCED1_PATH = ROBERTA_REDUCED_PATH + '/fold1model1'
REDUCED2_PATH = ROBERTA_REDUCED_PATH + '/fold2model2'
REDUCED3_PATH = ROBERTA_REDUCED_PATH + '/fold3model2'
REDUCED4_PATH = ROBERTA_REDUCED_PATH + '/fold4model1'

ROBERTA_REDUCED_PATHS = [REDUCED0_PATH, REDUCED1_PATH, REDUCED2_PATH, REDUCED3_PATH, REDUCED4_PATH]


# Re-initialised checkpoints from the 'pairs-reinit' dataset.
REINIT0_PATH = PAIRS_REINIT_PATH + '/kfold1_fold2'
REINIT1_PATH = PAIRS_REINIT_PATH + '/kfold1_fold3'
REINIT2_PATH = PAIRS_REINIT_PATH + '/kfold2_fold2'
REINIT3_PATH = PAIRS_REINIT_PATH + '/kfold2_fold4'
REINIT4_PATH = PAIRS_REINIT_PATH + '/residual_fold0'
REINIT5_PATH = PAIRS_REINIT_PATH + '/residual_fold4'


# Electra

ELECTRA0_PATH = ELECTRA_RESIDUAL_PATH + '/fold0model3'
ELECTRA1_PATH = ELECTRA_RESIDUAL_PATH + '/fold1model1'             #0.490
ELECTRA2_PATH = ELECTRA_RESIDUAL_PATH + '/electra_mlm_fold2'
ELECTRA3_PATH = ELECTRA_RESIDUAL_PATH + '/fold3model3'
ELECTRA4_PATH = ELECTRA_RESIDUAL_PATH + '/electra_mlm_fold4'

# NOTE(review): this five-model list is dead -- the same name is reassigned to a
# three-model list in the "Ensemble" section below, and that later value is the
# one the inference cell sees.
ELECTRA_RESIDUAL_300_PATHS = [ELECTRA0_PATH, ELECTRA1_PATH, ELECTRA2_PATH, ELECTRA3_PATH, ELECTRA4_PATH]

# Deberta

DEBERTA0_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_fold0_model3'
DEBERTA1_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_fold1_model2'
DEBERTA2_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_fold2_model3'
DEBERTA3_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_fold3_model0'
DEBERTA4_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_fold4_model0'

DEBERTA0_MLM_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_mlm_fold0'
DEBERTA1_MLM_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_mlm_fold1'
DEBERTA2_MLM_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_mlm_fold2'   #0.500
DEBERTA3_MLM_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_mlm_fold3'   #0.488
DEBERTA4_MLM_PATH = DEBERTA_RESIDUAL_PATH + '/deberta_mlm_fold4'

DEBERTA0_KFOLD1_PATH = DEBERTA_KFOLD1_PATH + '/deberta_kfold1_fold0'
DEBERTA3_KFOLD1_PATH = DEBERTA_KFOLD1_PATH + '/deberta_kfold1_fold3'
DEBERTA2_KFOLD2_PATH = DEBERTA_KFOLD2_PATH + '/deberta_kfold2_fold2'


# Folds 2 and 3 are left out of this list (both the plain and the MLM variants).
DEBERTA_RESIDUAL_300_PATHS = [DEBERTA0_PATH, DEBERTA1_PATH, DEBERTA4_PATH,
                              DEBERTA0_MLM_PATH, DEBERTA1_MLM_PATH, DEBERTA4_MLM_PATH]

DEBERTA_OLD_PATHS = [DEBERTA0_KFOLD1_PATH, DEBERTA3_KFOLD1_PATH, DEBERTA2_KFOLD2_PATH]

# Ensemble
# These are the lists the inference cell actually passes to get_model_preds.
# Every list entry becomes one row of predictions that is averaged with equal
# weight, so listing a checkpoint twice doubles its weight.

# NOTE(review): MODEL3_KFOLD1_PATH appears twice -- presumably deliberate
# up-weighting of the best-scoring fold; confirm it is not a typo.
BASELINE_MODEL_PATHS = [MODEL0_KFOLD1_PATH, MODEL2_KFOLD1_PATH, MODEL3_KFOLD1_PATH, MODEL3_KFOLD1_PATH,
                        MODEL4_KFOLD1_PATH, MODEL_EXTRA_KFOLD1_PATH,
                        MODEL2_KFOLD2_PATH, MODEL4_KFOLD2_PATH]

# Single checkpoint listed twice (weight 2) in each of the next two lists.
BEST_RESIDUAL_PATHS = [ RESIDUAL0_PATH, RESIDUAL0_PATH]
BEST_RESIDUAL_300_PATHS = [RESIDUAL4_300_PATH, RESIDUAL4_300_PATH]

# Overrides the five-model list of the same name defined in the Electra section.
ELECTRA_RESIDUAL_300_PATHS = [ELECTRA0_PATH, ELECTRA3_PATH, ELECTRA4_PATH]

# Only referenced from a commented-out line in the inference cell.
# Previously considered members: REINIT1_PATH, REINIT2_PATH, REINIT3_PATH, REINIT4_PATH, REINIT5_PATH
ROBERTA_REINIT_PATHS = [REINIT0_PATH, REINIT1_PATH]

In [5]:
# Read the competition test set and the submission template; the submission's
# 'target' column is filled in by the inference cell.
test, sample = pd.read_csv(TEST_PATH), pd.read_csv(SAMPLE_PATH)

# Raw excerpts, in file order, as a pandas Series.
test_x = test['excerpt']


In [6]:
# --- Tokenise once per (tokenizer, sequence length) combination -------------
roberta_input_ids, roberta_attention_masks = auto_tokenize(test_x, ROBERTA_PATH, 256)
deberta_input_ids, deberta_attention_masks = auto_tokenize(test_x, DEBERTA_PATH, 256)
roberta_300_input_ids, roberta_300_attention_masks = auto_tokenize(test_x, ROBERTA_PATH, 300)
electra_300_input_ids, electra_300_attention_masks = auto_tokenize(test_x, ELECTRA_PATH, 300)
deberta_300_input_ids, deberta_300_attention_masks = auto_tokenize(test_x, DEBERTA_PATH, 300)

# --- Predict: each call returns one row of predictions per checkpoint -------
roberta_baseline_preds = get_model_preds(
    BASELINE_MODEL_PATHS, roberta_input_ids, roberta_attention_masks)

roberta_residual_preds = get_model_preds(
    BEST_RESIDUAL_PATHS, roberta_input_ids, roberta_attention_masks)

roberta_residual_300_preds = get_model_preds(
    BEST_RESIDUAL_300_PATHS, roberta_300_input_ids, roberta_300_attention_masks)

deberta_old_preds = get_model_preds(
    DEBERTA_OLD_PATHS, deberta_input_ids, deberta_attention_masks)

deberta_residual_300_preds = get_model_preds(
    DEBERTA_RESIDUAL_300_PATHS, deberta_300_input_ids, deberta_300_attention_masks)

electra_300_preds = get_model_preds(
    ELECTRA_RESIDUAL_300_PATHS, electra_300_input_ids, electra_300_attention_masks)

# --- Blend -------------------------------------------------------------------
# Electra checkpoints enter the blend individually (one row each).
electra_preds = electra_300_preds
# The six 300-token DeBERTa checkpoints are collapsed to their mean, which then
# enters the blend as two identical rows (i.e. with weight 2).
deberta_preds = [deberta_residual_300_preds.mean(axis=0) for _ in range(2)]

# Stack every row and take the plain average over rows as the final target.
sample['target'] = np.mean(
    np.concatenate([
        roberta_baseline_preds,
        roberta_residual_preds,
        roberta_residual_300_preds,
        deberta_old_preds,
        deberta_preds,
        electra_preds,
    ]),
    axis=0,
)

sample.to_csv("submission.csv",index=False)
# Same DataFrame object as `sample` (not a copy).
pairs_pred_df = sample




  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

## Regression Approach

In [7]:
# Hand PyTorch's cached GPU blocks back to the driver and run a garbage
# collection pass before the regression-approach models are loaded.
# gc.collect() is the cell's last expression, so the number of collected
# objects is what the notebook displays as this cell's output.
torch.cuda.empty_cache()
gc.collect()

1116

In [8]:
# Column names of the competition CSV files.
ID_COL  = 'id'
TARGET_COL = 'target'
TEXT_COL = 'excerpt'

# Device for the regression-approach models. Falls back to CPU so the cells
# below can still be executed (slowly) on a kernel without a GPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): the three names below are not read by any cell visible in this
# part of the notebook; presumably used further down -- verify before removing.
preds = 0
num_folds = 5
random_state = 1234

In [9]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenises one text per item.

    Behaviour is controlled by the module-level ``args`` object:

    * ``args.max_len`` -- fixed sequence length (read once, in ``__init__``).
    * ``args.lower`` -- lower-case the text before tokenising.
    * ``args.custom_head`` -- when true and the model name contains
      ``'roberta'``, whitespace is left untouched; otherwise runs of
      whitespace are collapsed to single spaces.
    * ``args.dynamic_padding`` -- when true, items are returned unpadded and
      untruncated (padding is then the collate function's job); otherwise
      every item is padded/truncated to ``max_len``.

    Parameters
    ----------
    review : indexable collection of texts (each item is passed through ``str``).
    model_name : str
        Name or path given to ``transformers.AutoTokenizer.from_pretrained``.
    target : indexable collection of targets, required unless ``is_test``.
    is_test : bool
        When true, items carry no ``'targets'`` entry.
    """

    def __init__(self, review, model_name, target=None, is_test=False):
        self.review = review
        self.target = target
        self.is_test = is_test
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        self.max_len = args.max_len
        self.model_name = model_name

    def __len__(self):
        return len(self.review)

    def __getitem__(self, idx):
        """Return a dict with ``ids``, ``mask``, ``token_type_ids`` (long
        tensors) and, unless ``is_test``, ``targets`` (float tensor)."""
        review = str(self.review[idx])
        if args.lower:
            review = review.lower()
        # The RoBERTa custom-head models receive the raw text; everything else
        # gets whitespace (including newlines) collapsed to single spaces.
        if not (args.custom_head and 'roberta' in self.model_name):
            review = ' '.join(review.split())

        # The encoding is kept in a local variable. The previous version
        # declared it `global`, overwriting a module-level `inputs` on every
        # item fetch.
        if args.dynamic_padding:
            encoded = self.tokenizer.encode_plus(
                text=review,
                truncation=False,
                add_special_tokens=True,
                padding=False,
                return_attention_mask=True,
                return_token_type_ids=True
            )
        else:
            encoded = self.tokenizer.encode_plus(
                text=review,
                truncation=True,
                add_special_tokens=True,
                max_length=self.max_len,
                padding='max_length',
                return_attention_mask=True,
                return_token_type_ids=True
            )

        item = {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
        }
        if not self.is_test:
            item['targets'] = torch.tensor(self.target[idx], dtype=torch.float)
        return item

class BERTModel(nn.Module):
    """Transformer backbone plus a regression head, both chosen by the global ``args``.

    Pooling (first match wins, see ``forward``): ``args.use_hidden``
    (``'last'``, ``'mean_max'``, ``'mean'``, ``'conv'``), ``args.use_pooler``,
    ``args.use_last_mean``, ``args.automodel_seq``, otherwise the first
    (CLS) token of the last hidden state.

    Head (first match wins): ``args.use_single_fc``, ``args.custom_head``
    (attention pooling over tokens followed by ``regressor``),
    ``args.automodel_seq`` (backbone logits used as is),
    ``args.use_conv_head``, otherwise the two-layer ``whole_head``.

    Submodule attribute names (``bert``, ``roberta``, ``regressor``,
    ``attention``, ``whole_head``, ...) are the keys of the saved checkpoints,
    which are loaded with ``strict=True`` -- do not rename them.
    """

    def __init__(self, model_name):
        super().__init__()
        self.config = transformers.AutoConfig.from_pretrained(model_name)
        self.model_name = model_name
        self.rd_feature_len = 0

        if args.custom_head:
            if 'roberta' in model_name:
                # NOTE(review): the updated config is stored on self.config but is
                # not passed to from_pretrained below, so dropout / layer_norm_eps
                # of the loaded backbone keep their checkpoint defaults. Left
                # unchanged because altering it would change predictions; confirm
                # against how the weights were trained.
                self.config.update({"output_hidden_states": True,
                                    "hidden_dropout_prob": 0.0,
                                    "layer_norm_eps": 1e-7})
                self.roberta = transformers.AutoModel.from_pretrained(model_name, output_hidden_states=True)
                # Hidden size is hard-coded to 768 for this head.
                self.regressor = nn.Sequential(
                    nn.Linear(768, 1)
                )
            else:
                self.bert = transformers.AutoModel.from_pretrained(model_name, output_hidden_states=True)
                self.regressor = nn.Sequential(
                    OrderedDict([
                        ('dropout0', nn.Dropout(args.use_dropout)),
                        ('fc', nn.Linear(args.fc_size, 1))
                    ])
                )

            # Scores every token, softmax over the sequence dimension.
            self.attention = nn.Sequential(
                nn.Linear(args.fc_size, 512),
                nn.Tanh(),
                nn.Linear(512, 1),
                nn.Softmax(dim=1)
            )
        else:
            if args.automodel_seq:
                self.bert = transformers.AutoModelForSequenceClassification.from_pretrained(
                    model_name, num_labels=1, output_hidden_states=False, output_attentions=False)
            else:
                self.bert = transformers.AutoModel.from_pretrained(model_name, output_hidden_states=True)

        if 'distil' in model_name:
            self.layer_norm = nn.LayerNorm(args.hidden_size)

        if args.use_last_mean:
            if args.use_norm:
                self.layer_norm = nn.LayerNorm(args.fc_size)

        if args.use_dropout:
            if args.multisample_dropout:
                self.dropouts = nn.ModuleList([
                    nn.Dropout(args.use_dropout) for _ in range(5)
                ])
            else:
                self.dropouts = nn.ModuleList([nn.Dropout(args.use_dropout)])

        if args.use_single_fc:
            self.fc = nn.Linear(args.fc_size + self.rd_feature_len, 1)
        elif args.custom_head:
            print('Using custom head')
        elif args.automodel_seq:
            pass
        elif args.use_conv_head:
            self.conv = nn.Sequential(OrderedDict([
                ('conv', nn.Conv1d(args.fc_size, 256, kernel_size=3)),
                ('act1', nn.GELU()),
                ('dropout1', nn.Dropout(args.use_dropout)),
            ]))
            self.fc = nn.Linear(256, 1)
        else:
            self.whole_head = nn.Sequential(OrderedDict([
                ('dropout0', nn.Dropout(args.use_dropout)),
                ('l1', nn.Linear(args.fc_size + self.rd_feature_len, 256)),
                ('act1', nn.GELU()),
                ('dropout1', nn.Dropout(args.use_dropout)),
                ('l2', nn.Linear(256, 1))
            ]))

    def _init_weights(self, module):
        """Initialise ``module`` from ``self.config.initializer_range``.

        Not called anywhere in this class; available for ``self.apply``.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _masked_mean(seq_output, mask):
        """Average ``seq_output`` over dim 1, counting only positions where ``mask`` is 1."""
        input_mask_expanded = mask.unsqueeze(-1).expand(seq_output.size()).float()
        sum_embeddings = torch.sum(seq_output * input_mask_expanded, 1)
        # Clamp so an all-zero mask does not divide by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def _with_rd_features(self, output, rd_features):
        """Append ``rd_features`` and layer-normalise when ``args.use_rd_features`` is set."""
        if args.use_rd_features:
            output = torch.cat((output, rd_features), 1)
            output = self.layer_norm(output)
        return output

    def forward(self, ids, mask, rd_features=None, token_type_ids=None):
        """Return one prediction per input row.

        ``ids`` / ``mask`` / ``token_type_ids`` are passed to the backbone;
        ``token_type_ids`` is omitted from the call when ``None``.
        ``rd_features`` is only read when ``args.use_rd_features`` is set.
        The result has its trailing singleton dimensions squeezed, so a batch
        of a single row yields a 0-d tensor.
        """
        # The RoBERTa custom-head model stores its backbone as `self.roberta`.
        # Previously `self.bert` was used whenever token_type_ids was None,
        # which raised AttributeError for that model.
        if args.custom_head and 'roberta' in self.model_name:
            backbone = self.roberta
        else:
            backbone = self.bert

        # Backbone output keys used below: 'last_hidden_state', 'pooler_output',
        # 'hidden_states' (or 'logits' for the sequence-classification model).
        if token_type_ids is not None:
            output = backbone(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=True)
        else:
            output = backbone(ids, attention_mask=mask, return_dict=True)

        # ---- pooling ----
        if args.use_hidden:
            if args.use_hidden == 'last':
                if args.custom_head and 'bart' in self.model_name:
                    output = output['decoder_hidden_states'][-1]
                else:
                    output = output['hidden_states'][-1]
                # The custom head does its own attention pooling over tokens.
                if not args.custom_head:
                    output = output.mean(1)
                output = self._with_rd_features(output, rd_features)

            elif args.use_hidden == 'mean_max':
                output = output['last_hidden_state']
                average_pool = torch.mean(output, 1)
                max_pool, _ = torch.max(output, 1)
                output = torch.cat((average_pool, max_pool), 1)
                output = self._with_rd_features(output, rd_features)

            elif args.use_hidden == 'mean':
                # Masked mean over the concatenation of the last four layers.
                hs = output['hidden_states']
                seq_output = torch.cat([hs[-1], hs[-2], hs[-3], hs[-4]], dim=-1)
                output = self._masked_mean(seq_output, mask)
                output = self._with_rd_features(output, rd_features)

            elif args.use_hidden == 'conv':
                # Masked mean over the last three layers, reshaped for Conv1d.
                # The hidden size 1024 is hard-coded here.
                hs = output['hidden_states']
                seq_output = torch.cat([hs[-1], hs[-2], hs[-3]], dim=-1)
                output = self._masked_mean(seq_output, mask)
                output = output.reshape(-1, 3, 1024)
                output = output.permute(0, 2, 1)
        elif args.use_pooler:
            output = output['pooler_output']
            output = self._with_rd_features(output, rd_features)
        elif args.use_last_mean:
            output = self._masked_mean(output['last_hidden_state'], mask)
            if args.use_norm:
                output = self.layer_norm(output)
            output = self._with_rd_features(output, rd_features)
        elif args.automodel_seq:
            output = output['logits']
        else:
            # First (CLS) token of the last layer.
            output = output['last_hidden_state'][:, 0, :]
            output = self._with_rd_features(output, rd_features)

        # ---- head ----
        if args.use_single_fc:
            output = self.fc(output)
        elif args.custom_head:
            weights = self.attention(output)
            output = torch.sum(weights * output, dim=1)
            output = self.regressor(output)
        elif args.automodel_seq:
            pass
        elif args.use_conv_head:
            output = self.conv(output)
            output = output.squeeze()
            output = self.fc(output)
        else:
            output = self.whole_head(output)

        output = output.squeeze(-1).squeeze(-1)
        return output

class CLMCollate:
    """Collate function that pads a batch of ``BERTDataset``-style items.

    With ``args.dynamic_padding`` the batch is padded to its longest
    sequence; otherwise to ``args.max_len``. Sequences longer than the
    target length are truncated to it.

    Parameters
    ----------
    pad_token_id : int, default 1
        Value used to pad ``ids``. ``mask`` and ``token_type_ids`` are always
        padded with 0.

    Attributes
    ----------
    seq_dic : dict str -> int
        Padded length used for each batch, keyed by the batch counter.
    batch_record : dict str -> list of int
        Un-padded lengths of the sequences in each batch.
    bn : int
        Number of batches collated so far.
    """

    def __init__(self, pad_token_id=1):
        self.seq_dic = defaultdict(int)  ## used to track max_length
        self.batch_record = defaultdict(list)
        self.bn = 0
        self.pad_token_id = pad_token_id

    def __call__(self, batch):
        """Merge a list of per-item dicts into one dict of batched tensors.

        Keys absent from the items ('targets', 'errors', 'rd_features') come
        back as empty tensors; 'bins' is returned as a plain list.
        """
        out = {'ids': [],
               'mask': [],
               'token_type_ids': [],
               'targets': [],
               'errors': [],
               'rd_features': [],
               'bins': []
               }

        for item in batch:
            for key, value in item.items():
                out[key].append(value)

        if args.dynamic_padding:
            max_pad = max((len(seq) for seq in out['ids']), default=0)
        else:
            max_pad = args.max_len

        # Book-keeping of raw lengths and the padded length per batch.
        self.batch_record[str(self.bn)] = [len(x) for x in out['ids']]
        self.seq_dic[str(self.bn)] = max_pad
        self.bn += 1

        for key, fill in (('ids', self.pad_token_id), ('mask', 0), ('token_type_ids', 0)):
            out[key] = self._pad_and_stack(out[key], max_pad, fill)

        out['targets'] = torch.tensor(out['targets'], dtype=torch.float)
        out['errors'] = torch.tensor(out['errors'], dtype=torch.float)
        out['rd_features'] = torch.tensor(out['rd_features'], dtype=torch.float)

        return out

    @staticmethod
    def _pad_and_stack(seqs, max_pad, fill):
        """Right-pad (or truncate) each 1-D sequence to ``max_pad`` and stack to a long tensor."""
        if not seqs:
            return torch.tensor([], dtype=torch.long)
        rows = []
        for seq in seqs:
            # Truncate the sequence itself. The previous code sliced the tuple
            # handed to np.hstack, so over-long sequences were never cut and
            # produced a ragged batch.
            seq = torch.as_tensor(seq, dtype=torch.long).detach()[:max_pad]
            row = torch.full((max_pad,), fill, dtype=torch.long)
            row[:seq.numel()] = seq
            rows.append(row)
        return torch.stack(rows)

def get_bert_predictions(test_data, model_name, model_path):
    """Predict with one fine-tuned checkpoint and return the predictions.

    Builds a ``BERTModel`` for ``model_name`` (its architecture is taken from
    the global ``args``), loads the weights stored at ``model_path`` with
    ``strict=True`` and runs it in eval mode over ``test_data[TEXT_COL]``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain the ``TEXT_COL`` column.
    model_name : str
        Backbone / tokenizer name or directory.
    model_path : str
        File containing the fine-tuned ``state_dict``.

    Returns
    -------
    list
        One prediction per row of ``test_data``, in row order.
    """
    print('Getting BERT Embeddings')

    BertModel = BERTModel(model_name=model_name)
    BertModel.to(DEVICE)
    # Load the weights onto the CPU first; load_state_dict copies them into the
    # model's parameters on DEVICE. This avoids holding a second full copy of
    # the weights on the GPU and works regardless of the device they were
    # saved from.
    BertModel.load_state_dict(torch.load(model_path, map_location='cpu'), strict=True)

    test_set = BERTDataset(
        review=test_data[TEXT_COL].values,
        target=None,
        model_name=model_name,
        is_test=True
    )

    loader_kwargs = dict(batch_size=Config.VALID_BS, shuffle=False, num_workers=8)
    if args.dynamic_padding:
        # Items come back unpadded, so the batch has to be padded on collation.
        loader_kwargs['collate_fn'] = CLMCollate()
    test_data_loader = DataLoader(test_set, **loader_kwargs)

    # These model families are called without token_type_ids.
    skip_token_types = 'distil' in model_name or 'bart' in model_name

    prog_bar = tqdm(enumerate(test_data_loader), total=len(test_data_loader))
    BertModel.eval()
    all_predictions = []
    with torch.no_grad():
        for idx, inputs in prog_bar:
            ids = inputs['ids'].to(DEVICE, dtype=torch.long)
            mask = inputs['mask'].to(DEVICE, dtype=torch.long)
            ttis = None
            if not skip_token_types:
                ttis = inputs['token_type_ids'].to(DEVICE, dtype=torch.long)
            outputs = BertModel(ids=ids, mask=mask, token_type_ids=ttis)
            # The model squeezes its output, so a batch holding a single row
            # comes back 0-d; flatten so extend() always receives a 1-D array
            # (it raised TypeError on a 0-d array before).
            all_predictions.extend(outputs.detach().cpu().numpy().reshape(-1))

    return all_predictions

In [10]:
# GET_CV=True scores the training set instead of the test set;
# DEBUG=True additionally samples 100 rows of it.
GET_CV = False
DEBUG = False

if not GET_CV:
    df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
else:
    df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
    # The training frame is stacked on itself, doubling the row count.
    df = pd.concat([df] * 2)
    print(df.shape)
    if DEBUG:
        df = df.sample(100)


# One column of predictions per (model, fold) is added to pred_df by the cells
# below; fold_start is the running column counter they share.
pred_df = pd.DataFrame()
svr_df = pd.DataFrame()
fold_start = 0


class Config:
    """Training-time hyper-parameters; get_bert_predictions reads VALID_BS from here."""
    seed = 1234
    NB_EPOCHS = 10
    LR = 4e-5
    N_SPLITS = 5
    TRAIN_BS = 32
    VALID_BS = 64  # batch size of the prediction DataLoader
    DBERT_MODELS = ['distilbert', 'xlnet', 't5']
    FILE_NAME = '../input/train.csv'
    scaler = GradScaler()  # created once, when the class body is executed

## LB 0.464 Roberta large

In [11]:
class BERTModelConfig:
    """Settings for the RoBERTa-large checkpoints, exposed to the model code as ``args``.

    With these values BERTModel pools the first token of the last hidden
    state and applies its default two-layer head.
    """
    # -- text handling / tokenisation --
    lower = False
    max_len = 250
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    hidden_size = 1024

    # -- pooling --
    use_hidden = False
    use_hidden_4 = False
    use_pooler = False
    use_last_mean = False
    use_norm = False
    use_rd_features = False

    # -- head --
    custom_head = False
    use_single_fc = False
    use_conv_head = False
    fc_size = 1024
    use_dropout = 0.1
    multisample_dropout = False

# Activate the configuration: BERTDataset and BERTModel read the global `args`.
args = BERTModelConfig()

# One prediction column per fold checkpoint. `fold_start` is shared with the
# other model cells, so re-running this cell alone appends further columns.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/roberta-large',
        model_path=f'../input/roberta-large-0708/roberta_l_0708/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    torch.cuda.empty_cache()
    gc.collect()

print(fold_start)

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

5


## LB 0.465 Funnel large

In [12]:
class BERTModelConfig:
    """Settings for the Funnel-Transformer-large checkpoints, exposed to the model code as ``args``.

    With these values BERTModel pools the first token of the last hidden
    state and applies its default two-layer head.
    """
    # -- text handling / tokenisation --
    lower = False
    max_len = 250
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    hidden_size = 1024

    # -- pooling --
    use_hidden = False
    use_pooler = False
    use_last_mean = False
    use_norm = False
    use_rd_features = False

    # -- head --
    custom_head = False
    use_single_fc = False
    use_conv_head = False
    fc_size = 1024
    use_dropout = 0.1
    multisample_dropout = False

# Activate the configuration: BERTDataset and BERTModel read the global `args`.
args = BERTModelConfig()

# One prediction column per fold checkpoint, continuing the shared
# `fold_start` column counter.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/funneltransformerlarge/funnel-transformer-large',
        model_path=f'../input/funnel-large-0703/funnel_l_0702/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    torch.cuda.empty_cache()
    gc.collect()

print(fold_start)

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

10


## LB 0.466 Deberta large

In [13]:
class BERTModelConfig:
    """Settings for the DeBERTa-large checkpoints, exposed to the model code as ``args``.

    With these values BERTModel pools the first token of the last hidden
    state and applies its default two-layer head.
    """
    # -- text handling / tokenisation --
    lower = False
    max_len = 250
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    hidden_size = 1024

    # -- pooling --
    use_hidden = False
    use_pooler = False
    use_last_mean = False
    use_norm = False
    use_rd_features = False

    # -- head --
    custom_head = False
    use_single_fc = False
    use_conv_head = False
    fc_size = 1024
    use_dropout = 0.1
    multisample_dropout = False

# Activate the configuration: BERTDataset and BERTModel read the global `args`.
args = BERTModelConfig()

# One prediction column per fold checkpoint, continuing the shared
# `fold_start` column counter.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/deberta-large',
        model_path=f'../input/deberta-large-0627/deberta_l_0627/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    torch.cuda.empty_cache()
    gc.collect()


print(fold_start)

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

15


## LB 0.467 Roberta-base


In [14]:
class BERTModelConfig:
    """Settings for the RoBERTa-base checkpoints (section "LB 0.467 Roberta-base").

    Rebinding this class (and ``args`` below) replaces the settings of the
    previous model. ``get_bert_predictions`` is not handed the config
    explicitly, so it presumably reads the module-level ``args`` -- confirm
    against its definition earlier in the notebook.
    """
    # NOTE(review): the section labels below are inferred from the attribute
    # names only; the code that consumes these flags is outside this cell.

    # -- input text --
    lower = False
    max_len = 248
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    # NOTE(review): 1024 although roberta-base is a 768-wide encoder --
    # presumably ignored when `custom_head` is on; confirm.
    hidden_size = 1024

    # -- pooling of encoder outputs --
    use_hidden = 'last'
    use_pooler = False
    use_last_mean = False

    # -- regression head --
    fc_size = 768
    use_single_fc = False
    custom_head = True  # the recorded output prints "Using custom head" for each fold
    use_conv_head = False
    use_norm = False
    use_rd_features = False

    # -- dropout --
    use_dropout = 0.1
    multisample_dropout = False


args = BERTModelConfig()

# One prediction column per fold checkpoint; `fold_start` is the running
# column index shared by every inference cell in the notebook.
# These checkpoints are numbered from 1 (model_1.pth .. model_5.pth), hence `fold+1`.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/robertabase/roberta-base',
        model_path=f'../input/robertapublb0467/model_{fold+1}.pth',
    )
    fold_start += 1
    # Hand cached GPU memory back before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

# Recorded output is 20 when every earlier inference cell has run in order.
print(fold_start)

Getting BERT Embeddings
Using custom head


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings
Using custom head


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings
Using custom head


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings
Using custom head


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings
Using custom head


  0%|          | 0/1 [00:00<?, ?it/s]

20


## LB 0.468 Electra large

In [15]:
class BERTModelConfig:
    """Settings for the ELECTRA-large checkpoints (section "LB 0.468 Electra large").

    Rebinding this class (and ``args`` below) replaces the settings of the
    previous model. ``get_bert_predictions`` is not handed the config
    explicitly, so it presumably reads the module-level ``args`` -- confirm
    against its definition earlier in the notebook.
    """
    # NOTE(review): the section labels below are inferred from the attribute
    # names only; the code that consumes these flags is outside this cell.

    # -- input text --
    lower = False
    max_len = 250
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    hidden_size = 1024

    # -- pooling of encoder outputs --
    use_hidden = False
    use_pooler = False
    use_last_mean = False

    # -- regression head --
    fc_size = 1024
    use_single_fc = False
    custom_head = False
    use_conv_head = False
    use_norm = False
    use_rd_features = False

    # -- dropout --
    use_dropout = 0.1
    multisample_dropout = False


args = BERTModelConfig()

# One prediction column per fold checkpoint; `fold_start` is the running
# column index shared by every inference cell in the notebook.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/electra-large-discriminator',
        model_path=f'../input/electra-large-0630/electra_large_0630/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    # Hand cached GPU memory back before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

# Recorded output is 25 when every earlier inference cell has run in order.
print(fold_start)

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

25


## LB 0.467 Roberta large 300 tokens new train


In [16]:
class BERTModelConfig:
    """Settings for the RoBERTa-large checkpoints of section
    "LB 0.467 Roberta large 300 tokens new train".

    Rebinding this class (and ``args`` below) replaces the settings of the
    previous model. ``get_bert_predictions`` is not handed the config
    explicitly, so it presumably reads the module-level ``args`` -- confirm
    against its definition earlier in the notebook.
    """
    # NOTE(review): the section labels below are inferred from the attribute
    # names only; the code that consumes these flags is outside this cell.

    # -- input text --
    lower = False
    max_len = 300
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    hidden_size = 1024

    # -- pooling of encoder outputs --
    use_hidden = False
    use_hidden_4 = False
    use_pooler = False
    use_last_mean = True

    # -- regression head --
    fc_size = 1024
    use_single_fc = True
    custom_head = False
    use_conv_head = False
    use_norm = True
    use_rd_features = False

    # -- dropout --
    use_dropout = 0.0
    multisample_dropout = False


args = BERTModelConfig()

# One prediction column per fold checkpoint; `fold_start` is the running
# column index shared by every inference cell in the notebook.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/roberta-large',
        model_path=f'../input/roberta-large-mean-0723/roberta_l_meanpool_0723/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    # Hand cached GPU memory back before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

# Recorded output is 30 when every earlier inference cell has run in order.
print(fold_start)

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

30


## LB 0.468 roberta ed train 300 token. automodel


In [17]:
class BERTModelConfig:
    """Settings for the RoBERTa-large checkpoints of section
    "LB 0.468 roberta ed train 300 token. automodel".

    Rebinding this class (and ``args`` below) replaces the settings of the
    previous model. ``get_bert_predictions`` is not handed the config
    explicitly, so it presumably reads the module-level ``args`` -- confirm
    against its definition earlier in the notebook.
    """
    # NOTE(review): the section labels below are inferred from the attribute
    # names only; the code that consumes these flags is outside this cell.

    # -- input text --
    lower = False
    max_len = 300
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    # With this flag on, the recorded output shows a
    # RobertaForSequenceClassification being initialised for every fold.
    automodel_seq = True
    hidden_size = 1024

    # -- pooling of encoder outputs --
    use_hidden = False
    use_hidden_4 = False
    use_pooler = False
    use_last_mean = False

    # -- regression head --
    fc_size = 1024
    use_single_fc = False
    custom_head = False
    use_conv_head = False
    use_norm = False
    use_rd_features = False

    # -- dropout --
    use_dropout = 0.0
    multisample_dropout = False


args = BERTModelConfig()

# One prediction column per fold checkpoint; `fold_start` is the running
# column index shared by every inference cell in the notebook.
# NOTE(review): the "newly initialized classifier" warnings in the output come
# from building the model from the base roberta-large directory; presumably the
# fine-tuned weights in `model_path` are loaded on top afterwards -- confirm in
# get_bert_predictions.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/roberta-large',
        model_path=f'../input/roberta-large-0728/roberta_l_0728/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    # Hand cached GPU memory back before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

# Recorded output is 35 when every earlier inference cell has run in order.
print(fold_start)


Getting BERT Embeddings


Some weights of the model checkpoint at ../input/torch-bert-large-models/roberta-large were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../input/torch-bert-large-models/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably

  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


Some weights of the model checkpoint at ../input/torch-bert-large-models/roberta-large were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../input/torch-bert-large-models/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably

  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


Some weights of the model checkpoint at ../input/torch-bert-large-models/roberta-large were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../input/torch-bert-large-models/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably

  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


Some weights of the model checkpoint at ../input/torch-bert-large-models/roberta-large were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../input/torch-bert-large-models/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably

  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


Some weights of the model checkpoint at ../input/torch-bert-large-models/roberta-large were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ../input/torch-bert-large-models/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably

  0%|          | 0/1 [00:00<?, ?it/s]

35


## LB 0.467 Roberta large meanpooling


In [18]:
class BERTModelConfig:
    """Settings for the RoBERTa-large checkpoints of section
    "LB 0.467 Roberta large meanpooling".

    Rebinding this class (and ``args`` below) replaces the settings of the
    previous model. ``get_bert_predictions`` is not handed the config
    explicitly, so it presumably reads the module-level ``args`` -- confirm
    against its definition earlier in the notebook.
    """
    # NOTE(review): the section labels below are inferred from the attribute
    # names only; the code that consumes these flags is outside this cell.

    # -- input text --
    lower = False
    max_len = 256
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    hidden_size = 1024

    # -- pooling of encoder outputs --
    use_hidden = False
    use_hidden_4 = False
    use_pooler = False
    use_last_mean = True

    # -- regression head --
    fc_size = 1024
    use_single_fc = True
    custom_head = False
    use_conv_head = False
    use_norm = False
    use_rd_features = False

    # -- dropout --
    use_dropout = 0.0
    multisample_dropout = False


args = BERTModelConfig()

# One prediction column per fold checkpoint; `fold_start` is the running
# column index shared by every inference cell in the notebook.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/roberta-large',
        model_path=f'../input/roberta-large-0721/roberta_l_meanpool_0721/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    # Hand cached GPU memory back before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

# Recorded output is 40 when every earlier inference cell has run in order.
print(fold_start)

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

40


## LB 0.476 BART large squad

In [19]:
class BERTModelConfig:
    """Settings for the BART-large (SQuAD fine-tuned) checkpoints of section
    "LB 0.476 BART large squad".

    Rebinding this class (and ``args`` below) replaces the settings of the
    previous model. ``get_bert_predictions`` is not handed the config
    explicitly, so it presumably reads the module-level ``args`` -- confirm
    against its definition earlier in the notebook.
    """
    # NOTE(review): the section labels below are inferred from the attribute
    # names only; the code that consumes these flags is outside this cell.

    # -- input text --
    lower = False
    max_len = 250
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    hidden_size = 1024

    # -- pooling of encoder outputs --
    use_hidden = False
    use_hidden_4 = False
    use_pooler = False
    use_last_mean = False

    # -- regression head --
    fc_size = 1024
    use_single_fc = False
    custom_head = False
    use_conv_head = False
    use_norm = False
    use_rd_features = False

    # -- dropout --
    use_dropout = 0.0
    multisample_dropout = False


args = BERTModelConfig()

# One prediction column per fold checkpoint; `fold_start` is the running
# column index shared by every inference cell in the notebook.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/bert-sentence-xlmr/bart-large-finetuned-squadv1',
        model_path=f'../input/bart-large-0710/bart_l_0701/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    # Hand cached GPU memory back before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

# Recorded output is 45 when every earlier inference cell has run in order.
print(fold_start)

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

45


## LB 0.468 roberta large all_text


In [20]:
class BERTModelConfig:
    """Settings for the RoBERTa-large checkpoints of section
    "LB 0.468 roberta large all_text".

    Rebinding this class (and ``args`` below) replaces the settings of the
    previous model. ``get_bert_predictions`` is not handed the config
    explicitly, so it presumably reads the module-level ``args`` -- confirm
    against its definition earlier in the notebook.
    """
    # NOTE(review): the section labels below are inferred from the attribute
    # names only; the code that consumes these flags is outside this cell.

    # -- input text --
    lower = False
    max_len = 250
    dynamic_padding = False

    # -- backbone --
    pretrained_model = False
    automodel_seq = False
    hidden_size = 1024

    # -- pooling of encoder outputs --
    use_hidden = False
    use_hidden_4 = False
    use_pooler = False
    use_last_mean = False

    # -- regression head --
    fc_size = 1024
    use_single_fc = False
    custom_head = False
    use_conv_head = False
    use_norm = False
    use_rd_features = False

    # -- dropout --
    use_dropout = 0.0
    multisample_dropout = False


args = BERTModelConfig()

# One prediction column per fold checkpoint; `fold_start` is the running
# column index shared by every inference cell in the notebook.
for fold in range(5):
    pred_df[f'fold{fold_start}'] = get_bert_predictions(
        df,
        model_name='../input/torch-bert-large-models/roberta-large',
        model_path=f'../input/roberta-large-0626aug/roberta_l_0626/bert_model_fold{fold}.bin',
    )
    fold_start += 1
    # Hand cached GPU memory back before the next checkpoint is loaded.
    torch.cuda.empty_cache()
    gc.collect()

# Recorded output is 50 (the last inference cell), i.e. pred_df holds fold0..fold49.
print(fold_start)

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

Getting BERT Embeddings


  0%|          | 0/1 [00:00<?, ?it/s]

50


In [21]:
# Empty frames that later cells fill with a `target` column:
# the final regression blend plus one frame per model group.
regression_pred_df = pd.DataFrame()
(
    regression_pred_group1_df,
    regression_pred_group2_df,
    regression_pred_group3_df,
    regression_pred_group4_df,
    regression_pred_group5_df,
) = (pd.DataFrame() for _ in range(5))  # five independent empty frames

# Sanity check of the collected predictions: shape first, then the first rows.
print(pred_df.shape, pred_df.head(), sep='\n')
print(svr_df.shape, svr_df.head(), sep='\n')

(7, 50)
      fold0     fold1     fold2     fold3     fold4     fold5     fold6  \
0 -0.561806 -0.466362 -0.488735 -0.634259 -0.560673 -0.434041 -0.314521   
1 -0.586472 -0.383682 -0.482145 -0.535929 -0.407913 -0.565384 -0.504237   
2 -0.448924 -0.778351 -0.471418 -0.534699 -0.387349 -0.481668 -0.741641   
3 -2.410063 -2.428232 -2.015002 -2.606764 -2.341552 -2.093623 -2.182258   
4 -1.844369 -2.147868 -1.663280 -1.993978 -1.728984 -1.916421 -2.022207   

      fold7     fold8     fold9  ...    fold40    fold41    fold42    fold43  \
0 -0.250722 -0.545449 -0.542897  ... -0.401214 -0.335098 -0.467688 -0.605479   
1 -0.583989 -0.416541 -0.355844  ... -0.422520 -0.521808 -0.431727 -0.509120   
2 -0.584910 -0.395339 -0.618272  ... -0.684752 -0.738746 -0.580276 -0.698312   
3 -2.335519 -2.034565 -2.114521  ... -1.827704 -2.022896 -2.054706 -2.219954   
4 -1.753264 -1.904072 -2.010194  ... -1.867424 -1.813752 -1.856029 -1.856373   

     fold44    fold45    fold46    fold47    fold48    fold4

In [22]:
# Split the 50 per-fold prediction columns of `pred_df` into two blend groups.
# Every inference cell writes 5 consecutive columns (one per fold checkpoint),
# so each group of 25 columns covers 5 models.

# Group 1 -- the LB 0.464-0.468 "best" models: fold0 .. fold24
group1_cols = [f'fold{idx}' for idx in range(25)]
# Group 2 -- the remaining models: fold25 .. fold49
group2_cols = [f'fold{idx}' for idx in range(25, 50)]
print(group1_cols, group2_cols, sep='\n')

# Row-wise mean over each group's columns gives one prediction per test row.
regression_pred_group1_df['target'] = pred_df[group1_cols].mean(axis=1).to_numpy().tolist()
regression_pred_group2_df['target'] = pred_df[group2_cols].mean(axis=1).to_numpy().tolist()

# Final regression prediction: 80% group 1 + 20% group 2.
# (Earlier experiments, no longer active: a three-group 0.6 / 0.3 / 0.1 blend,
# and a plain mean of groups 1 and 3.)
regression_pred_df['target'] = (
    0.8 * regression_pred_group1_df['target']
    + 0.2 * regression_pred_group2_df['target']
)
print(regression_pred_df.head())

['fold0', 'fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8', 'fold9', 'fold10', 'fold11', 'fold12', 'fold13', 'fold14', 'fold15', 'fold16', 'fold17', 'fold18', 'fold19', 'fold20', 'fold21', 'fold22', 'fold23', 'fold24']
['fold25', 'fold26', 'fold27', 'fold28', 'fold29', 'fold30', 'fold31', 'fold32', 'fold33', 'fold34', 'fold35', 'fold36', 'fold37', 'fold38', 'fold39', 'fold40', 'fold41', 'fold42', 'fold43', 'fold44', 'fold45', 'fold46', 'fold47', 'fold48', 'fold49']
     target
0 -0.493030
1 -0.525235
2 -0.503864
3 -2.270474
4 -1.844058


In [23]:
# Show the pairs-approach predictions (columns `id` and `target`) as the cell's rich output.
pairs_pred_df 


Unnamed: 0,id,target
0,c0f722661,-0.297706
1,f0953f0a5,-0.53511
2,0df072751,-0.599064
3,04caf4e0c,-2.257258
4,0e63f8bea,-1.857209
5,12537fe78,-1.176318
6,965e592c0,0.252361


In [24]:
# Build the final submission: an equal-weight blend of the pairs approach and
# the regression ensemble, written in the sample-submission row order.
sub_df = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# The three frames are combined row by row, so they must describe the same
# test rows in the same order. Without these checks a length or index mismatch
# would silently produce NaN or misaligned targets in the submission file.
assert len(pairs_pred_df) == len(sub_df), 'pairs_pred_df and sample_submission differ in length'
assert len(regression_pred_df) == len(sub_df), 'regression_pred_df and sample_submission differ in length'
assert (pairs_pred_df['id'].to_numpy() == sub_df['id'].to_numpy()).all(), \
    'pairs_pred_df ids are not in sample_submission order'

# Mean of pairs and regression approach.
# `.to_numpy()` combines the columns positionally instead of by index label,
# so the result does not depend on how each frame happens to be indexed.
sub_df['target'] = (
    (pairs_pred_df['target'].to_numpy() * 0.5)
    + (regression_pred_df['target'].to_numpy() * 0.5)
)
assert sub_df['target'].notna().all(), 'blended target contains NaN'

sub_df.to_csv('submission.csv', index=False)
print(sub_df.head())

          id    target
0  c0f722661 -0.395368
1  f0953f0a5 -0.530172
2  0df072751 -0.551464
3  04caf4e0c -2.263866
4  0e63f8bea -1.850633
