In [1]:
##### The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers: the files below overwrite
# modules inside the installed `transformers` package.
import importlib.util
import shutil
from pathlib import Path

# Locate the installed package instead of assuming one interpreter version.
# find_spec() on a top-level package name does not import the package.
# Fall back to the original Kaggle location when the package cannot be located.
_transformers_spec = importlib.util.find_spec("transformers")
if _transformers_spec is not None and _transformers_spec.submodule_search_locations:
    transformers_path = Path(list(_transformers_spec.submodule_search_locations)[0])
else:
    transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

input_dir = Path("../input/deberta-v3-tokenizer-fast")

# Replace transformers/convert_slow_tokenizer.py with the patched copy.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, conversion_path)

# Replace the slow and fast DeBERTa-v2 tokenizer modules with the patched copies.
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path / filename

    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir / filename, filepath)

In [2]:
from transformers.convert_slow_tokenizer import SpmConverter
from transformers.models.deberta_v2.tokenization_deberta_v2 import (
        DebertaV2Tokenizer,
    )

from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

from tokenizers import Regex, normalizers, processors
class DebertaV2Converter(SpmConverter):
    """SpmConverter subclass that supplies the normalizer and post-processor
    used when a slow DeBERTa-v2 tokenizer is converted to a fast one."""

    def normalizer(self, proto):
        """Build the normalizer sequence.

        Lower-casing is added only when the original tokenizer has
        ``do_lower_case`` set; runs of two or more spaces are always collapsed
        to a single space. ``proto`` is not read here, so the precompiled
        charsmap it may carry is not applied.
        """
        steps = [normalizers.Lowercase()] if self.original_tokenizer.do_lower_case else []
        steps.append(normalizers.Replace(Regex(" {2,}"), " "))
        return normalizers.Sequence(steps)

    def post_processor(self):
        """Return the template processor that wraps sequences in [CLS]/[SEP].

        Single sequences use type id 0 throughout; in a pair, the second
        sequence and its closing [SEP] use type id 1.
        """
        to_id = self.original_tokenizer.convert_tokens_to_ids
        cls_entry = ("[CLS]", to_id("[CLS]"))
        sep_entry = ("[SEP]", to_id("[SEP]"))
        return processors.TemplateProcessing(
            single="[CLS]:0 $A:0 [SEP]:0",
            pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
            special_tokens=[cls_entry, sep_entry],
        )


def convert_deberta_v2_tokenizer(
    tokenizer: DebertaV2Tokenizer
) -> DebertaV2TokenizerFast:
    """Build a DebertaV2TokenizerFast equivalent of the given slow tokenizer.

    Side effect: ``tokenizer.vocab_file`` is set to the vocab file of the
    tokenizer's inner ``_tokenizer`` before the conversion runs. The fast
    tokenizer is created with the slow tokenizer's ``init_kwargs`` and the
    backend produced by ``DebertaV2Converter``.
    """
    vocab_path = tokenizer._tokenizer.vocab_file
    tokenizer.vocab_file = vocab_path
    return DebertaV2TokenizerFast(
        vocab_path,
        **tokenizer.init_kwargs,
        tokenizer_object=DebertaV2Converter(tokenizer).converted()
    )

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from transformers import AutoConfig,AutoModel
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
from tqdm import tqdm
# Expose only GPU 0 to this process, then choose the device string used for
# all tensors and models below.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
class Experiment:
    """Plain settings container for one member of the inference ensemble.

    Every constructor argument is stored unchanged on the instance under the
    same name; no validation or conversion is performed.
    """

    def __init__(
        self,
        DOWNLOADED_MODEL_PATH,
        TRAINED_MODEL_PATH,
        XGB_PATH,
        FOLDS,
        hidden_state_dimension,
        BATCH_SIZE,
        NUM_WORKERS,
        MAX_LEN,
        WINDOW_SIZE,
        RNN,
    ):
        # Locations of the pretrained backbone, fine-tuned weights and XGB artefacts.
        self.DOWNLOADED_MODEL_PATH = DOWNLOADED_MODEL_PATH
        self.TRAINED_MODEL_PATH = TRAINED_MODEL_PATH
        self.XGB_PATH = XGB_PATH
        self.FOLDS = FOLDS

        # Model shape and data-loading settings.
        self.hidden_state_dimension = hidden_state_dimension
        self.BATCH_SIZE = BATCH_SIZE
        self.NUM_WORKERS = NUM_WORKERS
        self.MAX_LEN = MAX_LEN
        self.WINDOW_SIZE = WINDOW_SIZE
        self.RNN = RNN

In [5]:
experiments = []


def _add_experiment(backbone_dir, checkpoint_paths, xgb_dir, hidden_size):
    """Append one ensemble member to ``experiments``.

    Batch size, worker count, length limits, fold ids and RNN type are the
    same for every member, so only the paths and hidden size are parameters.
    """
    experiments.append(Experiment(DOWNLOADED_MODEL_PATH=backbone_dir,
                                  BATCH_SIZE=4,
                                  TRAINED_MODEL_PATH=checkpoint_paths,
                                  XGB_PATH=xgb_dir,
                                  FOLDS=np.arange(6),
                                  hidden_state_dimension=hidden_size,
                                  NUM_WORKERS=2,
                                  MAX_LEN=1280,
                                  WINDOW_SIZE=1280,
                                  RNN='GRU'))


# deberta-v3-large ("tascj0-corrected" checkpoints, six folds in one dataset).
_add_experiment("../input/deberta-v3-large",
                [f"../input/test10-deberta-v3-large-pl-5th-tascj0-corrected-nb/fold{i}.pt" for i in range(6)],
                '../input/test10-deberta-v3-large-pl-5th-tascj0-corrected-nb',
                1024)

# deberta-v2-xlarge ("tascj0-corrected"): folds 0-2 and 3-5 live in two datasets.
model_paths = [f"../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-0/fold{i}.pt" for i in range(3)]
model_paths += [f"../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-1/fold{i}.pt" for i in range(3, 6)]
_add_experiment("../input/deberta-v2-xlarge",
                model_paths,
                '../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-0',
                1536)

# deberta-xlarge.
_add_experiment("../input/deberta-xlarge",
                [f"../input/fb-test10-deberta-xlarge-pl-5th-tascj0-nb/fold{i}.pt" for i in range(6)],
                '../input/fb-test10-deberta-xlarge-pl-5th-tascj0-nb',
                1024)

# deberta-v2-xlarge ("pl-5th" checkpoints), again split over two datasets.
model_paths = [f"../input/fb-test10-deberta-v2-xlarge-pl-5th-nb0/fold{i}.pt" for i in range(3)]
model_paths += [f"../input/fb-test10-deberta-v2-xlarge-pl-5th-nb1/fold{i}.pt" for i in range(3, 6)]
_add_experiment("../input/deberta-v2-xlarge",
                model_paths,
                '../input/fb-test10-deberta-v2-xlarge-pl-5th-nb0',
                1536)

# deberta-v3-large ("pl-5th" checkpoints).
_add_experiment("../input/deberta-v3-large",
                [f"../input/fb-test10-deberta-v3-large-pl-5th-nb/fold{i}.pt" for i in range(6)],
                '../input/fb-test10-deberta-v3-large-pl-5th-nb',
                1024)

# Not registered: the "pl-4th" checkpoint sets
#   ../input/fb-test10-deberta-large-pl-4th-tascj0-nb      (XGB: ../input/fbtest10debertalargepl4thtascj0xgb)
#   ../input/fb-test10-deberta-v3-large-pl-4th-nb          (XGB: ../input/fbtest10debertav3largepl4thxgb)
#   ../input/fb-test10-deberta-v2-xlarge-pl-4th-nb0 / nb1  (XGB: ../input/fbtest10debertav2xlargepl4thxgb)







# Read Data

In [6]:
test = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")

# Load the full text of every essay once, keyed by essay_id. The same essay_id
# can appear on several rows of test.csv (FeedbackDataset groups rows by
# essay), so iterate the unique ids instead of re-reading the same file for
# every row. Key order (first appearance) is unchanged.
full_texts = {}
for essay_id in test['essay_id'].unique():
    with open(os.path.join("../input/feedback-prize-effectiveness/test", essay_id + '.txt'), 'r') as f:
        text = f.read()

    full_texts[essay_id] = text

# Placeholder label: FeedbackDataset reads a 'label' column, which the test
# split does not provide.
test['label'] = -1

In [7]:
def get_substring_span(text, substring, min_length=10, fraction=0.999):
    """
    Returns substring's span from the given text with the certain precision.

    The full ``substring`` is searched first. While it is not found, it is
    shortened to its first ``int(len * fraction)`` characters and searched
    again, until a match is found or the remaining prefix is shorter than
    ``min_length``.

    Args:
        text: string to search in.
        substring: string to locate.
        min_length: shortest prefix that is still tried.
        fraction: share of the current prefix kept after each failed search.

    Returns:
        ``[start, end]`` of the first match of the (possibly shortened)
        substring, or ``[-1, 0]`` when nothing matched.
    """
    # Iterative rather than recursive: one level of recursion per removed
    # character would exceed Python's recursion limit for long unmatched input.
    candidate = substring
    while True:
        position = text.find(candidate)
        if position != -1:
            return [position, position + len(candidate)]

        shortened = candidate[:int(len(candidate) * fraction)]
        if len(shortened) < min_length:
            return [-1, 0]
        if len(shortened) >= len(candidate):
            # fraction >= 1 would never shrink the candidate; give up instead of looping.
            return [-1, 0]
        candidate = shortened

# Integer id for each discourse type; written into per-token
# discourse_type_ids by FeedbackDataset (tokens outside any discourse stay 0).
discourse_mapping = {
    'Lead': 0,
    'Position': 1,
    'Claim': 2,
    'Evidence': 3,
    'Counterclaim': 4,
    'Rebuttal': 5,
    'Concluding Statement': 6,
}

class FeedbackDataset(Dataset):
    """Essay-level dataset: one item per essay, covering all of its discourses.

    At construction time every essay is rewritten so that each discourse is
    wrapped in "(<type> start)" / "(<type> end)" markers, tokenized once, and
    every token that lies fully inside a discourse span is tagged with the
    index of that discourse (``gather_indices``) and its type id
    (``discourse_type_ids``). Items are stored sorted by token count.
    """

    def __init__(self, tokenizer, df, full_texts, train, aug=False, loss_type="BCELoss", max_len=512):
        """Pre-tokenize every essay referenced by ``df``.

        Args:
            tokenizer: callable tokenizer; called with
                ``return_offsets_mapping=True`` (assumes a fast tokenizer whose
                result also offers ``word_ids()`` / ``sequence_ids()`` — TODO confirm).
            df: frame with the columns 'essay_id', 'discourse_id',
                'discourse_text', 'discourse_type' and 'label'.
            full_texts: mapping essay_id -> full essay text.
            train: stored on the instance; not read in this class.
            aug: stored on the instance; not read in this class.
            loss_type: accepted but not used.
            max_len: truncation length passed to the tokenizer.
        """
        self.tokenizer = tokenizer
        self.texts = df['discourse_text'].values
        # NOTE: this row-level array is replaced by a per-essay list a few lines below.
        self.labels = df['label'].values
        self.discourse_type=df['discourse_type'].values
        self.essay_ids=df['essay_id'].values
        self.full_texts=full_texts
        self.max_len = max_len
        self.aug = aug
        self.train=train

        # Per-essay containers; entry k of each list describes the same essay.
        self.encodings=[]
        self.labels=[]
        self.gather_indices=[]
        self.discourse_ids=[]
        self.discourse_type_ids=[]
        self.input_id_lengths=[]
        for key in tqdm(df['essay_id'].unique()):
            discourses=df[df['essay_id']==key]
            text=full_texts[key]
            reference_text=text[:]

            # Pass 1: insert the start/end markers around each discourse.
            # Spans are searched in reference_text, where already-handled
            # discourses are overwritten with '*' (presumably so an identical
            # discourse text is not matched at the same place twice).
            # NOTE(review): when get_substring_span returns [-1, 0], the slices
            # below become text[:-1] and text[0:], i.e. the markers are spliced
            # in before the last character and the whole text is appended again.
            # NOTE: the loop variables `id` and `type` shadow the builtins here.
            for discourse_text,label,id,type in zip(discourses['discourse_text'],discourses['label'],discourses['discourse_id'],discourses['discourse_type']):
                span=get_substring_span(reference_text, discourse_text.strip())
                text=text[:span[0]]+f"({type} start)"+discourse_text.strip()+f"({type} end)"+text[span[1]:]
                reference_text=reference_text[:span[0]]+f"({type} start)"+"*"*(span[1]-span[0])+f"({type} end)"+reference_text[span[1]:]

            # Tokenize the marked-up essay once (truncated to max_len, unpadded).
            encoding = self.tokenizer(text,
                                   add_special_tokens=True,
                                   max_length=self.max_len,
                                   padding=False,
                                   return_offsets_mapping=True,
                                   truncation=True)
            # -1 marks tokens that belong to no discourse; type id 0 is the default.
            gather_indices=np.ones(len(encoding['input_ids']))*-1
            discourse_type_ids=np.zeros(len(encoding['input_ids']))
            cnt=0
            sample_labels=[]
            discourse_ids=[]

            # Pass 2: locate each discourse in the marked-up text and tag the
            # tokens whose character offsets lie entirely inside its span.
            for discourse_text,label,id,type in zip(discourses['discourse_text'],discourses['label'],discourses['discourse_id'],discourses['discourse_type']):
                span=get_substring_span(text, discourse_text.strip())
                n_tokens=0
                for i in range(len(gather_indices)):
                    # Offset (0,0) entries are skipped (special tokens — TODO confirm).
                    if encoding['offset_mapping'][i]!=(0,0) and encoding['offset_mapping'][i][0]>=span[0] and encoding['offset_mapping'][i][1]<=span[1]:
                        gather_indices[i]=cnt
                        discourse_type_ids[i]=discourse_mapping[type]
                        n_tokens+=1
                # Mask the matched span so a later identical discourse matches elsewhere.
                text=text[:span[0]]+"*"*(span[1]-span[0])+text[span[1]:]
                # Keep a discourse only if at least one token was tagged for it
                # (e.g. it may have been cut off by truncation); cnt therefore
                # numbers the kept discourses consecutively.
                if (gather_indices==cnt).sum()>0:
                    sample_labels.append(label)
                    discourse_ids.append(id)
                    cnt+=1

            self.encodings.append(encoding)
            self.labels.append(sample_labels)
            self.gather_indices.append(gather_indices)
            self.discourse_ids.append(discourse_ids)
            self.discourse_type_ids.append(discourse_type_ids)
            self.input_id_lengths.append(len(encoding['input_ids']))
        # Reorder every per-essay list by ascending token count.
        sorted_indices=np.argsort(self.input_id_lengths)
        self.encodings=[self.encodings[i] for i in sorted_indices]
        self.labels=[self.labels[i] for i in sorted_indices]
        self.gather_indices=[self.gather_indices[i] for i in sorted_indices]
        self.discourse_ids=[self.discourse_ids[i] for i in sorted_indices]
        self.discourse_type_ids=[self.discourse_type_ids[i] for i in sorted_indices]
        self.input_id_lengths=[self.input_id_lengths[i] for i in sorted_indices]


    def __len__(self):
        """Number of essays (not discourses)."""
        return len(self.labels)


    def __getitem__(self, idx):
        """Return the tensors for essay ``idx``.

        The result holds every field of the stored encoding (plus 'wids') as
        long tensors, and additionally 'labels' (float, one per kept
        discourse), 'sequence_ids', 'gather_indices', 'discourse_ids' (plain
        list) and 'discourse_type_ids'.
        """
        encoding=self.encodings[idx]
        # Adds a 'wids' entry to the cached encoding itself; None becomes -1.
        encoding['wids']=np.array(encoding.word_ids())
        encoding['wids'][encoding['wids']==None]=-1
        encoding['wids']=encoding['wids'].astype('int')
        label = self.labels[idx]
        sequence_ids=np.array(encoding.sequence_ids())
        sequence_ids[sequence_ids==None]=-1

        # encoding.items() includes the 'wids' entry added above.
        data={k:torch.tensor(v, dtype=torch.long) for k,v in encoding.items()}
        data['labels']=torch.tensor(label, dtype=torch.float)
        data['sequence_ids']=torch.tensor(sequence_ids.astype("int"))
        data['gather_indices']=torch.tensor(self.gather_indices[idx])
        data['discourse_ids']=self.discourse_ids[idx]
        data['discourse_type_ids']=torch.tensor(self.discourse_type_ids[idx])
        return data

In [8]:
class CustomCollate:
    """Collate function that pads a list of essay samples into one batch."""

    def __init__(self,tokenizer,train=True,sliding_window=None):
        self.tokenizer = tokenizer
        self.train = train
        self.sliding_window = sliding_window

    def __call__(self,data):
        """
        Pad every per-token field to a common length and stack the samples.

        The common length is the longest ``input_ids`` in the batch; when a
        sliding window is configured and that length exceeds it, the length is
        rounded up to a multiple of the window. Padding values: ``input_ids``
        use the tokenizer's pad token id, ``attention_mask`` and
        ``discourse_type_ids`` use 0, ``sequence_ids`` and ``gather_indices``
        use -1. ``labels`` are concatenated and ``discourse_ids`` flattened, so
        both have one entry per discourse in the batch. ``wids`` and
        ``offsets`` are always returned as empty lists.
        """
        token_counts = [len(sample['input_ids']) for sample in data]
        padded_len = max(token_counts)
        window = self.sliding_window
        if window is not None and padded_len > window:
            padded_len = int((np.floor(padded_len / window - 1e-6) + 1) * window)

        def stack_padded(field, fill_value):
            # Right-pad the chosen field of every sample, then stack to (batch, padded_len).
            rows = [
                torch.nn.functional.pad(sample[field], (0, padded_len - count), value=fill_value)
                for sample, count in zip(data, token_counts)
            ]
            return torch.stack(rows)

        return {
            "input_ids": stack_padded('input_ids', self.tokenizer.pad_token_id),
            "attention_mask": stack_padded('attention_mask', 0),
            "labels": torch.cat([sample['labels'] for sample in data]),
            "sequence_ids": stack_padded('sequence_ids', -1),
            "wids": [],
            "offsets": [],
            "sample_id": np.arange(len(data)),
            "gather_indices": stack_padded('gather_indices', -1),
            "discourse_ids": sum((sample['discourse_ids'] for sample in data), []),
            "discourse_type_ids": stack_padded('discourse_type_ids', 0),
        }


# Network

In [9]:
class ResidualLSTM(nn.Module):
    """Residual recurrent block followed by LayerNorm.

    The input is projected to ``d_model // 2``, passed through a two-layer
    unidirectional GRU (or LSTM), projected back to ``d_model``, added to the
    input and layer-normalised.
    """

    def __init__(self, d_model, rnn='GRU'):
        super().__init__()
        half_dim = d_model // 2
        self.downsample = nn.Linear(d_model, half_dim)
        # Any value other than 'GRU' selects an LSTM.
        recurrent_cls = nn.GRU if rnn == 'GRU' else nn.LSTM
        self.LSTM = recurrent_cls(half_dim, half_dim, num_layers=2, bidirectional=False, dropout=0.2)
        # dropout1, norm1, linear2 and dropout2 are not used in forward(); they
        # are still created so the module's parameter set stays the same.
        self.dropout1 = nn.Dropout(0.2)
        self.norm1 = nn.LayerNorm(half_dim)
        self.linear1 = nn.Linear(half_dim, d_model)
        self.linear2 = nn.Linear(d_model * 4, d_model)
        self.dropout2 = nn.Dropout(0.2)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq, d_model); same shape out."""
        # The recurrent layer runs sequence-first, so swap batch and sequence axes.
        seq_first = x.permute(1, 0, 2)
        recurrent_out, _ = self.LSTM(self.downsample(seq_first))
        summed = seq_first + self.linear1(recurrent_out)
        return self.norm2(summed.permute(1, 0, 2))

class SlidingWindowTransformerModel(nn.Module):
    """Transformer backbone + per-discourse mean pooling + recurrent head.

    Token states are produced by the backbone (in overlapping windows when the
    input is longer than ``window_size``), averaged over the tokens of each
    discourse, passed through ``self.lstm`` as a sequence of discourses, and
    classified with a linear head.
    """

    def __init__(self,DOWNLOADED_MODEL_PATH, hidden_state_dimension, nclass, rnn='GRU', window_size=512, edge_len=64, no_backbone=False):
        """
        Args:
            DOWNLOADED_MODEL_PATH: directory holding 'config.json' and 'pytorch_model.bin'.
            hidden_state_dimension: width of the backbone's token states.
            nclass: number of output classes.
            rnn: 'GRU' or 'LSTM' for the residual recurrent head.
            window_size: maximum number of tokens fed to the backbone at once.
            edge_len: number of leading tokens of each later window that are
                discarded as overlap.
            no_backbone: when True the backbone is not loaded and forward()
                treats ``input_ids`` as precomputed hidden states.

        Raises:
            ValueError: if ``rnn`` is neither 'GRU' nor 'LSTM' and no ``ResNet``
                class is available in this module.
        """
        super(SlidingWindowTransformerModel, self).__init__()
        # Fail fast, before loading the backbone: the non-recurrent branch needs
        # a `ResNet` class that is only usable if it is defined in this module.
        if rnn not in ("GRU", "LSTM") and "ResNet" not in globals():
            raise ValueError(
                f"Unsupported rnn={rnn!r}: expected 'GRU' or 'LSTM' (no ResNet head is defined)"
            )

        config_model = AutoConfig.from_pretrained(DOWNLOADED_MODEL_PATH+'/config.json')
        self.no_backbone=no_backbone
        if not no_backbone:
            self.backbone=AutoModel.from_pretrained(
                               DOWNLOADED_MODEL_PATH+'/pytorch_model.bin',config=config_model)

        if rnn=="GRU" or rnn=='LSTM':
            self.lstm=ResidualLSTM(hidden_state_dimension,rnn)
        else:
            self.lstm=ResNet()
        self.classification_head=nn.Linear(hidden_state_dimension,nclass)
        self.window_size=window_size
        self.edge_len=edge_len
        self.inner_len=window_size-edge_len*2

        # Not used by forward(); kept so the parameter set (state_dict keys)
        # still matches previously saved weights.
        self.discourse_embedding=nn.Embedding(8,256,padding_idx=0)
        self.downsample=nn.Linear(hidden_state_dimension+256,hidden_state_dimension)

    def _encode_tokens(self, input_ids, attention_mask):
        """Run the backbone over ``input_ids``, window by window if too long."""
        seq_len=input_ids.shape[1]
        if seq_len<=self.window_size:
            return self.backbone(input_ids,attention_mask=attention_mask,return_dict=False)[0]

        # Number of windows after the first one.
        overflow=seq_len-self.window_size
        segments=overflow//self.inner_len
        if overflow%self.inner_len>self.edge_len or segments==0:
            segments+=1

        # The first window is kept in full.
        chunks=[self.backbone(input_ids[:,:self.window_size],
                              attention_mask=attention_mask[:,:self.window_size],
                              return_dict=False)[0]]
        for i in range(1,segments+1):
            start=self.window_size-self.edge_len+(i-1)*self.inner_len
            end=min(start+self.window_size,seq_len)
            states=self.backbone(input_ids[:,start:end],
                                 attention_mask=attention_mask[:,start:end],
                                 return_dict=False)[0]
            # Drop the leading overlap; all but the last window also drop the
            # trailing part that the next window will cover.
            if i==segments:
                chunks.append(states[:,self.edge_len:])
            else:
                chunks.append(states[:,self.edge_len:self.edge_len+self.inner_len])
        return torch.cat(chunks,1)

    def forward(self,input_ids,attention_mask,sequence_ids,discourse_type_ids,gather_indices,return_vectors=False,return_transformer_hidden_states=False):
        """
        Args:
            input_ids: token ids (batch, seq), or hidden states when
                ``no_backbone`` is set.
            attention_mask: mask passed to the backbone.
            sequence_ids: accepted for interface compatibility; not used.
            discourse_type_ids: accepted for interface compatibility; not used.
            gather_indices: (batch, seq) discourse index per token, -1 for
                tokens outside any discourse.
            return_vectors: also return the per-token logits of every discourse.
            return_transformer_hidden_states: accepted; has no effect on the
                return value.

        Returns:
            Logits with one row per discourse in the batch (with a backbone),
            or per-token logits (without). With ``return_vectors`` a tuple
            ``(logits, vectors)``; ``vectors`` is empty when there is no backbone.
        """
        # Always defined so `return_vectors=True` works on every path.
        vectors=[]

        if self.no_backbone:
            x=self.classification_head(self.lstm(input_ids))
        else:
            x=self._encode_tokens(input_ids,attention_mask)

            pooled_outputs=[]
            for i in range(len(x)):
                n_discourses=int(gather_indices[i].max().item())+1
                if n_discourses<=0:
                    # No token of this sample belongs to a discourse: it
                    # contributes no rows (torch.stack would fail on an empty list).
                    continue
                discourse_means=[]
                for j in range(n_discourses):
                    vector=x[i][gather_indices[i]==j]
                    if return_vectors:
                        vectors.append(self.classification_head(vector))
                    discourse_means.append(vector.mean(0))
                # The discourses of one essay form a sequence for the recurrent head.
                contextual=self.lstm(torch.stack(discourse_means).unsqueeze(0))
                pooled_outputs.append(contextual.squeeze(0))

            x=self.classification_head(torch.cat(pooled_outputs)).squeeze(-1)

        if return_vectors:
            return x,vectors
        return x


# Inference

In [10]:
def sorted_quantile(array, q):
    """Linearly interpolated quantile(s) ``q`` of ``array``.

    ``array`` is used in the order given (it is not sorted here), so it must
    already be sorted for the result to be a quantile. ``q`` may be a scalar or
    a NumPy array of values in [0, 1].
    """
    values = np.array(array)
    position = (len(values) - 1) * q
    lower = np.floor(position).astype(int)
    weight = position - lower
    # Step to the next element only when the position falls between two indices.
    upper = lower + (weight > 0).astype(int)
    low_value, high_value = values[lower], values[upper]
    return low_value + (high_value - low_value) * weight

from scipy.stats import entropy
#make features
def get_xgb_features(train_df,prob_sequences):
    """Add the hand-crafted features consumed by the XGBoost stage.

    Args:
        train_df: frame with the columns 'essay_id', 'discourse_type',
            'Ineffective', 'Adequate' and 'Effective'; rows of one essay are
            expected to be adjacent. New columns are written into this frame
            IN PLACE.
        prob_sequences: one sequence per row of ``train_df``; each is converted
            to a 2-D array with three columns (per-token class probabilities —
            TODO confirm against the caller).

    Returns:
        A new frame: ``train_df`` (with the added columns) left-merged with the
        per-essay '<discourse type>_count' columns.

    Raises:
        KeyError: if a discourse type outside the seven known types occurs.
    """
    features2calculate=(
        [f"instability_{i}" for i in range(4)]
        +[f"begin_{i}" for i in range(3)]
        +[f"end_{i}" for i in range(3)]
    )

    calculated_features=[]
    for prob_seq in tqdm(prob_sequences):
        prob_seq=np.array(prob_seq)
        calculated_features.append(np.concatenate([
            # NOTE(review): np.diff(..., 0) returns its input unchanged, so
            # instability_0..2 are plain column means. Kept as is because the
            # downstream models consume exactly these values.
            np.diff(prob_seq[:,:],0).mean(0),
            # instability_3: mean squared step of (column 1 + column 2).
            [(np.diff(prob_seq[:,[1,2]].sum(1))**2).mean()],
            # begin_* / end_*: column means over the first / last five rows.
            prob_seq[:5,:].mean(0),
            prob_seq[-5:,:].mean(0),
        ]))

    train_df[features2calculate]=calculated_features
    train_df['len']=[len(s) for s in prob_sequences]

    # Features of the previous / next row of the same essay; a row without such
    # a neighbour reuses its own values.
    neighbor_features=['Ineffective','Adequate','Effective','discourse_type']
    neighbor_features_values=train_df[neighbor_features].values
    essay_ids=train_df['essay_id'].values
    n_rows=len(train_df)
    p_features=[]
    n_features=[]
    for i in tqdm(range(n_rows)):
        # NOTE(review): `i>1` means row 1 never looks back at row 0 even when
        # both belong to the same essay (looks like an off-by-one). Kept as is
        # so the features stay identical to what the models were fitted on.
        if i>1 and essay_ids[i]==essay_ids[i-1]:
            p_features.append(neighbor_features_values[i-1])
        else:
            p_features.append(neighbor_features_values[i])

        if i<(n_rows-1) and essay_ids[i]==essay_ids[i+1]:
            n_features.append(neighbor_features_values[i+1])
        else:
            n_features.append(neighbor_features_values[i])

    train_df[[f+"_previous" for f in neighbor_features]]=p_features
    train_df[[f+"_next" for f in neighbor_features]]=n_features

    # Per-essay mean and standard deviation of each class probability.
    for column in ('Ineffective','Adequate','Effective'):
        train_df[f'mean_{column}']=train_df.groupby("essay_id")[column].transform("mean")
    for column in ('Ineffective','Adequate','Effective'):
        train_df[f'std_{column}']=train_df.groupby("essay_id")[column].transform("std")

    train_df['discourse_count']=train_df.groupby("essay_id")['discourse_type'].transform("count")

    # Per-essay count of every discourse type (0 when a type does not occur).
    cnts=train_df.groupby('essay_id')['discourse_type'].apply(lambda x: x.value_counts())

    discourse_types=['Claim','Evidence','Concluding Statement','Lead','Position','Counterclaim','Rebuttal']
    value_count_hash={t:{} for t in discourse_types}
    for (essay_id,discourse_type),count in cnts.items():
        value_count_hash[discourse_type][essay_id]=count

    discourse_cnts=pd.DataFrame(
        [
            [essay_id]+[value_count_hash[d].get(essay_id,0) for d in discourse_types]
            for essay_id in train_df['essay_id'].unique()
        ],
        columns=['essay_id']+[f'{d}_count' for d in discourse_types],
    )

    return train_df.merge(discourse_cnts,how='left',on='essay_id')

In [11]:
# Columns copied from the previous / next discourse of the same essay.
neighbor_features = ['Ineffective', 'Adequate', 'Effective', 'discourse_type']
# Discourse types that get a per-essay '<type>_count' column.
discourse_types = ['Claim', 'Evidence', 'Concluding Statement', 'Lead', 'Position', 'Counterclaim', 'Rebuttal']

# Ordered list of the feature columns.
features = [
    "Ineffective", "Adequate", "Effective",
    *(f"instability_{i}" for i in range(4)),
    "len", "discourse_type",
    *(f"begin_{i}" for i in range(3)),
    *(f"end_{i}" for i in range(3)),
    *(name + "_previous" for name in neighbor_features),
    *(name + "_next" for name in neighbor_features),
    'mean_Ineffective', 'mean_Adequate', 'mean_Effective',
    'std_Ineffective', 'std_Adequate', 'std_Effective',
    'discourse_count',
    *(f'{d}_count' for d in discourse_types),
]

In [12]:
from cuml import ForestInference
import xgboost as xgb
import pickle
# Class name -> column index of that class in the 3-way probability outputs.
label_mapping={'Ineffective': 0, 'Adequate': 1, 'Effective': 2}

# For every experiment:
#   1. run each trained transformer checkpoint over the test loader,
#   2. build XGBoost features from each selected fold's predictions,
#   3. average the predictions of every (transformer fold, booster) pair,
#   4. write the averaged probabilities into a copy of the sample submission.
# `subs` collects one filled-in submission frame per experiment, in order.
subs=[]
for exp in experiments:
    test_params = {'batch_size': exp.BATCH_SIZE,
                   'shuffle': False,
                   'num_workers': exp.NUM_WORKERS,
                   'pin_memory': True}

    tokenizer = AutoTokenizer.from_pretrained(exp.DOWNLOADED_MODEL_PATH)
    if "v2" in exp.DOWNLOADED_MODEL_PATH or "v3" in exp.DOWNLOADED_MODEL_PATH:
        # DeBERTa v2/v3 checkpoints go through the notebook's tokenizer converter.
        tokenizer = convert_deberta_v2_tokenizer(tokenizer)
    test_dataset = FeedbackDataset(tokenizer, test, full_texts, False, 0, ' ', exp.MAX_LEN)
    test_loader = DataLoader(test_dataset, **test_params, collate_fn=CustomCollate(tokenizer))
    model = SlidingWindowTransformerModel(exp.DOWNLOADED_MODEL_PATH,
                                          hidden_state_dimension=exp.hidden_state_dimension,
                                          window_size=exp.WINDOW_SIZE,rnn=exp.RNN,
                                          nclass=3)
    model.to(device)

    preds=[]           # one probability tensor per checkpoint
    prob_sequences=[]  # one list of softmaxed `vectors` arrays per checkpoint
    discourse_ids=[]
    for index,weight_path in enumerate(exp.TRAINED_MODEL_PATH):
        model.load_state_dict(torch.load(weight_path))
        model.eval()
        tmp=[]
        tmp_vectors=[]
        for batch in tqdm(test_loader):
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            sequence_ids = batch['sequence_ids'].to(device, dtype = torch.long)
            gather_indices = batch['gather_indices'].to(device, dtype = torch.long)
            discourse_type_ids = batch['discourse_type_ids'].to(device, dtype = torch.long)
            if index==0:
                # The loader is not shuffled, so the ids gathered on the first
                # checkpoint pass line up with the rows of every later pass.
                discourse_ids.extend(batch['discourse_ids'])
            with torch.no_grad():
                output,vectors = model(ids,mask,sequence_ids,discourse_type_ids,gather_indices,return_vectors=True)
                # Each element of `vectors` is softmaxed along its last axis.
                tmp_vectors+=[torch.nn.functional.softmax(v,-1).cpu().numpy() for v in vectors]
                output=torch.nn.functional.softmax(output,-1)
            tmp.append(output.cpu())
        preds.append(torch.cat(tmp))
        prob_sequences.append(tmp_vectors)

    preds=torch.stack(preds).numpy()

    # The boosters do not depend on the transformer fold, so load each one
    # once per experiment instead of once per (fold, booster) pair.
    xgb_models=[]
    for xgb_fold in exp.FOLDS:
        xgb_model_loaded = xgb.Booster()
        xgb_model_loaded.load_model(f"{exp.XGB_PATH}/xgb_{xgb_fold}.json")
        xgb_models.append(xgb_model_loaded)

    xgb_preds=[]
    for fold in exp.FOLDS:
        sub=pd.DataFrame(columns=['discourse_id']+list(label_mapping.keys()))
        sub['discourse_id']=discourse_ids
        sub[list(label_mapping.keys())]=preds[fold]
        sub=sub.merge(test[['discourse_id','discourse_type','essay_id']],how='left',on='discourse_id')

        sub=get_xgb_features(sub,prob_sequences[fold])

        # The discourse-type columns are categorical; everything else is numeric.
        for f in features:
            if f not in ['discourse_type_previous','discourse_type_next','discourse_type']:
                sub[f]= sub[f].astype('float')
            else:
                sub[f]= sub[f].astype('category')

        d_test = xgb.DMatrix(sub[features],enable_categorical=True)
        for xgb_model_loaded in xgb_models:
            xgb_preds.append(xgb_model_loaded.predict(d_test))

    # Average over all (transformer fold, booster) predictions.
    xgb_preds=np.stack(xgb_preds).mean(0)

    submission=pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")

    discourse_ids=list(submission['discourse_id'])

    # Map every id to the position of its first occurrence in the sample
    # submission (the row `list.index` would return), built in one pass.
    row_of={}
    for position,discourse_id in enumerate(discourse_ids):
        row_of.setdefault(discourse_id,position)
    # A prediction id that is absent from the sample submission raises KeyError.
    target_rows=[row_of[discourse_id] for discourse_id in sub['discourse_id']]

    # Write each class column with a single positional assignment on the frame
    # itself. Chained `submission[col].iloc[i] = ...` triggers
    # SettingWithCopyWarning and does nothing under pandas Copy-on-Write.
    for label,class_index in label_mapping.items():
        column_position=submission.columns.get_loc(label)
        submission.iloc[target_rows,column_position]=xgb_preds[:,class_index]

    # Every experiment writes to the same path, so the file on disk holds the
    # most recently processed experiment.
    submission.to_csv("submission.csv",index=False)
    subs.append(submission)
    
    

100%|██████████| 1/1 [00:00<00:00, 39.61it/s]
Some weights of the model checkpoint at ../input/deberta-v3-large/pytorch_model.bin were not used when initializing DebertaV2Model: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:01<00:00,  1.79s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  2.68it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  3.74it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  2.76it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  2.38it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
10it [00:00, 3943.50it/s]
100%|██████████| 10/10 [00:00<00:00, 8966.02it/s]
10it [00:00, 8103.37it/s]
100%|██████████| 10/10 [00:00<00:00, 20340.95it/s]
10it [00:00, 6775.94it/s]
100%|██████████| 10/10 [00:00<00:00, 16307.56it/s]
10it [00:00, 7210.42it/s]
100%|██████████| 10/10 [00:00<00:00, 11422.40it/s]
10it [00:00, 6082.23it/s]
100%|██████████| 10/10 [00:00<00:00, 5014.71it/s]
10it [00:00, 5153.34it/s]
100%|██████████| 10/10 [00:00<00:00, 9012.26it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 1/1 [00:00<00:00, 72.09it/s]
Some weights of the model checkpoint at ../input/deberta-v2-xlarge/pytorch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:04<00:00,  4.87s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.92it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.92it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.86it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.93it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.84it/s]
10it [00:00, 3722.31it/s]
100%|██████████| 10/10 [00:00<00:00, 17353.35it/s]
10it [00:00, 7487.15it/s]
100%|██████████| 10/10 [00:00<00:00, 19544.75it/s]
10it [00:00, 4611.66it/s]
100%|██████████| 10/10 [00:00<00:00, 5829.47it/s]
10it [00:00, 4184.26it/s]
100%|██████████| 10/10 [00:00<00:00, 15500.01it/s]
10it [00:00, 5081.54it/s]
100%|██████████| 10/10 [00:00<00:00, 9238.56it/s]
10it [00:00, 7810.62it/s]
100%|██████████| 10/10 [00:00<00:00, 3338.88it/s]
100%|██████████| 1/1 [00:00<00:00, 67.07it/s]
Some weights of the model checkpoint at ../input/deberta-xlarge/pytorch_model.bin were not used when initializing DebertaModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing De

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.36it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.39it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.34it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.90it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.36it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.36it/s]
10it [00:00, 3615.78it/s]
100%|██████████| 10/10 [00:00<00:00, 18816.98it/s]
10it [00:00, 7986.11it/s]
100%|██████████| 10/10 [00:00<00:00, 13976.35it/s]
10it [00:00, 4888.47it/s]
100%|██████████| 10/10 [00:00<00:00, 15966.14it/s]
10it [00:00, 3119.83it/s]
100%|██████████| 10/10 [00:00<00:00, 19301.91it/s]
10it [00:00, 5084.00it/s]
100%|██████████| 10/10 [00:00<00:00, 15482.85it/s]
10it [00:00, 5983.32it/s]
100%|██████████| 10/10 [00:00<00:00, 5935.06it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 1/1 [00:00<00:00, 78.30it/s]
Some weights of the model checkpoint at ../input/deberta-v2-xlarge/pytorch_model.bin were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- Th

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.25it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.33it/s]
10it [00:00, 5187.76it/s]
100%|██████████| 10/10 [00:00<00:00, 15274.23it/s]
10it [00:00, 8230.58it/s]
100%|██████████| 10/10 [00:00<00:00, 11031.84it/s]
10it [00:00, 6368.52it/s]
100%|██████████| 10/10 [00:00<00:00, 11818.27it/s]
10it [00:00, 7927.24it/s]
100%|██████████| 10/10 [00:00<00:00, 6450.79it/s]
10it [00:00, 7866.29it/s]
100%|██████████| 10/10 [00:00<00:00, 14974.31it/s]
10it [00:00, 6884.94it/s]
100%|██████████| 10/10 [00:00<00:00, 10379.37it/s]
100%|██████████| 1/1 [00:00<00:00, 51.01it/s]
Some weights of the model checkpoint at ../input/deberta-v3-large/pytorch_model.bin were not used when initializing DebertaV2Model: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initial

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
10it [00:00, 3014.45it/s]
100%|██████████| 10/10 [00:00<00:00, 12818.78it/s]
10it [00:00, 8190.40it/s]
100%|██████████| 10/10 [00:00<00:00, 15923.71it/s]
10it [00:00, 6053.26it/s]
100%|██████████| 10/10 [00:00<00:00, 8687.46it/s]
10it [00:00, 8171.25it/s]
100%|██████████| 10/10 [00:00<00:00, 15318.86it/s]
10it [00:00, 6091.06it/s]
100%|██████████| 10/10 [00:00<00:00, 7909.30it/s]
10it [00:00, 4248.26it/s]
100%|██████████| 10/10 [00:00<00:00, 8811.56it/s]


In [13]:
#xgb_model_loaded.predict_proba(sub[features]).shape

In [14]:
# xgb_model_loaded = pickle.load(open(f"{exp.XGB_PATH}/xgb_{xgb_fold}.p", "rb"))
# xgb_model_loaded.predict(d_test)

In [15]:
# #d_test = xgb.DMatrix(sub[features],enable_categorical=True)
# xgb_model_loaded = xgb.Booster()
# xgb_model_loaded.load_model(f"{exp.XGB_PATH}/xgb_{xgb_fold}.json")
# #xgb_model_loaded = pickle.load(open(f"{exp.XGB_PATH}/xgb_{xgb_fold}.p", "rb"))
# xgb_model_loaded.predict(d_test)

In [16]:
# Display the submission frame appended to `subs` by the first experiment.
subs[0]

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.005244,0.311563,0.683192
1,5a88900e7dc1,0.031912,0.680193,0.287896
2,9790d835736b,0.090935,0.419728,0.489338
3,75ce6d68b67b,0.098606,0.475707,0.425687
4,93578d946723,0.179806,0.445937,0.374257
5,2e214524dbe3,0.042539,0.558329,0.399133
6,84812fc2ab9f,0.014382,0.450539,0.535079
7,c668ff840720,0.047245,0.529719,0.423036
8,739a6d00f44a,0.033323,0.518314,0.448363
9,bcfae2c9a244,0.017875,0.664,0.318126


In [17]:
# Print the checkpoint paths of every experiment that feeds the blend.
for exp in experiments:
    print(exp.TRAINED_MODEL_PATH)

# Blend weights, one per frame in `subs`.
# Alternative weight sets that were left commented out in this cell:
#   [0.33918623, 0.43399894, 0.22681483]
#   [0.33988562, 0.48792195, 0.17219243]
#   [0.3784224, 0.23337426, 0.12014701, 0.26805633]
#   [0.06290397, 0.21444155, 0.07934315, 0.20454198, 0.26092175, 0.17784759]
weights = np.array([0.1996512, 0.22573007, 0.12467703, 0.25565791, 0.1942838])
assert len(weights) == len(subs)

# Rescale so the weights sum to one.
weights = weights / weights.sum()

# Start from a copy of the first frame and replace each class column with the
# weighted sum over all frames. Rows are combined positionally via `.values`.
submission = subs[0].copy()
for column in ("Ineffective", "Adequate", "Effective"):
    blended = submission[column].values * weights[0]
    for sub, weight in zip(subs[1:], weights[1:]):
        blended = blended + sub[column].values * weight
    submission[column] = blended

submission.to_csv("submission.csv", index=False)
submission

['../input/test10-deberta-v3-large-pl-5th-tascj0-corrected-nb/fold0.pt', '../input/test10-deberta-v3-large-pl-5th-tascj0-corrected-nb/fold1.pt', '../input/test10-deberta-v3-large-pl-5th-tascj0-corrected-nb/fold2.pt', '../input/test10-deberta-v3-large-pl-5th-tascj0-corrected-nb/fold3.pt', '../input/test10-deberta-v3-large-pl-5th-tascj0-corrected-nb/fold4.pt', '../input/test10-deberta-v3-large-pl-5th-tascj0-corrected-nb/fold5.pt']
['../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-0/fold0.pt', '../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-0/fold1.pt', '../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-0/fold2.pt', '../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-1/fold3.pt', '../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-1/fold4.pt', '../input/test10-deberta-v2-xlarge-pl-5th-tascj0-corrected-1/fold5.pt']
['../input/fb-test10-deberta-xlarge-pl-5th-tascj0-nb/fold0.pt', '../input/fb-test10-deberta-xlarge-pl-5th-tascj0-nb/fold1.pt', '../inp

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.006998,0.340908,0.652094
1,5a88900e7dc1,0.023835,0.73901,0.237155
2,9790d835736b,0.097193,0.440936,0.461871
3,75ce6d68b67b,0.131291,0.470152,0.398557
4,93578d946723,0.198225,0.428877,0.372898
5,2e214524dbe3,0.04146,0.592207,0.366333
6,84812fc2ab9f,0.020864,0.50675,0.472386
7,c668ff840720,0.061274,0.559227,0.379499
8,739a6d00f44a,0.052973,0.561777,0.38525
9,bcfae2c9a244,0.018044,0.681009,0.300947
