In [46]:
# default_exp fastai_huggingface

In [47]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Set Up and Data Loading

This kernel uses fastai and Huggingface transformser. fastai is already installed on Kaggle, and [here](https://www.kaggle.com/c/tensorflow2-question-answering/discussion/117716) is a discussion post that shows how to get Huggingface installled.

In [11]:
#export
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path

import os

import torch
import torch.optim as optim

import random

# fastai
from fastai import *
from fastai.text import *
from fastai.callbacks import *

# classification metric
from scipy.stats import spearmanr

# transformers
from fastai.tabular import *

from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig,RobertaModel
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig,AlbertForSequenceClassification, AlbertTokenizer, AlbertConfig

This statement prints all of the directories in the /kaggle/input/ directory. This can be useful when trying to determine the path of the external datasets.

In [23]:
from gquest_nbdev.gquest_nbdev.fastai_huggingface import * 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pdb
from pathlib import Path 
from fastai.text import *

In [24]:

for dirname, _, filenames in os.walk('/kaggle/input'):
    print(dirname)

/kaggle/input


A utility function to set the seed for generating random numbers

In [14]:
#export
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

In [25]:
seed=42
seed_all(seed)

In [26]:
#os.chdir(Path("./gquest_nbdev"))
#os.chdir(Path("/home/mrdbarros/projetos/gquest_nbdev"))

In [27]:
model_type = 'roberta'
pretrained_model_name = 'roberta-base' # 'roberta-base-openai-detector'
DATA_ROOT = Path("../input/google-quest-challenge/")
MODEL_ROOT = Path("../input/"+pretrained_model_name)
train = pd.read_csv(DATA_ROOT / 'train.csv')
test = pd.read_csv(DATA_ROOT / 'test.csv')
sample_sub = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
print(train.shape,test.shape)
download_model=True

(6079, 41) (476, 11)


The training data. In this kernel, I'll use the `question_title`, `question_body` and `answer` columns.

In [28]:
train.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


The predicted lables are in the columns of the sample submission. Note that some labels are with respect to the question, and some are with respect to the answer.

In [29]:
labels = list(sample_sub.columns[1:].values)

In [30]:
for label in labels: print(label) 

question_asker_intent_understanding
question_body_critical
question_conversational
question_expect_short_answer
question_fact_seeking
question_has_commonly_accepted_answer
question_interestingness_others
question_interestingness_self
question_multi_intent
question_not_really_a_question
question_opinion_seeking
question_type_choice
question_type_compare
question_type_consequence
question_type_definition
question_type_entity
question_type_instructions
question_type_procedure
question_type_reason_explanation
question_type_spelling
question_well_written
answer_helpful
answer_level_of_information
answer_plausible
answer_relevance
answer_satisfaction
answer_type_instructions
answer_type_procedure
answer_type_reason_explanation
answer_well_written


# Fine Tuning

In [31]:
train[['question_title','question_body','answer']].to_csv(Path('../input/raw_text.csv'))

# Specifying Data Preprocessing 

When using pretrained models, the current data needs to be preprocessed in the same way as the data that trained the model. In ``transformers``, each model architecture is associated with 3 main types of classes:
* A **model class** to load/store a particular pre-train model.
* A **tokenizer class** to pre-process the data and make it compatible with a particular model.
* A **configuration class** to load/store the configuration of a particular model.

For the RoBERTa architecture, we use `RobertaForSequenceClassification` for the **model class**, `RobertaTokenizer` for the **tokenizer class**, and `RobertaConfig` for the **configuration class**. 

In [57]:
#export
MODEL_CLASSES = {
    'albert': (AlbertForSequenceClassification, AlbertTokenizer, AlbertConfig),
    'roberta': (RobertaModel, RobertaTokenizer,
                RobertaConfig(hidden_act="gelu_new",
                              hidden_dropout_prob=0.1,
                              attention_probs_dropout_prob=0.1,
                              #max_position_embeddings=1024,
                              layer_norm_eps=1e-12))
}

NameError: name 'AlbertForSequenceClassification' is not defined

You will see later, that those classes share a common class method ``from_pretrained(pretrained_model_name, ...)``. In our case, the parameter ``pretrained_model_name`` is a string with the shortcut name of a pre-trained model/tokenizer/configuration to load, e.g ``'bert-base-uncased'``. We can find all the shortcut names in the transformers documentation [here](https://huggingface.co/transformers/pretrained_models.html#pretrained-models).

In [32]:
# Parameters
seed = 42
use_fp16 = True
bs = 8
MAX_SEQUENCE_LENGTH = 512

In [33]:
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [34]:
if download_model:
    new_dir=Path("../input")/pretrained_model_name
    !mkdir {new_dir}
    transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
    transformer_tokenizer.save_pretrained(MODEL_ROOT)

mkdir: cannot create directory ‘../input/roberta-base’: File exists


In [35]:
model_class.pretrained_model_archive_map.keys()

dict_keys(['roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', 'roberta-large-openai-detector'])

## Implementing the RoBERTa tokenizer and numericalizer in fastai

Text data is preprocessed through tokenization and numericalization. To match the pretrained models, we need to use the same tokenization and numericalization as the model. Fortunately, the **tokenizer class** from ``transformers`` provides the correct pre-process tools that correspond to each pre-trained model.

In ``fastai``, data pre-processing is performed during the creation of the ``DataBunch``. When creating a `DataBunch`, the tokenizer and numericalizer are passed in the processor argument.

Therefore, the first step is to create a customized tokenize and numericalizer that use the correct transformer tokenizer classes. 

### Custom Tokenizer

A tokentizer takes the text and transforms it into tokens. The ``fastai`` documentation notes that: 
1. The [``TokenizeProcessor`` object](https://docs.fast.ai/text.data.html#TokenizeProcessor) takes as ``tokenizer`` argument a ``Tokenizer`` object.
2. The [``Tokenizer`` object](https://docs.fast.ai/text.transform.html#Tokenizer) takes as ``tok_func`` argument a ``BaseTokenizer`` object.
3. The [``BaseTokenizer`` object](https://docs.fast.ai/text.transform.html#BaseTokenizer) implement the function ``tokenizer(t:str) → List[str]`` that take a text ``t`` and returns the list of its tokens.

To use the RoBERTa tokenizer, we create a new class ``TransformersBaseTokenizer`` that inherits from ``BaseTokenizer`` and overwrite a new ``tokenizer`` function. It is important to note that RoBERTa requires a space to start the input string. The encoding methods should be called with ``add_prefix_space`` set to ``True``. The output of the tokenizer should have the following pattern. (Note that padding is added when the `DataBunch` is created.)

    roberta: [CLS] + prefix_space + tokens + [SEP] + padding

In [36]:
train['question_title_body']=train['question_title'] +" " + train['question_body']
test['question_title_body']=test['question_title'] +" " + test['question_body']

In [29]:
#export
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'roberta', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.max_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t) -> List[List[str]]:


        all_columns_inputs=[]
        #pdb.set_trace()
        for column_i in range(len(t)):
            inputs = self._pretrained_tokenizer.encode_plus(t[column_i],add_special_tokens=True,
                                               max_length=self.max_seq_len,truncation_strategy='longest_first')
            input_ids =  inputs["input_ids"]
            input_masks = [1] * len(input_ids)
            input_segments = inputs["token_type_ids"]
            padding_length = self.max_seq_len - len(input_ids)
            padding_id = self._pretrained_tokenizer.pad_token_id
            input_ids = input_ids + ([padding_id] * padding_length)
            input_masks = input_masks + ([0] * padding_length)
            input_segments = input_segments + ([0] * padding_length)
            all_columns_inputs.append(np.array([input_ids, input_masks, input_segments]))


        return all_columns_inputs

In [37]:
if download_model:
    transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
    transformer_tokenizer.save_pretrained(MODEL_ROOT)

In [31]:
#export
class Tokenizer_MultiColumn(Tokenizer):

    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts` in one process."
        tok = self.tok_func(self.lang)
        if self.special_cases: tok.add_special_cases(self.special_cases)
        return [self.process_text(t, tok) for t in texts]

In [38]:
transformer_tokenizer = tokenizer_class.from_pretrained(MODEL_ROOT)
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer_MultiColumn(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

### Custom Numericalizer

The numericalizer takes the the tokens, and turns them into numbers. The ``fastai`` documentation notes that:
1. The [``NumericalizeProcessor``  object](https://docs.fast.ai/text.data.html#NumericalizeProcessor) takes as ``vocab`` argument a [``Vocab`` object](https://docs.fast.ai/text.transform.html#Vocab)

To use the RoBERTa numericalizer, we create a new class ``TransformersVocab`` that inherits from ``Vocab`` and overwrite ``numericalize`` and ``textify`` functions.

In [33]:
#export
class TransformersVocab(Vocab):
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super(TransformersVocab, self).__init__(itos = [])
        self.tokenizer = tokenizer

    def numericalize(self, t:Collection[List[str]]) -> List[List[int]]:
        "Convert a list of tokens `t` to their ids."
        return t
        #return self.tokenizer.encode(t)

    def textify(self, nums:Collection[List[int]], sep=' ') -> List[List[str]]:
        "Convert a list of `nums` to their tokens."
        ret = []
        for i in range(len(nums)):
            ret.append(self.tokenizer.decode(np.array(nums[i]).tolist()[0]))
        return ret

    def __getstate__(self):
        return {'itos':self.itos, 'tokenizer':self.tokenizer}

    def __setstate__(self, state:dict):
        self.itos = state['itos']
        self.tokenizer = state['tokenizer']
        self.stoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.itos)})

### Custom processor

Now that we have our custom **tokenizer** and **numericalizer**, we can create the custom **processor**. Notice we are passing the ``include_bos = False`` and ``include_eos = False`` options. This is because ``fastai`` adds its own special tokens by default which interferes with the ``[CLS]`` and ``[SEP]`` tokens added by our custom tokenizer.

In [34]:
#export
def _multicolumn_texts(texts:Collection[str]):

    df = pd.DataFrame({i:texts[:,i] for i in range(texts.shape[1])})

    return df.iloc[:,range(texts.shape[1])].values

In [35]:
#export
class TokenizeProcessorDualBert(TokenizeProcessor):
    "`PreProcessor` that tokenizes the texts in `ds`."
    def __init__(self, ds:ItemList=None, tokenizer:Tokenizer=None, chunksize:int=10000,
                 mark_fields:bool=False, include_bos:bool=True, include_eos:bool=False):
        self.tokenizer,self.chunksize,self.mark_fields = ifnone(tokenizer, Tokenizer()),chunksize,mark_fields
        self.include_bos, self.include_eos = include_bos, include_eos

    def process_one(self, item):
        return self.tokenizer._process_all_1(_multicolumn_texts([item]))[0]

    def process(self, ds):
        ds.items = _multicolumn_texts(ds.items)
        tokens = []
        #pdb.set_trace()
        for i in progress_bar(range(0,len(ds),self.chunksize), leave=False):
            tokens += self.tokenizer.process_all(ds.items[i:i+self.chunksize])

        ds.items = tokens

In [39]:
transformer_vocab =  TransformersVocab(tokenizer = transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)

tokenize_processor = TokenizeProcessorDualBert(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)

transformer_processor = [tokenize_processor, numericalize_processor]

# Loading and Processing Data

Now that we have a custom processor, which contains the custom tokenizer and numericalizer, we can create the `DataBunch`. During the DataBunch creation, we have to pay attention to set the processor argument to our new custom processor ``transformer_processor`` and manage correctly the padding. For RoBERTa, it's usually advised to pad the inputs on the right rather than the left.

In [40]:
pad_first = bool(model_type in ['xlnet'])
pad_idx = transformer_tokenizer.pad_token_id

This kernel uses [the data block API](https://docs.fast.ai/data_block.html#The-data-block-API), to create the `DataBunch`. 

In the `DataBunch` creation, I have specified to use the 'question_title','question_body', and 'answer' columns as the training data. Recall from the introduction that some of the target answers relate to the question (title + body) and some only to the answer. It's an open question as to whether it's a good choice to stick these all together. 


In [41]:
unique_sorted_values=[np.sort(train[labels[i]].unique()) for i in range(len(labels))]
unique_sorted_values


[array([0.333333, 0.444444, 0.5     , 0.555556, 0.666667, 0.777778, 0.833333, 0.888889, 1.      ]),
 array([0.333333, 0.444444, 0.5     , 0.555556, 0.666667, 0.777778, 0.833333, 0.888889, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.333333, 0.444444, 0.5     , 0.555556, 0.666667, 0.777778, 0.833333, 0.888889, 1.      ]),
 array([0.333333, 0.444444, 0.5     , 0.555556, 0.666667, 0.777778, 0.833333, 0.888889, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),
 array([0.      , 0.333333, 0.5     , 0.666667, 1.      ]),


In [42]:
labels

['question_asker_intent_understanding',
 'question_body_critical',
 'question_conversational',
 'question_expect_short_answer',
 'question_fact_seeking',
 'question_has_commonly_accepted_answer',
 'question_interestingness_others',
 'question_interestingness_self',
 'question_multi_intent',
 'question_not_really_a_question',
 'question_opinion_seeking',
 'question_type_choice',
 'question_type_compare',
 'question_type_consequence',
 'question_type_definition',
 'question_type_entity',
 'question_type_instructions',
 'question_type_procedure',
 'question_type_reason_explanation',
 'question_type_spelling',
 'question_well_written',
 'answer_helpful',
 'answer_level_of_information',
 'answer_plausible',
 'answer_relevance',
 'answer_satisfaction',
 'answer_type_instructions',
 'answer_type_procedure',
 'answer_type_reason_explanation',
 'answer_well_written']

In [40]:
#export
def no_collate(samples:BatchSamples) -> Tuple[LongTensor, LongTensor]:
    "Function that collect samples and adds padding. Flips token order if needed"
    samples = to_data(samples)
    res=tensor(np.array([s[0] for s in samples]))

    return res, tensor(np.array([s[1] for s in samples]))

In [None]:
import pdb

In [51]:
#export
class TextClasDataBunch_Multi(TextDataBunch):
    "Create a `TextDataBunch` suitable for training an RNN classifier."
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=32, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False,
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(no_collate)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

In [42]:
#export
class TextList_Multi(TextList):
    _bunch=TextClasDataBunch_Multi

In [50]:
#export
class MixedObjectDataBunch(DataBunch):
    pass

In [None]:
#export
class MixedObjectLists(ItemLists):
    def __init__(self, path,train: ItemList, valid: ItemList):
        self.path, self.train, self.valid, self.test = path, train, valid, None


    def __repr__(self)->str:
        return f'{self.__class__.__name__};\n\nTrain: {self.train};\n\nValid: {self.valid};\n\nTest: {self.test}'

    def __getattr__(self, k):
        ft = getattr(self.train[0], k)
        if not isinstance(ft, Callable): return ft
        fv = getattr(self.valid[0], k)
        assert isinstance(fv, Callable)
        def _inner(*args, **kwargs):
            self.train = ft(*args, from_item_lists=True, **kwargs)
            assert isinstance(self.train, LabelList)
            kwargs['label_cls'] = self.train.y.__class__
            self.valid = fv(*args, from_item_lists=True, **kwargs)
            self.__class__ = LabelLists_Multi
            self.process()
            return self
        return _inner

    def __setstate__(self,data:Any): self.__dict__.update(data)

    def _label_from_list(self, labels:Iterator, label_cls:Callable=None, from_item_lists:bool=False, **kwargs)->'LabelList':
        "Label `self.items` with `labels`."
        if not from_item_lists:
            raise Exception("Your data isn't split, if you don't want a validation set, please use `split_none`.")
        labels = array(labels, dtype=object)
        label_cls = self.get_label_cls(labels, label_cls=label_cls, **kwargs)
        y = label_cls(labels, path=self.path, **kwargs)
        res = self._label_list(x=self.parent, y=y)
        return res

    def label_from_df(self, *args, **kwargs):
        "Label `self.items` from the values in `cols` in `self.inner_df`."


        for i,o in enumerate(self.train):
            ft = getattr(self.train[i], 'label_from_df')
            fv = getattr(self.valid[i], 'label_from_df')
            self.train[i]=ft(*args, from_item_lists=True, **kwargs)

            kwargs['label_cls'] = self.train[i].y.__class__
            self.valid[i] = fv(*args, from_item_lists=True, **kwargs)

        self.train_y = self.train[0].y
        self.valid_y = self.valid[0].y
        self.__class__ = LabelLists_Multi
        self.process()
        return self

In [None]:
#export
class MixedObjectList(ItemList):

    def __init__(self, item_lists):
        self.item_lists = item_lists
        self._label_list, self._split = LabelList_Multi, MixedObjectLists
        self.n = len(item_lists[0])
        self.path = Path('.')
        for i,o in enumerate(self.item_lists):
            item_lists[i].parent_data_group=weakref.ref(self)


    @classmethod
    def from_df(cls, df_list:List[DataFrame], cols_list=None,item_type_list=None, processors=None, **kwargs)->'MixedObjectList':
        res=[]

        for i,df in enumerate(df_list):
            if item_type_list[i] is TabularList:
                res.append(item_type_list[i].from_df(df, cat_names=cols_list[i], **kwargs))
            else:
                res.append(item_type_list[i].from_df(df, cols=cols_list[i], processor=processors[i], **kwargs))
        return cls(res)

    def split_by_idxs(self, train_idx, valid_idx):
        "Split the data between `train_idx` and `valid_idx`."
        train=[]
        valid=[]
        for i,o in enumerate(self.item_lists):
            self.item_lists[i]=self.item_lists[i].split_by_list(self.item_lists[i][train_idx], self.item_lists[i][valid_idx])
            self.item_lists[i].train.parent_data_group = weakref.ref(self)
            self.item_lists[i].valid.parent_data_group = weakref.ref(self)
            train.append(self.item_lists[i].train)
            valid.append(self.item_lists[i].valid)

        return self._split(self.path, train, valid)

    def split_subsets(self, train_size:float, valid_size:float, seed=None) -> 'MixedObjectLists':
        "Split the items into train set with size `train_size * n` and valid set with size `valid_size * n`."
        assert 0 < train_size < 1
        assert 0 < valid_size < 1
        assert train_size + valid_size <= 1.
        if seed is not None: np.random.seed(seed)
        n = self.n
        rand_idx = np.random.permutation(range(n))
        train_cut, valid_cut = int(train_size * n), int(valid_size * n)
        return self.split_by_idxs(rand_idx[:train_cut], rand_idx[-valid_cut:])

In [None]:
#export
class LabelList_Multi(LabelList):
    def __init__(self,parent_data_group,*args,**kwargs):
        self.parent_data_group=parent_data_group
        super().__init__(*args,**kwargs)

In [None]:
#export
class LabelLists_Multi(LabelLists):
    _bunch = MixedObjectDataBunch
    def get_processors(self):
        "Read the default class processors if none have been set."
        procs_x,procs_y = [listify(self.train[i].x._processor) for i in range_of(self.train)],listify(self.train[0].y._processor)

        xp = [ifnone(self.train[i].x.processor, [p(ds=self.train[i].x) for p in procs_x[i]]) for i in range_of(self.train)]
        yp = ifnone(self.train_y.processor, [p(ds=self.train_y) for p in procs_y])
        return xp,yp


    def process(self):
        "Process the inner datasets."
        xp, yp = self.get_processors()
        for ds, n in zip(self.lists, ['train', 'valid', 'test']):
            for i,o in enumerate(ds):
                o.process(xp[i], yp, name=n)
        # progress_bar clear the outputs so in some case warnings issued during processing disappear.
        for ds in self.lists:
            for i,o in enumerate(ds):
                if getattr(o, 'warn', False): warn(o.warn)
        return self

    def databunch(self, path:PathOrStr=None, bs:int=64, val_bs:int=None, num_workers:int=defaults.cpus,
                  dl_tfms:Optional[Collection[Callable]]=None, device:torch.device=None, collate_fn:Callable=data_collate,
                  no_check:bool=False, **kwargs)->'DataBunch':
        "Create an `DataBunch` from self, `path` will override `self.path`, `kwargs` are passed to `DataBunch.create`."
        path = Path(ifnone(path, self.path))
        databunchs=[]

        for i,o in enumerate(self.train):
            if self.test is None:
                test_index = None
            else:
                test_index=self.test[i]
            data = o._bunch.create(self.train[i], self.valid[i], test_ds=test_index, path=path, bs=bs, val_bs=val_bs,
                                    num_workers=num_workers, dl_tfms=dl_tfms, device=device, collate_fn=collate_fn, no_check=no_check, **kwargs)
            if getattr(self, 'normalize', False):#In case a normalization was serialized
                norm = self.normalize
                data.normalize((norm['mean'], norm['std']), do_x=norm['do_x'], do_y=norm['do_y'])
            data.label_list = self
            databunchs.append(data)
        databunchs[0].secondary_bunch=databunchs[1]
        return databunchs[0]

In [51]:
MixedObjectList.from_df(train, cols=['question_title_body','answer'], 
                 processor=transformer_processor)

KeyboardInterrupt: 

In [44]:
sd = (TextList_Multi.from_df(train, cols=['question_title_body','answer'], 
                 processor=transformer_processor)
                .split_subsets(train_size=0.8,valid_size=0.2)
                #.split_by_rand_pct(0.1,seed=seed)
                 .label_from_df(cols=labels,label_cls=MultiCategoryList))

KeyboardInterrupt: 

In [72]:
databunch = (sd.add_test(test[['question_title_body','answer']])
                .databunch(bs=bs))  

In [83]:
#print('[CLS] token :', transformer_tokenizer.cls_token)
#print('[SEP] token :', transformer_tokenizer.sep_token)
#print('[PAD] token :', transformer_tokenizer.pad_token)
#databunch.show_batch()

Check batch and numericalizer :

In [84]:
#print('[CLS] id :', transformer_tokenizer.cls_token_id)
#print('[SEP] id :', transformer_tokenizer.sep_token_id)
#print('[PAD] id :', pad_idx)
test_one_batch = databunch.one_batch()
#print('Batch shape : ',test_one_batch.shape)
print(test_one_batch)


(tensor([[[[    0,  2264,   473,  ...,     1,     1,     1],
          [    1,     1,     1,  ...,     0,     0,     0],
          [    0,     0,     0,  ...,     0,     0,     0]],

         [[    0,  1620,    13,  ...,     1,     1,     1],
          [    1,     1,     1,  ...,     0,     0,     0],
          [    0,     0,     0,  ...,     0,     0,     0]]],


        [[[    0, 27847,    22,  ...,     1,     1,     1],
          [    1,     1,     1,  ...,     0,     0,     0],
          [    0,     0,     0,  ...,     0,     0,     0]],

         [[    0,   113,  2847,  ...,     1,     1,     1],
          [    1,     1,     1,  ...,     0,     0,     0],
          [    0,     0,     0,  ...,     0,     0,     0]]],


        [[[    0,  7608,   473,  ...,     1,     1,     1],
          [    1,     1,     1,  ...,     0,     0,     0],
          [    0,     0,     0,  ...,     0,     0,     0]],

         [[    0, 46766, 18634,  ...,     1,     1,     1],
          [    1,     1, 

In [85]:
test_one_batch[0].shape

torch.Size([8, 2, 3, 512])

In [None]:
#export
class TabularModel(Module):
    "Basic model for tabular data."
    def __init__(self, emb_szs:ListSizes, n_cont:int, out_sz:int, layers:Collection[int], ps:Collection[float]=None,
                 emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, bn_final:bool=False):
        super().__init__()
        ps = ifnone(ps, [0]*len(layers))
        ps = listify(ps, layers)
        self.embeds = nn.ModuleList([embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(emb_drop)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        sizes = self.get_sizes(layers, out_sz)
        actns = [nn.ReLU(inplace=True) for _ in range(len(sizes)-2)] + [None]
        layers = []
        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+ps,actns)):
            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)
        if bn_final: layers.append(nn.BatchNorm1d(sizes[-1]))
        self.layers = nn.Sequential(*layers)

    def get_sizes(self, layers, out_sz):
        return [self.n_emb + self.n_cont] + layers + [out_sz]

    def forward(self, x_cat:Tensor, x_cont:Tensor) -> Tensor:
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        x = self.layers(x)
        if self.y_range is not None:
            x = (self.y_range[1]-self.y_range[0]) * torch.sigmoid(x) + self.y_range[0]
        return x

In [86]:
#export
class CustomTransformerModel(nn.Module):
    def __init__(self, transformer_model_q: PreTrainedModel, transformer_model_a: PreTrainedModel,emb_sizes=None):
        super(CustomTransformerModel,self).__init__()
        self.transformer_q = transformer_model_q
        self.transformer_a = transformer_model_a
        self.classifier = TabularModel(emb_sizes,1536, 30,[800,400],ps=[0.1,0.1])
        self.dropout = torch.nn.Dropout(0.1)

    def forward(self, input_text,input_categorical):
        #pdb.set_trace()

        q_id=input_text[:,0,0,:]
        q_mask=input_text[:,0,1,:]
        q_atn=input_text[:,0,2,:]

        a_id=input_text[:,1,0,:]
        a_mask=input_text[:,1,1,:]
        a_atn=input_text[:,1,2,:]

        logits_q = torch.mean(self.transformer_q(q_id,
                                attention_mask = q_mask, token_type_ids=q_atn)[0] ,dim=1)
        logits_a = torch.mean(self.transformer_a(a_id,
                                attention_mask = a_mask, token_type_ids=a_atn)[0],dim=1)

        output=self.dropout(torch.cat((logits_q, logits_a), dim=1))
        logits=self.classifier(input_categorical[0][0],output)
        return logits

To make the transformer adapted to multiclass classification, we need to specify the number of labels before loading the pre-trained model.

In [87]:
if download_model:
    config = config_class.from_pretrained(pretrained_model_name)
    config.save_pretrained(MODEL_ROOT)

In [88]:
config = config_class.from_pretrained(MODEL_ROOT,num_labels=200)
config.use_bfloat16 = use_fp16

In [89]:
if download_model:
    transformer_model_q = model_class.from_pretrained(pretrained_model_name, config = config)
    transformer_model_a = model_class.from_pretrained(pretrained_model_name, config = config)
    transformer_model_a.save_pretrained(MODEL_ROOT)

In [90]:
if not download_model:
    transformer_model_q = model_class.from_pretrained(MODEL_ROOT, config = config)
    transformer_model_a = model_class.from_pretrained(MODEL_ROOT, config = config)
custom_transformer_model = CustomTransformerModel(transformer_model_q = transformer_model_q,transformer_model_a=transformer_model_a)

### Adding the evaluation metric

It's important to be able to see how well the model is doing. For this competition: 
> Submissions are evaluated on the mean column-wise Spearman's correlation coefficient. The Spearman's rank correlation is computed for each target column, and the mean of these values is calculated for the submission score.

Although scipy provides an implementation of [Spearman's R](https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.stats.spearmanr.html), we also need to take the average across all of the columns. Therefore,  we need to create our own [custom metric](https://docs.fast.ai/metrics.html#Creating-your-own-metric). The custom metric is only used on the validations set.
- `on_epoch_begin`: create empty numpy arrays to hold the predictions and targets
- `on_batch_end`: after each back, append the most recent output (predictions) and targets
- `on_epoch_end`: when the epoch is finished, compute Spearman's R on the columns, and then take the average



In [91]:
#export
class AvgSpearman(Callback):

    def __init__(self, labels,*args,**kwargs):
        super().__init__(*args,**kwargs)
        self.labels=labels

    def on_epoch_begin(self, **kwargs):
        self.preds = np.empty( shape=(0, 200) )
        self.target = np.empty( shape=(0,30) )

    def on_batch_end(self, last_output, last_target, **kwargs):
        self.preds = np.append(self.preds,last_output.cpu(),axis=0)
        self.target = np.append(self.target,last_target.cpu(),axis=0)

    def on_epoch_end(self, last_metrics, **kwargs):
        pos = 0
        spearsum=0.0
        for i in range(self.target.shape[1]):
            column_distinct_size = len(self.labels[i])
            #pdb.set_trace()
            processed_target = self.target[:,i]
            processed_pred = self.preds[:,i]
            #processed_pred = torch.matmul(F.softmax(torch.tensor(self.preds[:,pos:(pos+column_distinct_size)]),1),torch.tensor(self.labels[i]))
            spearsum+=spearmanr(processed_pred,processed_target).correlation
        res = spearsum/self.target.shape[1]
        return add_metrics(last_metrics, res)

In [None]:
#export
class AvgSpearman2(Callback):

    def __init__(self,*args,**kwargs):
        super().__init__(*args,**kwargs)

    def on_epoch_begin(self, **kwargs):
        self.preds = np.empty( shape=(0, 30) )
        self.target = np.empty( shape=(0,30) )

    def on_batch_end(self, last_output, last_target, **kwargs):
        self.preds = np.append(self.preds,last_output.cpu(),axis=0)
        self.target = np.append(self.target,last_target.cpu(),axis=0)

    def on_epoch_end(self, last_metrics, **kwargs):
        pos = 0
        spearsum=0.0
        for i in range(self.target.shape[1]):
            #pdb.set_trace()
            processed_target = self.target[:,i]
            processed_pred = self.preds[:,i]
            #processed_pred = torch.matmul(F.softmax(torch.tensor(self.preds[:,pos:(pos+column_distinct_size)]),1),torch.tensor(self.labels[i]))
            spearnew=spearmanr(processed_pred,processed_target).correlation
            spearsum +=spearnew

        res = spearsum/self.target.shape[1]
        return add_metrics(last_metrics, res)

In [None]:
#export
class AddExtraBunch(LearnerCallback):
    def on_epoch_begin(self,**kwargs):
        self.secondary_train_iter=iter(self.learn.data.secondary_bunch.train_dl)
        self.secondary_valid_iter = iter(self.learn.data.secondary_bunch.valid_dl)

    def on_batch_begin(self, last_input, last_target, train, **kwargs):
        "Applies mixup to `last_input` and `last_target` if `train`."
        if train:
            categorical_input = next(self.secondary_train_iter)
        else:
            categorical_input = next(self.secondary_valid_iter)
        new_input,new_target=(last_input,categorical_input),last_target
        return {'last_input': new_input, 'last_target': new_target}

### Fastai Learner with Custom Optimizer

In fastai, the `Learner` holds the data, model and other parameter, like the optimizer. Since we're using transformers, we want to use an optimizer designed for them: the AdamW optimizer. This optimizer matches Pytorch Adam optimizer Api, therefore, it becomes straightforward to integrate it within ``fastai``. To reproduce BertAdam specific behavior, you have to set ``correct_bias = False``. We include our new AvgSpearman metric.


In [92]:
weights = torch.tensor(train[labels[0]].value_counts(normalize=True).sort_values().values,dtype=torch.float32).cuda()
weights=(1/weights)/(1/weights).sum()
weights

tensor([0.5262, 0.1913, 0.1052, 0.0929, 0.0585, 0.0126, 0.0072, 0.0040, 0.0022],
       device='cuda:0')

In [96]:
#export
import pdb
class FlattenedLoss_BWW(FlattenedLoss):
    def __init__(self,unique_sorted_values,*args,**kwargs):
        super().__init__(*args,**kwargs)
        self.unique_sorted_values=unique_sorted_values
        self.total_entropy=torch.tensor(0.0).cuda()


    def __call__(self, input:Tensor, target:Tensor, **kwargs)->Rank0Tensor:
        
        input = input.transpose(self.axis,-1).contiguous()
        target = target.transpose(self.axis,-1).contiguous()
        if self.floatify: target = target.float()
        input = input.view(-1,input.shape[-1]) if self.is_2d else input.view(-1)
        self.total_entropy=0.0
        pos = 0

        for i in range(len(self.unique_sorted_values)):
            labeled_target = torch.empty(target.shape[0], dtype=torch.long).cuda()
            for j in range(len(self.unique_sorted_values[i])):
                labeled_target[(target[:,i]== self.unique_sorted_values[i][j]).nonzero()] = j
                if j==0:
                    occurences = (target[:,i] == self.unique_sorted_values[i][j]).sum(dtype=torch.float).unsqueeze(dim=0)
                else:
                    occurences = torch.cat((occurences,(target[:,i] == self.unique_sorted_values[i][j]).sum(dtype=torch.float).unsqueeze(dim=0)),axis=0)
            new_weights=torch.where(occurences>0.,1/occurences,torch.zeros(occurences.shape).cuda())
            new_weights = new_weights / new_weights.sum()
            self.func.weight = new_weights
            #pdb.set_trace()
            self.total_entropy+=self.func.__call__(input[:,pos:(pos+len(self.unique_sorted_values[i]))],
                                              labeled_target, **kwargs)
            pos+=len(self.unique_sorted_values[i])
        return self.total_entropy/len(self.unique_sorted_values)

In [97]:
#export
def CrossEntropyFlat_BWW(unique_sorted_values,*args, axis:int=-1, **kwargs):
    "Same as `nn.CrossEntropyLoss`, but flattens input and target."
    return_loss=FlattenedLoss_BWW(unique_sorted_values,nn.CrossEntropyLoss, *args, axis=axis, **kwargs)
    return return_loss

In [98]:
from fastai.callbacks import *
from transformers import AdamW

from functools import partial
AdamW = partial(AdamW, correct_bias=False)

learner = Learner(databunch, 
                  custom_transformer_model, 
                  opt_func = AdamW,
                  loss_func = CrossEntropyFlat_BWW(unique_sorted_values=unique_sorted_values),
                  metrics=[AvgSpearman(unique_sorted_values)]
                )

# Show graph of learner stats and metrics after each epoch.
#learner.callbacks.append(ShowGraph(learner))

# Put learn in FP16 precision mode. --> Not working in the tutorial
if use_fp16: learner = learner.to_fp16()

# Training the Model

Now that we've created the Learner, we can train the model. During training, we are going to use techniques known to help in other classification tasks: **discriminative layer training**, **gradual unfreezing** and **slanted triangular learning rates**. The kernel tutorial author noted that he didn't find any documentation about influence of these techniques with transformers. I've used them because I think that these techniques are probably domain general, and will therefore give a boost in this system. 

To implement unfreezing, our model needs to be specified into different layer groups. ``fastai`` allows us to "split" the structure model into groups, [described here](https://docs.fast.ai/basic_train.html#Discriminative-layer-training).

To see the structure of the RoBERTa model, look at the output of the following:

In [99]:
print(learner.model)

CustomTransformerModel(
  (transformer_q): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

Let's check how many layer groups we currently have:

In [100]:
num_groups = len(learner.layer_groups)
print('Learner split in',num_groups,'groups')

Learner split in 1 groups


One group won't allow us to unfreeze parts of the model. The tutorial kernel suggested to divide the RoBERTa model in 14 blocks:
* 1 Embedding
* 12 transformer
* 1 classifier

list_layers = [learner.model.transformer.roberta.embeddings,
              learner.model.transformer.roberta.encoder.layer[0],
              learner.model.transformer.roberta.encoder.layer[1],
              learner.model.transformer.roberta.encoder.layer[2],
              learner.model.transformer.roberta.encoder.layer[3],
              learner.model.transformer.roberta.encoder.layer[4],
              learner.model.transformer.roberta.encoder.layer[5],
              learner.model.transformer.roberta.encoder.layer[6],
              learner.model.transformer.roberta.encoder.layer[7],
              learner.model.transformer.roberta.encoder.layer[8],
              learner.model.transformer.roberta.encoder.layer[9],
              learner.model.transformer.roberta.encoder.layer[10],
              learner.model.transformer.roberta.encoder.layer[11],
              learner.model.transformer.roberta.pooler]

learner.split(list_layers);

#list_layers = [learner.model.transformer.albert.embeddings,
#              learner.model.transformer.albert.encoder.albert_layer_groups[0],
#              learner.model.transformer.albert.pooler]

list_layers = [learner.model.transformer.embeddings,
              learner.model.transformer.encoder.layer[0],
              learner.model.transformer.encoder.layer[1],
              learner.model.transformer.encoder.layer[2],
              learner.model.transformer.encoder.layer[3],
              learner.model.transformer.encoder.layer[4],
              learner.model.transformer.encoder.layer[5],
              learner.model.transformer.encoder.layer[6],
              learner.model.transformer.encoder.layer[7],
              learner.model.transformer.encoder.layer[8],
              learner.model.transformer.encoder.layer[9],
              learner.model.transformer.encoder.layer[10],
              learner.model.transformer.encoder.layer[11],
              learner.model.transformer.pooler,
              learner.model.classifier]

#learner.split(list_layers);

Let's check that we now have 14 layer groups:

In [101]:
num_groups = len(learner.layer_groups)
print('Learner split in',num_groups,'groups')

Learner split in 1 groups


### Model Training

To train the model we will:
1. Find an appropriate initial learning rate
1. Progressively unfreeze the layers while training

During all training, we use the **Slanted Triangular Learning Rates** with the `.fit_one_cycle` command, described [here](https://docs.fast.ai/callbacks.one_cycle.html). Originally, I wanted to unfreeze the entire model, but I kept running out of space. I'll trouble shoot in other versions. 

#### Find an appropriate learning rate


Due to randomness, there can be little differences in the learning rate. Based on a few runs on my computer, I've chosen 2e-4 for the starting point of my kaggle submission. 

#### Train with progressive unfreezing

Because the code to progressively unfreeze and train the model is very repetitive, I have made it into a loop. I unfreeze only the first 5 layer groups because I run out of memory after that. The learning rates and number of epochs are mostly arbitrary.

In [102]:
unfreeze_layers = [-1,-2,-3]
learning_rates = [3e-4, 1e-5, 5e-6]
epochs = [3,4,8]

In [103]:
unfreeze_layers = [-1,-5,-9,-15]
learning_rates = [2e-4, 5e-5,  5e-5, 1e-5]
epochs = [2, 2, 3,4]
learner.loss_func

FlattenedLoss of CrossEntropyLoss()

In [104]:
#export
def model_unfreezing_and_training(num_groups,learning_rates,unfreeze_layers,epochs):
    for layer in range(0,num_groups):
        print(layer)
        if layer == num_groups-1:
            learner.unfreeze()
        else:
            learner.freeze_to(unfreeze_layers[layer])

        print('freezing to:',unfreeze_layers[layer],' - ',epochs[layer],'epochs')
        learner.fit_one_cycle(epochs[layer],
                              max_lr=slice(learning_rates[layer]*0.95**num_groups, learning_rates[layer]),
                              moms=(0.8, 0.9))

In [106]:
learner.fit_one_cycle(6, 
                              max_lr=1e-5)

epoch,train_loss,valid_loss,avg_spearman,time


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 3.94 GiB total capacity; 2.91 GiB already allocated; 26.06 MiB free; 3.06 GiB reserved in total by PyTorch)

In [105]:
learner.fit_one_cycle(20, 
                              max_lr=1e-5,
                              moms=(0.8, 0.9))


epoch,train_loss,valid_loss,avg_spearman,time


KeyboardInterrupt: 

In [None]:
model_unfreezing_and_training() #bce loss

In [None]:
model_unfreezing_and_training()

# Predictions and Submission

Now that the model is trained, we can generate our predictions from the test dataset. As [noted in other tutorials](https://mlexplained.com/2019/05/13/a-tutorial-to-fine-tuning-bert-with-fast-ai/) the function ``get_preds`` does not return elements in order by default. Therefore, we will have to resort the test elements into their correct order.

In [55]:
#export
def get_preds_as_nparray(ds_type,unique_sorted_values,databunch)  -> np.ndarray:
    """
    the get_preds method does not yield the elements in order by default
    we borrow the code from the RNNLearner to resort the elements into their correct order
    """
    preds = learner.get_preds(ds_type)[0].detach().cpu().numpy()
    pos =0
    processed_pred=torch.empty(preds.shape[0],30)
    for j in range(len(unique_sorted_values)):
        column_distinct_size = len(unique_sorted_values[j])
        #processed_pred = self.labels[torch.argmax(torch.tensor(self.preds),1)]
        processed_pred[:,j] = torch.matmul(F.softmax(torch.tensor(preds[:,pos:(pos+column_distinct_size)]),1),
                                        torch.tensor(unique_sorted_values[j],dtype=torch.float))
        pos+=column_distinct_size
    processed_pred=processed_pred.numpy()
    sampler = [i for i in databunch.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    return processed_pred[reverse_sampler, :]

> <ipython-input-55-1e71ebfc84aa>(8)get_preds_as_nparray()
-> pos =0
(Pdb) preds[0,:5]
array([0.000844, 0.000899, 0.000628, 0.001   , 0.004185], dtype=float32)
(Pdb) preds[1,:5]
array([0.000554, 0.000912, 0.000617, 0.00082 , 0.003595], dtype=float32)
(Pdb) n
> <ipython-input-55-1e71ebfc84aa>(9)get_preds_as_nparray()
-> processed_pred=torch.empty(preds.shape[0],30)
(Pdb) n
> <ipython-input-55-1e71ebfc84aa>(10)get_preds_as_nparray()
-> for j in range(len(unique_sorted_values)):
(Pdb) n
> <ipython-input-55-1e71ebfc84aa>(11)get_preds_as_nparray()
-> column_distinct_size = len(unique_sorted_values[j])
(Pdb) n
> <ipython-input-55-1e71ebfc84aa>(13)get_preds_as_nparray()
-> processed_pred[:,j] = torch.matmul(F.softmax(torch.tensor(preds[:,pos:(pos+column_distinct_size)]),1),
(Pdb) F.softmax(torch.tensor(preds[:,pos:(pos+column_distinct_size)]),1)
tensor([[0.1106, 0.1107, 0.1106,  ..., 0.1107, 0.1119, 0.1126],
        [0.1107, 0.1108, 0.1107,  ..., 0.1108, 0.1120, 0.1117],
        [0.1107, 0.11

BdbQuit: 

In [None]:
x = np.arange(10,1,-1)
labels=np.random.randn(9)
labels[a.astype(int)] 

In [63]:
test_preds,test_preds.shape

array([0.667481, 0.666484, 0.498487, 0.500345, 0.500687, 0.50056 , 0.666297, 0.666225, 0.499658, 0.496081, 0.499753,
       0.49986 , 0.498045, 0.495511, 0.497176, 0.498569, 0.500037, 0.499136, 0.499796, 0.325784, 0.667114, 0.667421,
       0.666604, 0.667614, 0.667803, 0.600912, 0.500067, 0.498771, 0.499926, 0.667574], dtype=float32)

In [None]:
test_preds,test_preds.shape

In [None]:
sample_submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
sample_submission[labels] = test_preds
sample_submission.to_csv("submission.csv", index=False)

We check the order

In [None]:
test.head()

In [None]:
sample_submission.head()

In [1]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_encoder.ipynb.
Converted 01_decoder.ipynb.
Converted Data analysis.ipynb.
This cell doesn't have an export destination and was ignored:
e
This cell doesn't have an export destination and was ignored:
e
Converted fastai_v1_transformers-BWW-Script.ipynb.
Converted fastai_v1_transformers-BWW.ipynb.
Converted fastai_v1_transformers.ipynb.
Converted index.ipynb.
