In [None]:
# default_exp utils

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# utils

> Various utility functions used by the blurr package.

In [None]:
#export
import sys, inspect
from enum import Enum

import pandas as pd
import torch

from transformers import *
from fastai2.text.all import *

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#cuda
# Prefer GPU #1 (the original setup), fall back to GPU #0 on single-GPU machines, and skip
# device selection entirely when CUDA is unavailable so the notebook still runs top to bottom.
if torch.cuda.is_available():
    gpu_idx = 1 if torch.cuda.device_count() > 1 else 0
    torch.cuda.set_device(gpu_idx)
    print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


In [None]:
#export
def str_to_class(classname):
    "Return the attribute named `classname` from this module's namespace (raises `AttributeError` if absent)"
    this_module = sys.modules[__name__]
    return getattr(this_module, classname)

In [None]:
#export
class Singleton:
    """Class decorator that makes the decorated class a singleton.

    The decorated name is rebound to a `Singleton` object. Calling it builds the wrapped
    class once (with the arguments of that first call) and returns that same instance on
    every later call; arguments passed to later calls are ignored.
    """
    def __init__(self,cls):
        self._cls, self._instance = cls, None

    def __call__(self, *args, **kwargs):
        # identity check: `== None` would dispatch to the instance's own `__eq__`, which can
        # report True for None and cause a fresh instance to be built on every call
        if self._instance is None: self._instance = self._cls(*args, **kwargs)
        return self._instance

`Singleton` functions as a Python decorator.  Use this above any class to turn that class into a singleton (see [here](https://python-3-patterns-idioms-test.readthedocs.io/en/latest/Singleton.html) for more info on the singleton pattern).

In [None]:
@Singleton
class TestSingleton:
    "Throwaway class used only to exercise the `Singleton` decorator"

# two "instantiations" of a singleton-wrapped class must compare equal
first, second = TestSingleton(), TestSingleton()
test_eq(first, second)

## ModelHelper

In [None]:
#export
@Singleton
class ModelHelper():
    """Queryable catalogue of the huggingface `transformers` classes visible in this module.

    On construction it inspects this module's namespace for classes whose `__module__`
    starts with `transformers.` and records, per class, its name, module, functional area
    (`modeling`, `configuration` or `tokenization`), architecture and, for models, the task
    suffix (e.g. `ForMaskedLM`, `WithLMHead`). The `get_*` methods query that table.
    """

    def __init__(self):
        # get hf classes (tokenizers, configs, models, etc...)
        transformer_classes = inspect.getmembers(sys.modules[__name__],
                                                 lambda member: inspect.isclass(member)
                                                 and member.__module__.startswith('transformers.'))

        # build a df that we can query against to get various transformers objects/info
        self._df = pd.DataFrame(transformer_classes, columns=['class_name', 'class_location'])

        # add the module each class is included in
        self._df['module'] = self._df.class_location.apply(lambda v: v.__module__)

        # remove class_location (don't need it anymore)
        self._df = self._df.drop(labels=['class_location'], axis=1)

        # break up the module into separate cols
        module_parts_df = self._df.module.str.split(".", n = -1, expand = True)
        for i in range(len(module_parts_df.columns)):
            self._df[f'module_part_{i}'] = module_parts_df[i]

        # using module part 1, break up the functional area and arch into separate cols
        # (e.g. "modeling_transfo_xl" -> functional_area="modeling", arch="transfo_xl")
        module_part_1_df = self._df.module_part_1.str.split("_", n = 1, expand = True)
        self._df[['functional_area', 'arch']] = module_part_1_df

        # if functional area = modeling, pull out the task it is built for:
        # first pass keeps everything from "For" onwards (e.g. "ForMaskedLM") ...
        model_type_df = self._df[(self._df.functional_area == 'modeling')].class_name.str.split('For', n=1, expand=True)

        model_type_df[1] = np.where(model_type_df[1].notnull(),
                                    'For' + model_type_df[1].astype(str),
                                    model_type_df[1])

        self._df['model_task'] = model_type_df[1]

        # ... second pass does the same for "With" (e.g. "WithLMHead"), falling back to the
        # "For..." value found above when the class name has no "With" in it
        model_type_df = self._df[(self._df.functional_area == 'modeling')].class_name.str.split('With', n=1, expand=True)
        model_type_df[1] = np.where(model_type_df[1].notnull(),
                                    'With' + model_type_df[1].astype(str),
                                    self._df[(self._df.functional_area == 'modeling')].model_task)

        self._df['model_task'] = model_type_df[1]

        # only need these 3 functional areas for our querying purposes
        self._df = self._df[self._df['functional_area'].isin(['modeling', 'configuration', 'tokenization'])]

    @staticmethod
    def _to_name(name_or_obj):
        "Normalise a str, a class, or an enum member (anything exposing `.name`) to a plain name"
        if isinstance(name_or_obj, str): return name_or_obj
        if inspect.isclass(name_or_obj): return name_or_obj.__name__
        return name_or_obj.name

    def get_architectures(self):
        """Used to get all the architectures supported by your `Transformers` install"""
        # `notna()` already excludes both None and NaN, so no extra `!= None` mask is needed
        return self._df[self._df.arch.notna()].arch.unique().tolist()

    def get_config(self, arch):
        """Used to locate the name of the configuration class for a given architecture.
        Raises `IndexError` if no configuration class is known for `arch`.
        """
        return self._df[(self._df.functional_area == 'configuration') & (self._df.arch == arch)].class_name.values[0]

    def get_tokenizers(self, arch):
        """Used to get the names of the available huggingface tokenizers for a given architecture.
        Note: There may be multiple tokenizers and so this returns an array of names.
        """
        return self._df[(self._df.functional_area == 'tokenization') & (self._df.arch == arch)].class_name.values

    def get_tasks(self, arch=None):
        """Get the type of tasks for which there is a custom model for (*optional: by architecture*).
        There are a number of customized models built for specific tasks like token classification,
        question/answering, LM, etc....
        """
        query = ['model_task.notna()']
        if (arch): query.append(f'arch == "{arch}"')

        return self._df.query(' & '.join(query), engine='python').model_task.unique().tolist()

    def get_models(self, arch=None, task=None):
        """The transformer models available for use (optional: by architecture | task)"""
        query = ['functional_area == "modeling"']
        if (arch): query.append(f'arch == "{arch}"')
        if (task): query.append(f'model_task == "{task}"')

        return self._df.query(' & '.join(query)).class_name.tolist()

    def get_classes_for_model(self, model_name_or_enum):
        """Get tokenizers, config, and model classes for a given model name / enum / class.
        Returns a tuple of (list of tokenizer classes, config class, model class).
        """
        model_name = self._to_name(model_name_or_enum)

        arch = self.get_model_architecture(model_name)
        tokenizers = self.get_tokenizers(arch)
        config = self.get_config(arch)

        return ([str_to_class(tok) for tok in tokenizers], str_to_class(config), str_to_class(model_name))

    def get_model_architecture(self, model_name_or_enum):
        """Get the architecture for a given model name / enum / class.
        Raises `IndexError` if the model name is not in the catalogue.
        """
        model_name = self._to_name(model_name_or_enum)
        return self._df[self._df.class_name == model_name].arch.values[0]

    def get_auto_hf_objects(self, pretrained_model_name_or_path, task, config=None):
        """Returns the architecture (str), tokenizer (obj), config (obj), and model (obj)
        given a known pre-trained model name or path and a task using Huggingface `AutoModel` capabilities.
        `task` may be an enum member (e.g. `HF_TASKS_AUTO.ForQuestionAnswering`) or its name as a str.
        If a `config` is passed in, it will be used when building the model, else the default configuration
        will be used (e.g., `AutoConfig.from_pretrained(...)`)
        """
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path) if (config is None) else config

        auto_cls = str_to_class(f'AutoModel{self._to_name(task)}')
        model = auto_cls.from_pretrained(pretrained_model_name_or_path, config=config)

        # the Auto* class hands back a concrete model, so look the architecture up by its real type
        arch = self.get_model_architecture(type(model).__name__)

        return (arch, tokenizer, config, model)

    def get_hf_objects(self, pretrained_model_name_or_path, tokenizer_cls, model_cls, config=None):
        """Returns the architecture (str), tokenizer (obj), config (obj), and model (obj)
        given a known pre-trained model name or path, a tokenizer class, and a model class.
        `model_cls` may be the model class itself, an enum member (e.g. `HF_MODELS.BertModel`), or the
        class name as a str. If a `config` object is passed in, it will be used when building the model,
        else the model's default configuration is used and returned.
        """
        tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name_or_path)

        # an actual class can be used directly; names / enum members are resolved in this module
        model_class = model_cls if inspect.isclass(model_cls) else str_to_class(self._to_name(model_cls))

        if (config is None):
            model = model_class.from_pretrained(pretrained_model_name_or_path)
            config = model.config
        else:
            model = model_class.from_pretrained(pretrained_model_name_or_path, config=config)

        arch = self.get_model_architecture(type(model).__name__)

        return (arch, tokenizer, config, model)


`ModelHelper` is a `Singleton` (there exists only one instance, and the same instance is returned upon subsequent instantiation requests).  You can get at it via the `BLURR_MODEL_HELPER` constant below.

In [None]:
# `ModelHelper` is wrapped by `Singleton`, so both calls should hand back the same object;
# `mh` is reused by the example cells further down.
mh = ModelHelper()
mh2 = ModelHelper()
test_eq(mh, mh2)

In [None]:
#hide
display_df(mh._df.head())

print(list(mh._df.model_task.unique()))
print('')
print(list(mh._df.functional_area.unique()))
print('')
print(list(mh._df.module_part_2.unique()))
print('')
print(list(mh._df.module_part_3.unique()))

Unnamed: 0,class_name,module,module_part_0,module_part_1,module_part_2,module_part_3,functional_area,arch,model_task
1,AdaptiveEmbedding,transformers.modeling_transfo_xl,transformers,modeling_transfo_xl,,,modeling,transfo_xl,
2,AlbertConfig,transformers.configuration_albert,transformers,configuration_albert,,,configuration,albert,
3,AlbertForMaskedLM,transformers.modeling_albert,transformers,modeling_albert,,,modeling,albert,ForMaskedLM
4,AlbertForPreTraining,transformers.modeling_albert,transformers,modeling_albert,,,modeling,albert,ForPreTraining
5,AlbertForQuestionAnswering,transformers.modeling_albert,transformers,modeling_albert,,,modeling,albert,ForQuestionAnswering


[None, nan, 'ForMaskedLM', 'ForPreTraining', 'ForQuestionAnswering', 'ForSequenceClassification', 'ForTokenClassification', 'ForMultipleChoice', 'WithLMHead', 'ForConditionalGeneration', 'ForNextSentencePrediction', 'ForQuestionAnsweringSimple', 'WithLMHeadModel', 'ForClassification']

['modeling', 'configuration', 'tokenization']

[None]

[None]


### Provide global helper constant

Users of this library can simply use `BLURR_MODEL_HELPER` to access all the `ModelHelper` capabilities without having to fetch an instance themselves.

In [None]:
#export
# Module-level handle to the `ModelHelper` singleton (calling `ModelHelper()` again returns this same object).
BLURR_MODEL_HELPER = ModelHelper()

In [None]:
show_doc(ModelHelper(ModelHelper).get_architectures)

<h4 id="ModelHelper.get_architectures" class="doc_header"><code>ModelHelper.get_architectures</code><a href="__main__.py#L51" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_architectures</code>()

Used to get all the architectures supported by your `Transformers` install

In [None]:
print(mh.get_architectures())

['transfo_xl', 'albert', 'auto', 'bart', 'bert', 'bert_japanese', 'ctrl', 'camembert', 'utils', 'distilbert', 'electra', 'encoder_decoder', 'flaubert', 'gpt2', 'mmbt', 'marian', 'openai', 'reformer', 'roberta', 't5', 'xlm', 'xlm_roberta', 'xlnet']


We'll also create an enum of these architectures for use downstream.

In [None]:
#export
# One enum member per architecture name returned by `get_architectures()`; values are auto-numbered from 1.
HF_ARCHITECTURES = Enum('HF_ARCHITECTURES', BLURR_MODEL_HELPER.get_architectures())

In [None]:
print(L(HF_ARCHITECTURES))

(#23) [<HF_ARCHITECTURES.transfo_xl: 1>,<HF_ARCHITECTURES.albert: 2>,<HF_ARCHITECTURES.auto: 3>,<HF_ARCHITECTURES.bart: 4>,<HF_ARCHITECTURES.bert: 5>,<HF_ARCHITECTURES.bert_japanese: 6>,<HF_ARCHITECTURES.ctrl: 7>,<HF_ARCHITECTURES.camembert: 8>,<HF_ARCHITECTURES.utils: 9>,<HF_ARCHITECTURES.distilbert: 10>...]


In [None]:
show_doc(ModelHelper(ModelHelper).get_config)

<h4 id="ModelHelper.get_config" class="doc_header"><code>ModelHelper.get_config</code><a href="__main__.py#L55" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_config</code>(**`arch`**)

Used the locate the name of the configuration class for a given architecture

In [None]:
print(mh.get_config('bert'))

BertConfig


In [None]:
show_doc(ModelHelper(ModelHelper).get_tokenizers)

<h4 id="ModelHelper.get_tokenizers" class="doc_header"><code>ModelHelper.get_tokenizers</code><a href="__main__.py#L59" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_tokenizers</code>(**`arch`**)

Used to get the available huggingface tokenizers for a given architecture. Note: There may be 
multiple tokenizers and so this returns a list.

In [None]:
print(mh.get_tokenizers('electra'))

['ElectraTokenizer' 'ElectraTokenizerFast']


In [None]:
show_doc(ModelHelper(ModelHelper).get_tasks)

<h4 id="ModelHelper.get_tasks" class="doc_header"><code>ModelHelper.get_tasks</code><a href="__main__.py#L65" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_tasks</code>(**`arch`**=*`None`*)

Get the type of tasks for which there is a custom model for (*optional: by architecture*). 
There are a number of customized models built for specific tasks like token classification, 
question/answering, LM, etc....

In [None]:
print(mh.get_tasks())
print('')
print(mh.get_tasks('bart'))

['ForMaskedLM', 'ForPreTraining', 'ForQuestionAnswering', 'ForSequenceClassification', 'ForTokenClassification', 'ForMultipleChoice', 'WithLMHead', 'ForConditionalGeneration', 'ForNextSentencePrediction', 'ForQuestionAnsweringSimple', 'WithLMHeadModel', 'ForClassification']

['ForConditionalGeneration', 'ForSequenceClassification']


We'll create an enum for tasks as well, one for all tasks and another for tasks available via huggingface's `AutoModel` capabilities

In [None]:
#export
# Every task suffix found across all model classes (e.g. `ForMaskedLM`, `WithLMHead`).
HF_TASKS_ALL = Enum('HF_TASKS_ALL', BLURR_MODEL_HELPER.get_tasks())
# Only the task suffixes found on classes whose architecture is "auto" (the `AutoModel...` classes).
HF_TASKS_AUTO = Enum('HF_TASKS_AUTO', BLURR_MODEL_HELPER.get_tasks('auto'))

In [None]:
print('--- all tasks ---')
print(L(HF_TASKS_ALL))
print('\n--- auto only ---')
print(L(HF_TASKS_AUTO))

--- all tasks ---
(#12) [<HF_TASKS_ALL.ForMaskedLM: 1>,<HF_TASKS_ALL.ForPreTraining: 2>,<HF_TASKS_ALL.ForQuestionAnswering: 3>,<HF_TASKS_ALL.ForSequenceClassification: 4>,<HF_TASKS_ALL.ForTokenClassification: 5>,<HF_TASKS_ALL.ForMultipleChoice: 6>,<HF_TASKS_ALL.WithLMHead: 7>,<HF_TASKS_ALL.ForConditionalGeneration: 8>,<HF_TASKS_ALL.ForNextSentencePrediction: 9>,<HF_TASKS_ALL.ForQuestionAnsweringSimple: 10>...]

--- auto only ---
(#6) [<HF_TASKS_AUTO.ForMultipleChoice: 1>,<HF_TASKS_AUTO.ForPreTraining: 2>,<HF_TASKS_AUTO.ForQuestionAnswering: 3>,<HF_TASKS_AUTO.ForSequenceClassification: 4>,<HF_TASKS_AUTO.ForTokenClassification: 5>,<HF_TASKS_AUTO.WithLMHead: 6>]


In [None]:
HF_TASKS_ALL.ForClassification

<HF_TASKS_ALL.ForClassification: 12>

In [None]:
show_doc(ModelHelper(ModelHelper).get_models)

<h4 id="ModelHelper.get_models" class="doc_header"><code>ModelHelper.get_models</code><a href="__main__.py#L75" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_models</code>(**`arch`**=*`None`*, **`task`**=*`None`*)

The transformer models available for use (optional: by architecture | task)

In [None]:
print(L(mh.get_models()))

(#104) ['AdaptiveEmbedding','AlbertForMaskedLM','AlbertForPreTraining','AlbertForQuestionAnswering','AlbertForSequenceClassification','AlbertForTokenClassification','AlbertModel','AlbertPreTrainedModel','AutoModel','AutoModelForMultipleChoice'...]


In [None]:
print(mh.get_models(arch='bert'))

['BertForMaskedLM', 'BertForMultipleChoice', 'BertForNextSentencePrediction', 'BertForPreTraining', 'BertForQuestionAnswering', 'BertForSequenceClassification', 'BertForTokenClassification', 'BertLayer', 'BertModel', 'BertPreTrainedModel']


In [None]:
print(mh.get_models(task='ForTokenClassification'))

['AlbertForTokenClassification', 'AutoModelForTokenClassification', 'BertForTokenClassification', 'CamembertForTokenClassification', 'DistilBertForTokenClassification', 'ElectraForTokenClassification', 'RobertaForTokenClassification', 'XLMForTokenClassification', 'XLMRobertaForTokenClassification', 'XLNetForTokenClassification']


In [None]:
print(mh.get_models(arch='bert', task='ForTokenClassification'))

['BertForTokenClassification']


We'll create another enum for the huggingface models

In [None]:
#export
# One enum member per class name in the "modeling" functional area; member names are the class names,
# so they can be passed to `ModelHelper` methods that read `.name`.
HF_MODELS = Enum('HF_MODELS', BLURR_MODEL_HELPER.get_models())

In [None]:
print(L(HF_MODELS))

(#104) [<HF_MODELS.AdaptiveEmbedding: 1>,<HF_MODELS.AlbertForMaskedLM: 2>,<HF_MODELS.AlbertForPreTraining: 3>,<HF_MODELS.AlbertForQuestionAnswering: 4>,<HF_MODELS.AlbertForSequenceClassification: 5>,<HF_MODELS.AlbertForTokenClassification: 6>,<HF_MODELS.AlbertModel: 7>,<HF_MODELS.AlbertPreTrainedModel: 8>,<HF_MODELS.AutoModel: 9>,<HF_MODELS.AutoModelForMultipleChoice: 10>...]


In [None]:
show_doc(ModelHelper(ModelHelper).get_classes_for_model)

<h4 id="ModelHelper.get_classes_for_model" class="doc_header"><code>ModelHelper.get_classes_for_model</code><a href="__main__.py#L83" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_classes_for_model</code>(**`model_name_or_enum`**)

Get tokenizers, config, and model for a given model name / enum

In [None]:
tokenizers, config, model = mh.get_classes_for_model('RobertaForSequenceClassification')

print(tokenizers[0])
print(config)
print(model)

<class 'transformers.tokenization_roberta.RobertaTokenizer'>
<class 'transformers.configuration_roberta.RobertaConfig'>
<class 'transformers.modeling_roberta.RobertaForSequenceClassification'>


In [None]:
tokenizers, config, model = mh.get_classes_for_model(HF_MODELS.DistilBertModel)

print(tokenizers[0])
print(config)
print(model)

<class 'transformers.tokenization_distilbert.DistilBertTokenizer'>
<class 'transformers.configuration_distilbert.DistilBertConfig'>
<class 'transformers.modeling_distilbert.DistilBertModel'>


In [None]:
show_doc(ModelHelper(ModelHelper).get_model_architecture)

<h4 id="ModelHelper.get_model_architecture" class="doc_header"><code>ModelHelper.get_model_architecture</code><a href="__main__.py#L93" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_model_architecture</code>(**`model_name_or_enum`**)

Get the architecture for a given model name / enum

In [None]:
mh.get_model_architecture('RobertaForSequenceClassification')

'roberta'

### Methods for loading pre-trained (configs, tokenizer, model) huggingface classes

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

In [None]:
show_doc(ModelHelper(ModelHelper).get_auto_hf_objects)

<h4 id="ModelHelper.get_auto_hf_objects" class="doc_header"><code>ModelHelper.get_auto_hf_objects</code><a href="__main__.py#L98" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_auto_hf_objects</code>(**`pretrained_model_name_or_path`**, **`task`**, **`config`**=*`None`*)

Returns the architecture (str), tokenizer (obj), config (obj), and model (obj) 
given a known pre-trained model name or path and a task using Hugginface `AutoModel` capabilities.  
If a `config` is passed in, it will be  used when building the model, else the default configuration 
will be used (e.g., `AutoConfig.from_pretrained(...)`)

In [None]:
arch, tokenizer, config, model = mh.get_auto_hf_objects("bert-base-cased-finetuned-mrpc",
                                                        task=HF_TASKS_AUTO.WithLMHead)

print(arch)
print(type(tokenizer))
print(type(config))
print(type(model))

bert
<class 'transformers.tokenization_bert.BertTokenizer'>
<class 'transformers.configuration_bert.BertConfig'>
<class 'transformers.modeling_bert.BertForMaskedLM'>


In [None]:
arch, tokenizer, config, model = mh.get_auto_hf_objects("fmikaelian/flaubert-base-uncased-squad",
                                                        task=HF_TASKS_AUTO.ForQuestionAnswering)

print(arch)
print(type(tokenizer))
print(type(config))
print(type(model))

flaubert
<class 'transformers.tokenization_flaubert.FlaubertTokenizer'>
<class 'transformers.configuration_flaubert.FlaubertConfig'>
<class 'transformers.modeling_flaubert.FlaubertForQuestionAnsweringSimple'>


In [None]:
show_doc(ModelHelper(ModelHelper).get_hf_objects)

<h4 id="ModelHelper.get_hf_objects" class="doc_header"><code>ModelHelper.get_hf_objects</code><a href="__main__.py#L113" class="source_link" style="float:right">[source]</a></h4>

> <code>ModelHelper.get_hf_objects</code>(**`pretrained_model_name_or_path`**, **`tokenizer_cls`**, **`model_cls`**, **`config`**=*`None`*)

Returns the architecture (str), tokenizer (class), config (class), and model (class) 
given a known pre-trained model name or path, a tokenizer class, and model class.  If a `config` object 
is passed in, it will be used when building the model, else the default configuration will be used.

In [None]:
arch, tokenizer, config, model = mh.get_hf_objects("bert-base-cased-finetuned-mrpc",
                                                   tokenizer_cls=BertTokenizer, 
                                                   config=None,
                                                   model_cls=HF_MODELS.BertForNextSentencePrediction)
print(arch)
print(type(tokenizer))
print(type(config))
print(type(model))


bert
<class 'transformers.tokenization_bert.BertTokenizer'>
<class 'transformers.configuration_bert.BertConfig'>
<class 'transformers.modeling_bert.BertForNextSentencePrediction'>


## Task Marker classes

These classes are provided for use with the @typedispatched `build_hf_input` in the data module.  This gives you the ability to use this new feature in fastai to alter the base huggingface tokenization strategy provided in the framework, with something particular to one of these tasks (and optionally, the type of huggingface tokenizer you are using).

In [None]:
#export
# Empty marker types, one per model task suffix; they carry no behavior and exist only to be dispatched on.
class ForMaskedLMTask:
    "Marker type for the `ForMaskedLM` task"

class ForQuestionAnsweringTask:
    "Marker type for the `ForQuestionAnswering` task"

class ForSequenceClassificationTask:
    "Marker type for the `ForSequenceClassification` task"

class ForTokenClassificationTask:
    "Marker type for the `ForTokenClassification` task"

class ForPreTrainingTask:
    "Marker type for the `ForPreTraining` task"

class WithLMHeadTask:
    "Marker type for the `WithLMHead` task"

class ForConditionalGenerationTask:
    "Marker type for the `ForConditionalGeneration` task"

class ForMultipleChoiceTask:
    "Marker type for the `ForMultipleChoice` task"

class ForNextSentencePredictionTask:
    "Marker type for the `ForNextSentencePrediction` task"

class ForQuestionAnsweringSimpleTask:
    "Marker type for the `ForQuestionAnsweringSimple` task"

class WithLMHeadModelTask:
    "Marker type for the `WithLMHeadModel` task"

class ForClassificationTask:
    "Marker type for the `ForClassification` task"

## Cleanup

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01a_data-language-modeling.ipynb.
Converted 01c_data-question-answering.ipynb.
Converted 01d_data-token-classification.ipynb.
Converted 01e_data-summarization.ipynb.
Converted 02_modeling-core.ipynb.
Converted 02a_modeling-language-modeling.ipynb.
Converted 02c_modeling-question-answering.ipynb.
Converted 02d_modeling-token-classification.ipynb.
Converted index.ipynb.
