In [None]:
# default_exp examples.blurr_high_level_api

In [None]:
#all_slow

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Using the high-level Blurr API

> Show all of the high-level `BlearnerFor<Task>` classes in action here with the raw data sourced from the [Hugging Face Datasets library](https://huggingface.co/docs/datasets/index.html).

In [None]:
#export
import torch
from fastai.text.all import *

from datasets import load_dataset, concatenate_datasets
from transformers import *

from blurr.utils import *
from blurr.data.core import *
from blurr.modeling.core import *
from blurr.modeling.token_classification import *
from blurr.modeling.question_answering import *

logging.set_verbosity_error()

In [None]:
#hide_input
import pdb

from nbdev.showdoc import *
from fastcore.test import *

from fastai import __version__ as fa_version
from torch import __version__ as pt_version
from transformers import __version__ as hft_version

print("Here's what we're running with ...\n")
print(f'Using pytorch {pt_version}')
print(f'Using fastai {fa_version}')
print(f'Using transformers {hft_version}')

Here's what we're running with ...

Using pytorch 1.7.1
Using fastai 2.4
Using transformers 4.8.1


In [None]:
#cuda
#hide_input
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


The high-level Blurr API provides one liners to build your DataBlock, DataLoaders, and Learner (with sensible defaults) from a DataFrame, CSV file, or a list of dictionaries like we get back from Hugging Face Datasets.

## Sequence Classification

### Multiclassification (one input)

In [None]:
#hide
# Drop the previous section's learner (if one exists) and release cached GPU memory.
try:
    del learn
    torch.cuda.empty_cache()
except NameError:
    pass  # nothing to free: no learner has been created yet

In [None]:
raw_datasets = load_dataset('glue', 'cola') 
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

{'idx': 0, 'label': 1, 'sentence': "Our friends won't buy this analysis, let alone the next one we propose."}

{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['unacceptable', 'acceptable'], names_file=None, id=None), 'idx': Value(dtype='int32', id=None)}



Capture the indexes for both train and validation sets, use the datasets `concatenate_datasets` to put them into a single dataset, and finally use the `IndexSplitter` method to define our train/validation splits as such:

In [None]:
train_ds = raw_datasets['train']#.select(range(10000))
valid_ds = raw_datasets['validation']#.select(range(2000))

In [None]:
n_train, n_valid = train_ds.num_rows, valid_ds.num_rows
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))
raw_ds = concatenate_datasets([train_ds, valid_ds])

In [None]:
dl_kwargs = {'bs': 4, 'val_bs': 8}
learn_kwargs = { 'metrics': [accuracy] }

learn = BlearnerForSequenceClassification.from_dictionaries(raw_ds, 'distilroberta-base', 
                                                            text='sentence', label='label',
                                                            dblock_splitter=IndexSplitter(valid_idxs),
                                                            dl_kwargs=dl_kwargs, learner_kwargs=learn_kwargs)

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, trunc_at=500, max_n=5)

Unnamed: 0,text,category
0,"Everybody who has ever, worked in any office which contained any typewriter which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what I mean.",1
1,Reports the covers of which the government prescribes the height of the lettering on almost always put me to sleep.,1
2,We talked about the issues we had worked on as students and that our perspectives had changed over the years.,1
3,Paul has interviewed any student who was at the scene of the crime and Kate has interviewed them too.,0


In [None]:
learn.fit_one_cycle(1, lr_max=2e-3)

epoch,train_loss,valid_loss,accuracy,time
0,0.500305,0.48377,0.779482,01:04


In [None]:
learn.show_results(learner=learn, max_n=5)

Unnamed: 0,text,category,target
0,"Scientists at the South Hanoi Institute of Technology have succeeded in raising one dog with five legs, another with a cow's liver, and a third with no head.",1,1
1,"The newspaper has reported that they are about to appoint someone, but I can't remember who the newspaper has reported that they are about to appoint.",1,1
2,"Sandy is very anxious to see if the students will be able to solve the homework problem in a particular way, but she won't tell us which.",1,1
3,"Clinton is anxious to find out which budget dilemmas Panetta would be willing to tackle in a certain way, but he won't say in which.",1,1
4,"Harry told Sue that Albania is a lovely place for a vacation, and Tom told Sally that Albania is a lovely place for a vacation.",1,1


### Multiclassification (two inputs)

In [None]:
#hide
# Drop the previous section's learner (if one exists) and release cached GPU memory.
try:
    del learn
    torch.cuda.empty_cache()
except NameError:
    pass  # nothing to free: no learner has been created yet

In [None]:
raw_datasets = load_dataset('glue', 'mrpc') 
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

{'idx': 0, 'label': 1, 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'}

{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None), 'idx': Value(dtype='int32', id=None)}



Capture the indexes for both train and validation sets, use the datasets `concatenate_datasets` to put them into a single dataset, and finally use the `IndexSplitter` method to define our train/validation splits as such:

In [None]:
train_ds = raw_datasets['train']#.select(range(10000))
valid_ds = raw_datasets['validation']#.select(range(2000))

In [None]:
n_train, n_valid = train_ds.num_rows, valid_ds.num_rows
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))
raw_ds = concatenate_datasets([train_ds, valid_ds])

In [None]:
dl_kwargs = {'bs': 4, 'val_bs': 8}
learn_kwargs = { 'metrics': [F1Score(), accuracy] }

learn = BlearnerForSequenceClassification.from_dictionaries(raw_ds, 'distilroberta-base', 
                                                            text=['sentence1', 'sentence2'], 
                                                            label='label',
                                                            dblock_splitter=IndexSplitter(valid_idxs),
                                                            dl_kwargs=dl_kwargs, learner_kwargs=learn_kwargs)

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, trunc_at=500, max_n=5)

Unnamed: 0,text,category
0,"Amrozi accused his brother, whom he called "" the witness "", of deliberately distorting his evidence. Referring to him as only "" the witness "", Amrozi accused his brother of deliberately distorting his evidence.",1
1,"Then the authority can hold another set of public hearings on raising commuting costs, the judge said. But the judge, Louis York, said the authority can hold another set of public hearings on raising commuting costs afterward.",1
2,She met Lady Mary at her Double Bay home yesterday to thank her for the donation. She met Lady Mary for the first time at her Double Bay home in Sydney yesterday to thank her in person for the donation.,1
3,""" We disagree with the judge's decision on notice for Engine Company 261, "" said a statement by Michael A. Cardozo, the city's corporation counsel. He added that : "" We disagree with the judge's decision on notice for Engine Company 261. """,1


In [None]:
learn.fit_one_cycle(1, lr_max=2e-3)

epoch,train_loss,valid_loss,f1_score,accuracy,time
0,0.520961,0.461445,0.850318,0.769608,00:26


In [None]:
learn.show_results(learner=learn, max_n=5)

Unnamed: 0,text,category,target
0,"He said the foodservice pie business doesn 't fit the company's long-term growth strategy. "" The foodservice pie business does not fit our long-term growth strategy.",1,1
1,SPOT products run a Microsoft operating system and the company's DirectBand radio technology developed with SCA Data Systems. The DirectBand network was developed with the assistance of SCA Data Systems.,0,1
2,"Morrill's wife, Ellie, sobbed and hugged Bondeson's sister-in-law during the service. At the service Morrill's widow, Ellie, sobbed and hugged Bondeson's sister-in-law as people consoled her.",0,1
3,""" But they never climb out of the pot of beer again. "" It's just that they never climb out of the beer again. """,1,1
4,"Christina's aunt, Shelley Riling, said the defense's claims were preposterous. Christina's aunt, Shelley Riling, said she will address the court.",0,1


### Multilabel classification

In [None]:
#hide
# Drop the previous section's learner (if one exists) and release cached GPU memory.
try:
    del learn
    torch.cuda.empty_cache()
except NameError:
    pass  # nothing to free: no learner has been created yet

In [None]:
raw_datasets = load_dataset('civil_comments')
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Using custom data configuration default
Reusing dataset civil_comments (/home/wgilliam/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab)


DatasetDict({
    train: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 1804874
    })
    validation: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 97320
    })
    test: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 97320
    })
})

{'identity_attack': 0.0, 'insult': 0.0, 'obscene': 0.0, 'severe_toxicity': 0.0, 'sexual_explicit': 0.0, 'text': "This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!", 'threat': 0.0, 'toxicity': 0.0}

{'text': Value(dtype='string', id=None), 'toxicity': Value(dtype='float32', id=None), 'severe_toxicity': Value(dtype='float32', id=None), 'obscene': Value(dtype='float32', id=None), 'thr

In [None]:
lbl_cols =  ['identity_attack', 'insult', 'obscene', 'toxicity', 'severe_toxicity', 'sexual_explicit', 'threat']

In [None]:
train_ds = raw_datasets['train'].select(range(10000))
valid_ds = raw_datasets['validation'].select(range(2000))

Capture the indexes for both train and validation sets, use the datasets `concatenate_datasets` to put them into a single dataset, and finally use the `IndexSplitter` method to define our train/validation splits as such:

In [None]:
n_train, n_valid = len(train_ds), len(valid_ds)
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))
raw_ds = concatenate_datasets([train_ds, valid_ds])

The labels need to be one-hot encoded (OHE) as ints (the raw data has them as floats). We could also do this kind of preprocessing by passing in a `preprocess_func` to our `BlearnerForSequenceClassification` factory method, which is especially useful if such preprocessing depends on one or more of the Hugging Face objects (e.g., config, tokenizer, model, architecture).

In [None]:
def make_ohe(item):
    """Turn the float label scores of one example into hard 0/1 ints.

    Only the keys named in the notebook-level `lbl_cols` list are converted;
    every other field is left untouched. `np.round` rounds halves to the
    nearest even value, so a score of exactly 0.5 becomes 0.

    The example is modified in place and also returned, so the function can be
    handed straight to `raw_ds.map`.
    """
    for col in lbl_cols:
        if col in item:
            item[col] = int(np.round(item[col]))
    return item

raw_ds = raw_ds.map(make_ohe)

Loading cached processed dataset at /home/wgilliam/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab/cache-615cf128814440d8.arrow


In [None]:
dl_kwargs = {'bs': 4, 'val_bs': 8}
learn_kwargs = { 'metrics': [F1ScoreMulti(), accuracy_multi] }

# using a List[dict] such as a Hugging Face dataset
learn = BlearnerForSequenceClassification.from_dictionaries(raw_ds, 'distilroberta-base', 
                                                            text='text', label=lbl_cols,
                                                            dblock_splitter=IndexSplitter(valid_idxs),
                                                            dl_kwargs=dl_kwargs, learner_kwargs=learn_kwargs)

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, trunc_at=500, max_n=5)

Unnamed: 0,text,None
0,"Predatory patrol towing isn't a big subject, and there is no advocacy group that is paying any attention to it, but the City of Portland has completely backed off of enforcing state law where the towing predators are operating on private property, and this is Commissioner Novick's failure. He's in charge of towing.\n\nThe City has allowed Retriever Towing to operate in open violation of ADA for years at their NW Quimby lot, and there is absolutely no provision in city ordinance that takes into ac",
1,"Addiction to legal pharmaceutical opioids like Percocet, Oxycodone, Percodan, are proven to be a direct GATEWAY to heroin use. Prescription drugs are directly responsible for over 237,000 deaths yearly in the US plus over 5,000 killed due to pharmaceutical intoxicated drivers. A child is admitted to an emergency room every 9 minutes for prescription drug poisoning the USA. \n\n\nAlcohol consumption is a direct GATEWAY to domestic violence, traffic fatalities, teen pregnancies and death. Over",
2,"I'm sure you've heard the adage: ""A lie gone unchallenged becomes the truth in 24 hours."" Why would you advocate allowing lies to stand unchallenged, simply because you can deal with them? Sorry, but I find that level of complacency disgusting.\n\nUnless... The idea of floating lies is not unique to the Left, but it is a prominent arrow in their quiver and has been used so frequently, that it has permanently altered the political climate for both sides.\n\nObjoke: \n\nQ: How can you tell when a Rep",
3,"I was one of 60-odd kids who were the first kindergarten class to participate in SI, which at the time was a grand experiment with kids from all walks in across the city. With all the challenges we encountered as the first K-12 SI class, we still find ourselves largely in Portland, well educated, compassionate, independently-minded and engaged people who loved the program we participated in. Many of us refer to SI as a family. For a public education, it was truly one of a kind, and I would hope",


In [None]:
learn.fit_one_cycle(1, lr_max=2e-3)

epoch,train_loss,valid_loss,f1_score,accuracy_multi,time
0,0.029738,0.041094,0.165975,0.987499,01:14


  _warn_prf(


In [None]:
# Truncate each displayed text to 500 characters, matching the `show_batch` call above.
learn.show_results(learner=learn, trunc_at=500, max_n=5)

  _warn_prf(


Unnamed: 0,text,None,target
0,"Everyone tries to hack everyone else. I have no doubt Russia would try to hack even canada. However, the US has been doing the same, if we recall Snowden.\n\nEven Merkel's phone conversations were being tapped by the CIA. \n\nThe real purpose of this issue is political. Trump is upset because people are trying to imply that he didn't deserve his victory, that the Russians helped him. It's an ego thing. Good CEOs sometimes have giant egos. I have no problem with that as long as they produce results, I gladly buy shares in their company.\n\nOtoh, Russia did invade Crimea recently, and their missile brought down a commercial airliner and killed lots of innocent people. The world has a right to be annoyed at the Russians.\n\nIf you want to find evidence of Russians hacking, you will find them. But if you want to find China or some guy in a basement somewhere, I have no doubt you can find the same as well. Whether they succeeded or not, that's hard to prove, but there's lots of blackhats",,[]
1,"Glad to see, as Canadians we are more and more conscious and aware, of all these abuses from immigration, Trudeau can save travel time and costs if he would have paid attention in the first place, foreign labor and foreign student policies hurting the Canadians (middle-class and families and employees and youth! and disadvantaged disabled, homeless and aboriginals) The bad immigration policy list and abuse, goes on and on and on, it's hard to keep track, write it down people! Now it is high-time for all Canadian politicians to take their heads out of the sand, or wherever they may have them, learn something useful to help Canadians, and put a stop to this abuse, nonsense and madness we have been having with too much immigration, and do their jobs! do what's right. Lot's of attention and resources need to be focused on reducing immigration, fixing our bad policies, limiting foreign labor, etc. etc. it is not even funny! Nothing else matters. 40k per year. McCallum, Barton goodbye thanks",,[]
2,"Zuri, I mostly agree with you. Yes, i see that many people face those problems. Where our office / internet cafe is in Waianae is the official Homeless Outreach center for the whole Waianae Coast. Most people that want to move into the area shelters have to pass through our office to process the paperwork. So i see first hand the problems. \n.\nBut that ""minority"" you are referring to, is in fact the majority land owners along the coast, and the people that serve on the neighborhood boards, own businesses and the stores.\n.\nMy point is, at some point this community needs to grab opportunity by the ***** and take control of the community, or someone else eventually will. That someone will be land developers, or drug dealers, or both. You cant be passive and expect to accomplish great things. I hear all of the time that the people of Waianae want better. Well, EXCELLENT! Lets start with the basics and have the people on the boards and land owners kick out the drug dealers and other trash.",,[]
3,"Thank you for this article.\nErrata: My father, Dr. James Ford Lewis, was minister of the First Unitarian Church in Portland in '58-'60; he wasn't Portland's ""first Unitarian minister."" The church was founded here in 1867 (http://bit.ly/29ClQCp). \nI met Mr. Ellison at Sacramento City College, not at my own university.\nAt PCS, for ""Astoria"" I am coaching the Shoshoni language, not ""Shoshona,"" which does not exist, and there is no such thing as ""Scotch-Canadian patois."" The Scots-Canadian accent is what I spoke of. The Iowa, Arikara, Hawaiian languages will also be heard, along with another 10+ accents of English. \nMy work as OnStar's voice began with years of work at General Magic, where I recorded tens of thousands of prompts (rather than ""a succession of responses"") for a system, not doomed but premature, that supported 2.5 million users at its peak. (http://bit.ly/29Q7P7a). And I am the longest-working pro voice in *speech recognition,* not in general. That would be some bold claim!",,[]
4,"a) President Trump is not threatening to annihilate many countries. Kim is.\nb) Kim has stated that NK can produce as many hydrogen bombs as it wishes; there is no reason not to believe him.\nc) NK has tested 2 ICBMs and several intermediate range missiles.\nd) Fascists believe in racial purity, the dominance of the state over the individual, and the use of force to accomplish their goals. Kim believes that Koreans are racially superior, that the state is superior to the individual and can use any means necessary to control the individual, and Kim believes in the use of force to realize his goals. Kim is clearly a fascist.\ne) Nominally, Kim is a ""communist,"" but neither he nor the regime has any truck whatsoever with the key element of communism: ""From each according to his ability, to each according to his needs."" Moreover, Kim and the élite have lots of private property which is'streng verboten' under communism.\n\nYou have a fundamental misunderstanding of Kim and NK.\n\nVery sad.",,[]


## Token Classification

In [None]:
#hide
# Drop the previous section's learner (if one exists) and release cached GPU memory.
try:
    del learn
    torch.cuda.empty_cache()
except NameError:
    pass  # nothing to free: no learner has been created yet

In [None]:
raw_datasets = load_dataset('germeval_14') 
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset germ_eval14 (/home/wgilliam/.cache/huggingface/datasets/germ_eval14/germeval_14/2.0.0/0f174b84866aa3b8ebae65c271610520be4422405d7e8467bd24cfd493d325f0)


DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'tokens', 'ner_tags', 'nested_ner_tags'],
        num_rows: 24000
    })
    validation: Dataset({
        features: ['id', 'source', 'tokens', 'ner_tags', 'nested_ner_tags'],
        num_rows: 2200
    })
    test: Dataset({
        features: ['id', 'source', 'tokens', 'ner_tags', 'nested_ner_tags'],
        num_rows: 5100
    })
})

{'id': '0', 'ner_tags': [19, 0, 0, 0, 7, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'nested_ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'source': 'n-tv.de vom 26.02.2005 [2005-02-26] ', 'tokens': ['Schartau', 'sagte', 'dem', '"', 'Tagesspiegel', '"', 'vom', 'Freitag', ',', 'Fischer', 'sei', '"', 'in', 'einer', 'Weise', 'aufgetreten', ',', 'die', 'alles', 'andere', 'als', 'überzeugend', 'war', '"', '.']}

{'id': Value(dtype='string', id=None), 'source': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='s

Capture the indexes for both train and validation sets, use the datasets `concatenate_datasets` to put them into a single dataset, and finally use the `IndexSplitter` method to define our train/validation splits as such:

In [None]:
train_ds = raw_datasets['train']#.select(range(1000))
valid_ds = raw_datasets['validation']#.select(range(500))

In [None]:
n_train, n_valid = train_ds.num_rows, valid_ds.num_rows
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))
raw_ds = concatenate_datasets([train_ds, valid_ds])

We can grab the "labels" a token can be associated with as we do here or we can let the `BlearnerForTokenClassification` factory methods figure it out for us.

In [None]:
labels = train_ds.features['ner_tags'].feature.names
len(labels)

25

As we need to pass the tag (not the index) for each example's tokens in a list, we use the handy `datasets.map` function to create a new attribute, "token_labels", with that data.  This could also be done by passing in a `preprocess_func` to a `BlearnerForTokenClassification` factory method, which is especially useful if we need to use one or more of the Hugging Face objects (e.g., tokenizer, model, config, or architecture name).

In [None]:
def get_item_labels(example):
    """Attach the string tag of every token to `example` as 'token_labels'.

    Each integer in `example['ner_tags']` is used as an index into the
    notebook-level `labels` list. The example is updated in place and
    returned, so the function can be handed straight to `raw_ds.map`.
    """
    tag_names = []
    for tag_idx in example['ner_tags']:
        tag_names.append(labels[tag_idx])
    example['token_labels'] = tag_names
    return example
                         
raw_ds = raw_ds.map(get_item_labels)

  0%|          | 0/26200 [00:00<?, ?ex/s]

In [None]:
# Split on the dataset's own train/validation boundary (`valid_idxs` from the cell above) rather
# than at random, so validation rows never come from the training split and the reported
# metrics are reproducible from run to run.
learn = BlearnerForTokenClassification.from_dictionaries(raw_ds, 'bert-base-multilingual-cased', 
                                                         tokens='tokens', token_labels='token_labels', labels=labels,
                                                         dblock_splitter=IndexSplitter(valid_idxs), 
                                                         dl_kwargs={'bs':2})

learn.unfreeze()
# Callback passed to `fit_one_cycle` below.
fit_cbs = [HF_TokenClassMetricsCallback()]

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, max_n=2)

Unnamed: 0,token / target label
0,"[('Andere', 'O'), ('Albumtitel', 'O'), ('sind', 'O'), ('an', 'O'), ('bekannte', 'O'), ('Begriffe', 'O'), ('angelehnt', 'O'), (':', 'O'), ('Fettes', 'B-OTH'), ('Brot', 'I-OTH'), ('für', 'I-OTH'), ('die', 'I-OTH'), ('Welt', 'I-OTH'), ('(', 'O'), ('Brot', 'O'), ('für', 'O'), ('die', 'O'), ('Welt', 'O'), ('),', 'O'), ('Auf', 'O'), ('einem', 'B-OTH'), ('Auge', 'I-OTH'), ('blöd', 'I-OTH'), ('(', 'I-OTH'), ('„', 'O'), ('Auf', 'O'), ('einem', 'O'), ('Auge', 'O'), ('blind', 'O'), ('),', 'O'), ('Am', 'O'), ('Wasser', 'O'), ('gebaut', 'O'), ('(', 'B-OTH'), ('„', 'I-OTH'), ('Nah', 'I-OTH'), ('am', 'O'), ('Wasser', 'O'), ('gebaut', 'O'), (')', 'O'), ('und', 'O'), ('Strom', 'O'), ('und', 'O'), ('Drang', 'O'), ('(', 'O'), ('„', 'B-OTH'), ('Sturm', 'I-OTH'), ('und', 'I-OTH'), ('Drang', 'O'), (').', 'O')]"
1,"[('Demnach', 'O'), ('kommt', 'O'), ('die', 'O'), ('Staatsoper', 'O'), ('Unter', 'B-LOC'), ('den', 'I-LOC'), ('Linden', 'I-LOC'), ('auf', 'O'), ('72,', 'O'), ('8', 'O'), ('Prozent', 'O'), ('(', 'O'), ('76,', 'O'), ('4', 'O'), ('in', 'O'), ('2003', 'O'), ('),', 'O'), ('die', 'B-ORG'), ('Deutsche', 'I-ORG'), ('Oper', 'O'), ('auf', 'O'), ('64', 'O'), ('Prozent', 'O'), ('(', 'O'), ('61,', 'O'), ('9', 'O'), ('in', 'O'), ('2003', 'O'), (')', 'O'), ('und', 'O'), ('die', 'O'), ('Komische', 'O'), ('Oper', 'O'), ('auf', 'O'), ('52,', 'O'), ('7', 'O'), ('Prozent', 'O'), ('(', 'O'), ('48,', 'O'), ('7', 'O')]"


In [None]:
learn.fit_one_cycle(1, lr_max= 3e-5, moms=(0.8,0.7,0.8), cbs=fit_cbs)

epoch,train_loss,valid_loss,accuracy,precision,recall,f1,time
0,0.054739,0.06474,0.98047,0.852643,0.840121,0.846336,15:17


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
learn.show_results(learner=learn, max_n=2, trunc_at=10)

Unnamed: 0,token / target label / predicted label
0,"[('Jetzt', 'O', 'O'), ('noch', 'O', 'O'), ('Gleichgesinnte', 'O', 'O'), ('treffen,', 'O', 'O'), ('das', 'O', 'O'), ('wäre', 'O', 'O'), ('ein', 'O', 'O'), ('sauberer', 'O', 'O'), ('Abschluss', 'O', 'O'), ('und', 'O', 'O')]"
1,"[('Da', 'O', 'O'), ('Intrigen,', 'O', 'O'), ('Verschwörungen', 'O', 'O'), ('und', 'O', 'O'), ('das', 'O', 'O'), ('brutale', 'O', 'O'), ('Verfolgen', 'O', 'O'), ('der', 'O', 'O'), ('eigenen', 'O', 'O'), ('Interessen', 'O', 'O')]"


In [None]:
print(learn.token_classification_report)

              precision    recall  f1-score   support

         LOC       0.91      0.89      0.90      1890
    LOCderiv       0.91      0.86      0.89       664
     LOCpart       0.69      0.75      0.72       118
         ORG       0.78      0.78      0.78      1258
    ORGderiv       0.00      0.00      0.00         0
     ORGpart       0.79      0.71      0.75       200
         OTH       0.70      0.70      0.70       638
    OTHderiv       0.66      0.56      0.61        55
     OTHpart       0.17      0.62      0.27         8
         PER       0.94      0.91      0.92      1739
    PERderiv       0.00      0.00      0.00         0
     PERpart       0.34      0.31      0.33        35

   micro avg       0.85      0.84      0.85      6605
   macro avg       0.57      0.59      0.57      6605
weighted avg       0.86      0.84      0.85      6605



In [None]:
txt ="I live in California, but I'd love to travel to Scotland and visit the Macallan distillery."
txt2 = "Jane Doe loves working for ohmeow.com."

In [None]:
res = learn.blurr_predict_tokens([txt.split(), txt2.split()])
for r in res: print(f'{[(tok, lbl) for tok,lbl in zip(r[0],r[1]) ]}\n')

[('I', 'O'), ('live', 'O'), ('in', 'O'), ('California,', 'B-LOC'), ('but', 'O'), ("I'd", 'O'), ('love', 'O'), ('to', 'O'), ('travel', 'O'), ('to', 'O'), ('Scotland', 'B-LOC'), ('and', 'O'), ('visit', 'O'), ('the', 'O'), ('Macallan', 'B-ORG'), ('distillery.', 'I-ORG')]

[('Jane', 'B-PER'), ('Doe', 'I-PER'), ('loves', 'O'), ('working', 'O'), ('for', 'O'), ('ohmeow.com.', 'B-OTH')]



## Question Answering

In [None]:
#hide
# Drop the previous section's learner (if one exists) and release cached GPU memory.
try:
    del learn
    torch.cuda.empty_cache()
except NameError:
    pass  # nothing to free: no learner has been created yet

In [None]:
raw_datasets = load_dataset('squad_v2')
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset squad_v2 (/home/wgilliam/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/ba48bc29b974701e9ba8d80ac94f3e3df924aba41b764dcf9851debea7c672e4)


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

{'answers': {'answer_start': [269], 'text': ['in the late 1990s']}, 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-

In [None]:
train_ds = raw_datasets['train'].select(range(1000))

We use the `preprocess_func` here as the preprocessing is dependent upon the Hugging Face tokenizer, which will vary depending on the pretrained model we use for the task.

In [None]:
def preprocess_ds(ds, hf_arch, hf_config, hf_tokenizer, hf_model, max_seq_len, 
                  context, question, tok_ans_start, tok_ans_end):
    """Tokenize every example in `ds` and locate its answer span in token space.

    Args:
        ds: Object exposing `.map(fn)`; each item is a mapping holding the
            `context` and `question` fields plus an `answers` mapping with a
            `text` list.
        hf_arch, hf_config, hf_model: Unused by this implementation
            (presumably supplied by whoever invokes `preprocess_func`).
        hf_tokenizer: Tokenizer providing `padding_side`, `encode`,
            `convert_ids_to_tokens` and `tokenize`.
        max_seq_len: The answer span is only searched for when the tokenized
            input is strictly shorter than this value.
        context, question: Keys of the context and question fields.
        tok_ans_start, tok_ans_end: Unused; the span is always written to the
            fixed keys 'tok_answer_start' and 'tok_answer_end'.
            NOTE(review): presumably these are meant to name the output
            keys -- confirm against the caller before wiring them in.

    Returns:
        Whatever `ds.map` returns. Each example gains 'tokenized_input',
        'tokenized_input_len', 'tok_answer_start' and 'tok_answer_end'
        (end is exclusive). A span of (0, 0) means no answer was located:
        the example has no answer text, the answer tokens do not occur in the
        input, or the input is not shorter than `max_seq_len`.
    """

    def _find_answer_span(tok_input, tok_ans):
        # Return (start, exclusive end) of the first exact occurrence of `tok_ans`, else (0, 0).
        n_ans = len(tok_ans)
        if n_ans == 0 or len(tok_input) >= max_seq_len:
            return 0, 0
        for idx in range(len(tok_input) - n_ans + 1):
            if tok_input[idx:idx + n_ans] == tok_ans:
                return idx, idx + n_ans
        return 0, 0

    def _preprocess(item):
        # Sequence order follows the tokenizer's padding side: question first when padding
        # on the right, context first otherwise.
        if hf_tokenizer.padding_side == 'right':
            input_ids = hf_tokenizer.encode(item[question], item[context])
        else:
            input_ids = hf_tokenizer.encode(item[context], item[question])
        tok_input = hf_tokenizer.convert_ids_to_tokens(input_ids)

        # Examples without an answer carry an empty `text` list; treat them as "no span"
        # instead of failing on `[0]`.
        answer_texts = item['answers']['text']
        tok_ans = hf_tokenizer.tokenize(str(answer_texts[0])) if len(answer_texts) > 0 else []

        start_idx, end_idx = _find_answer_span(tok_input, tok_ans)

        item['tokenized_input'] = tok_input
        item['tokenized_input_len'] = len(tok_input)
        item['tok_answer_start'] = start_idx
        item['tok_answer_end'] = end_idx

        return item

    return ds.map(_preprocess)

In [None]:
# Checkpoint name handed to the factory method below.
pretrained_model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Build the question-answering learner; `preprocess_ds` is supplied as the `preprocess_func`.
# NOTE(review): `train_ds` is a Hugging Face `Dataset`, yet it is passed to `from_dataframe`,
# whereas the other sections of this notebook use `from_dictionaries` for datasets -- confirm
# this is intended.
# `RandomSplitter()` is given no seed, so the train/validation split presumably differs between runs.
learn = BlearnerForQuestionAnswering.from_dataframe(train_ds, pretrained_model_name,
                                                    preprocess_func=preprocess_ds, max_seq_len=256,
                                                    dblock_splitter=RandomSplitter(), dl_kwargs={ 'bs': 4 })

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, max_n=2, trunc_at=500)

Unnamed: 0,text,start/end,answer
0,"how many copies of 4 sold in the first week? her fourth studio album 4 was released on june 28, 2011 in the us. 4 sold 310, 000 copies in its first week and debuted atop the billboard 200 chart, giving beyonce her fourth consecutive number - one album in the us. the album was preceded by two of its singles "" run the world ( girls ) "" and "" best thing i never had "", which both attained moderate success. the fourth single "" love on top "" was a commercial success in the us. 4 also produced four oth","(31, 34)","310, 000"
1,"which campaign does beyonce contribute to that encourages leadership in females? in an interview published by vogue in april 2013, beyonce was asked if she considers herself a feminist, to which she said, "" that word can be very extreme... but i guess i am a modern - day feminist. i do believe in equality "". she would later align herself more publicly with the movement, sampling "" we should all be feminists "", a speech delivered by nigerian author chimamanda ngozi adichie at a tedxeuston confere","(132, 135)",ban bossy


In [None]:
learn.fit_one_cycle(3, lr_max=1e-3)

epoch,train_loss,valid_loss,time
0,2.23343,1.807422,00:47
1,1.479179,1.103898,00:47
2,1.053905,1.113511,00:47


In [None]:
learn.show_results(learner=learn, skip_special_tokens=True, max_n=2, trunc_at=500)

Unnamed: 0,text,start/end,answer,pred start/end,pred answer
0,"how was the single released? on february 6, 2016, one day before her performance at the super bowl, beyonce released a new single exclusively on music streaming service tidal called "" formation "".","(29, 30)",exclusively,"(29, 35)",exclusively on music streaming service tidal
1,"who has beyonce at number one on her five best singer / dancers? beyonce has received praise for her stage presence and voice during live performances. jarett wieselman of the new york post placed her at number one on her list of the five best singer / dancers. according to barbara ellen of the guardian beyonce is the most in - charge female artist she's seen onstage, while alice jones of the independent wrote she "" takes her role as entertainer so seriously she's almost too good. "" the ex - pre","(30, 35)",jarett wieselman,"(30, 35)",jarett wieselman


## Cleanup

In summary, whether you want to work with Blurr's low, mid, or high-level API ... we got you covered :)

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted index.ipynb.
