In [None]:
# default_exp examples.blurr_high_level_api

In [None]:
#all_slow

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Using the high-level Blurr API

> Show all of the high-level `BlearnerFor<Task>` classes in action here with the raw data sourced from the [Hugging Face Datasets library](https://huggingface.co/docs/datasets/index.html).

In [None]:
#export
import os

from datasets import load_dataset, concatenate_datasets
from transformers import *
from fastai.text.all import *

from blurr.utils import *
from blurr.data.core import *
from blurr.modeling.core import *

from blurr.data.language_modeling import BertMLMStrategy, CausalLMStrategy
from blurr.modeling.language_modeling import *

from blurr.modeling.token_classification import *
from blurr.modeling.question_answering import *
from blurr.modeling.seq2seq.summarization import *
from blurr.modeling.seq2seq.translation import *

logging.set_verbosity_error()

In [None]:
#hide_input
import pdb

from fastcore.test import *
from nbverbose.showdoc import show_doc

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("Here's what we're running with ...\n")
print_versions('torch fastai transformers')

Here's what we're running with ...

torch: 1.7.1
fastai: 2.5.2
transformers: 4.9.2


In [None]:
#cuda
#hide
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


While most of the code and examples in the documentation show how to work with Blurr given a pandas DataFrame, this set of examples shows you how to use the high-level Blurr API with any Hugging Face dataset. The high-level API provides one-liners to build your DataBlock, DataLoaders, and Learner (with sensible defaults) from a DataFrame, CSV file, or a list of dictionaries, as we do here.

## Sequence Classification

### Multiclassification (one input)

In [None]:
#hide
# Drop the previous example's learner (if one exists) and release its cached GPU memory.
try:
    del learn
except NameError:
    pass  # `learn` has not been created yet (e.g. fresh kernel), so there is nothing to free
else:
    torch.cuda.empty_cache()

In [None]:
raw_datasets = load_dataset('glue', 'cola') 
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

{'idx': 0, 'label': 1, 'sentence': "Our friends won't buy this analysis, let alone the next one we propose."}

{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['unacceptable', 'acceptable'], names_file=None, id=None), 'idx': Value(dtype='int32', id=None)}



Capture the indexes for both train and validation sets, use the datasets `concatenate_datasets` to put them into a single dataset, and finally use the `IndexSplitter` method to define our train/validation splits as such:

In [None]:
train_ds = raw_datasets['train']#.select(range(10000))
valid_ds = raw_datasets['validation']#.select(range(2000))

In [None]:
n_train, n_valid = train_ds.num_rows, valid_ds.num_rows
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))
raw_ds = concatenate_datasets([train_ds, valid_ds])

In [None]:
dl_kwargs = {'bs': 4, 'val_bs': 8}
learn_kwargs = { 'metrics': [accuracy] }

learn = BlearnerForSequenceClassification.from_dictionaries(raw_ds, 'distilroberta-base', 
                                                            text_attr='sentence', label_attr='label',
                                                            dblock_splitter=IndexSplitter(valid_idxs),
                                                            dl_kwargs=dl_kwargs, learner_kwargs=learn_kwargs)
learn = learn.to_fp16()

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, trunc_at=500, max_n=5)

Unnamed: 0,text,target
0,"Everybody who has ever, worked in any office which contained any typewriter which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what I mean.",1
1,John is prouder of having gone than people who don't know him would expect me to believe he would be.,1
2,I have wanted to meet for many years the man who spent so much money planning the assassination of Kennedy.,1
3,That it was obvious that it would confuse the guards for Herschel to throw a fit is not true.,1


In [None]:
learn.fit_one_cycle(1, lr_max=2e-3)

epoch,train_loss,valid_loss,accuracy,time
0,0.489101,0.502091,0.774688,01:04


In [None]:
learn.show_results(learner=learn, max_n=5)

Unnamed: 0,text,target,prediction
0,"Scientists at the South Hanoi Institute of Technology have succeeded in raising one dog with five legs, another with a cow's liver, and a third with no head.",1,1
1,"As a teacher, you have to deal simultaneously with the administration's pressure on you to succeed, and the children's to be a nice guy.",0,0
2,"Harry told Sue that Albania is a lovely place for a vacation, and Tom told Sally that Albania is a lovely place for a vacation.",1,1
3,"Sandy is wondering whether there will be students who have to drop the class for a certain reason, but she won't reveal what.",1,1
4,"Most columnists claim that a senior White House official has been briefing them, and the newspaper today reveals which one.",1,1


`Learner.blurr_predict` works here too

In [None]:
learn.blurr_predict('Blurr aint no joke yo')

[(('0',), (#1) [tensor(0)], (#1) [tensor([0.5810, 0.4190])])]

### Multiclassification (two inputs)

In [None]:
#hide
# Drop the previous example's learner (if one exists) and release its cached GPU memory.
try:
    del learn
except NameError:
    pass  # `learn` has not been created yet (e.g. fresh kernel), so there is nothing to free
else:
    torch.cuda.empty_cache()

In [None]:
raw_datasets = load_dataset('glue', 'mrpc') 
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset glue (/home/wgilliam/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

{'idx': 0, 'label': 1, 'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'}

{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None), 'idx': Value(dtype='int32', id=None)}



In [None]:
train_ds = raw_datasets['train']#.select(range(10000))
valid_ds = raw_datasets['validation']#.select(range(2000))

In [None]:
n_train, n_valid = train_ds.num_rows, valid_ds.num_rows
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))
raw_ds = concatenate_datasets([train_ds, valid_ds])

In [None]:
dl_kwargs = {'bs': 4, 'val_bs': 8}
learn_kwargs = { 'metrics': [F1Score(), accuracy] }

learn = BlearnerForSequenceClassification.from_dictionaries(raw_ds, 'distilroberta-base', 
                                                            text_attr=['sentence1', 'sentence2'], 
                                                            label_attr='label',
                                                            dblock_splitter=IndexSplitter(valid_idxs),
                                                            dl_kwargs=dl_kwargs, learner_kwargs=learn_kwargs)
learn = learn.to_fp16()

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, trunc_at=500, max_n=5)

Unnamed: 0,text,target
0,""" In Iraq, "" Sen. Pat Roberts, R-Kan., chairman of the intelligence committee, said on CNN's "" Late Edition "" Sunday, "" we're now fighting an anti-guerrilla... effort. "" "" In Iraq, "" Sen. Pat Roberts ( R-Kan. ), chairman of the intelligence committee, said on CNN's "" Late Edition "" yesterday, "" we're now fighting an anti-guerrilla... effort. """,1
1,Media giant Vivendi Universal EAUG.PA V.N set to work sifting through bids for its U.S. entertainment empire on Monday in a multibillion-dollar auction of some of Hollywood's best-known assets. Media moguls jostled for position as the deadline for bids for Vivendi Universal's U.S. entertainment empire neared on Monday in an auction of some of Hollywood's best-known assets.,1
2,"While opposition parties have welcomed the cabinet's decision on anti-retroviral treatment, some said Health Minister Manto Tshabalala-Msimang was not fit to preside over a rollout plan. Health Minister Manto-Tshabalala Msimang is not fit to preside over an anti-retroviral treatment rollout plan, according to some opposition parties.",0
3,"Wal-Mart, Kohl's Corp., Family Dollar Stores Inc., and Big Lots Inc. were among the merchants posting May sales that fell below Wall Street's modest expectations. Wal- Mart, Kohl's Corp., Family Dollar Stores Inc., and Big Lots Inc. posted May sales that fell below Wall Street's modest expectations.",1


In [None]:
learn.fit_one_cycle(1, lr_max=2e-3)

epoch,train_loss,valid_loss,f1_score,accuracy,time
0,0.507883,0.472271,0.854839,0.779412,00:28


In [None]:
learn.show_results(learner=learn, max_n=5)

Unnamed: 0,text,target,prediction
0,"He said the foodservice pie business doesn 't fit the company's long-term growth strategy. "" The foodservice pie business does not fit our long-term growth strategy.",1,1
1,BP shares slipped 0.8 percent to 433.50 pence ( $ 6.85 ) each in afternoon trading on the London Stock Exchange. BP shares slipped 48 cents to $ 41.72 Friday in trading on the New York Stock Exchange.,1,0
2,""" There is no need for one deadline for all to create the ASEAN Economic Community, "" Thaksin said. Thus, he said, there did not have to one deadline to create the economic community.",1,1
3,"November 17's last victim was British defence attache Stephen Saunders, who was shot on an Athens road in June 2000. November 17's last victim was British defense attache Stephen Saunders, who was shot and killed at point-blank range on a busy Athens road in June 2000.",1,1
4,"Last month Intel raised its revenue guidance for the quarter to between $ 7.6 billion and $ 7.8 billion. At the end of the second quarter, Intel initially predicted sales of between $ 6.9 billion and $ 7.5 billion.",1,0


### Multilabel classification

In [None]:
#hide
# Drop the previous example's learner (if one exists) and release its cached GPU memory.
try:
    del learn
except NameError:
    pass  # `learn` has not been created yet (e.g. fresh kernel), so there is nothing to free
else:
    torch.cuda.empty_cache()

In [None]:
raw_datasets = load_dataset('civil_comments')
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Using custom data configuration default
Reusing dataset civil_comments (/home/wgilliam/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab)


DatasetDict({
    train: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 1804874
    })
    validation: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 97320
    })
    test: Dataset({
        features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
        num_rows: 97320
    })
})

{'identity_attack': 0.0, 'insult': 0.0, 'obscene': 0.0, 'severe_toxicity': 0.0, 'sexual_explicit': 0.0, 'text': "This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!", 'threat': 0.0, 'toxicity': 0.0}

{'text': Value(dtype='string', id=None), 'toxicity': Value(dtype='float32', id=None), 'severe_toxicity': Value(dtype='float32', id=None), 'obscene': Value(dtype='float32', id=None), 'thr

In [None]:
lbl_cols =  ['identity_attack', 'insult', 'obscene', 'toxicity', 'severe_toxicity', 'sexual_explicit', 'threat']

In [None]:
train_ds = raw_datasets['train'].select(range(10000))
valid_ds = raw_datasets['validation'].select(range(2000))

In [None]:
n_train, n_valid = len(train_ds), len(valid_ds)
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))
raw_ds = concatenate_datasets([train_ds, valid_ds])

The labels need to be one-hot encoded (OHE) as ints (the raw data has them as floats). We could also do this kind of preprocessing by passing a `preprocess_func` to our `BlearnerForSequenceClassification` factory method, which is especially useful if such preprocessing depends on one or more of the Hugging Face objects (e.g., config, tokenizer, model, architecture).

In [None]:
def make_ohe(item):
    """Round every label column present in `item` to the nearest integer and store it as an `int`.

    The label columns are the names listed in the notebook-level `lbl_cols`; all other
    keys are left untouched. The (mutated) `item` is returned so this can be used with
    `Dataset.map`.
    """
    present_label_cols = [col for col in lbl_cols if col in item]
    for col in present_label_cols:
        rounded = np.round(item[col])
        item[col] = int(rounded)
    return item

raw_ds = raw_ds.map(make_ohe)

Loading cached processed dataset at /home/wgilliam/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab/cache-615cf128814440d8.arrow


In [None]:
dl_kwargs = {'bs': 4, 'val_bs': 8}
learn_kwargs = { 'metrics': [F1ScoreMulti(), accuracy_multi] }

# using a List[dict] such as a Hugging Face dataset
learn = BlearnerForSequenceClassification.from_dictionaries(raw_ds, 'distilroberta-base', 
                                                            text_attr='text', label_attr=lbl_cols,
                                                            dblock_splitter=IndexSplitter(valid_idxs),
                                                            dl_kwargs=dl_kwargs, learner_kwargs=learn_kwargs)
learn = learn.to_fp16()

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, trunc_at=500, max_n=5)

Unnamed: 0,text,target
0,"I have had a question about Einstein's Special Theory of Relativity for some time which scientists all seem to run away from. Until 1887 the equations used for Relativity were the Galilean transformation equations.\n\n x'=x-vt\n y'=y\n z'=z\n t'=t\n\nAfter 1887, scientists threw away the Galilean transfo",[]
1,"I just - I just can't tell if you're serious. It's baffling. I'm baffled.\n\nSo, Martin, are you telling us you invoked your grandfather as a virtue signal of your own working-class credibility, despite never having met him? How impossibly disrespectful of you. I'm glad your grandfather has passed; it spares him the heart-stabbing shame of an ungrateful, pedantic little snot of a grandson who would seek to ride his coveralls into righteousness. He deserves better than someone like you.\n\nThe simpl","[insult, toxicity]"
2,"""The Springfield School District is an even less affluent area. And its overall graduation rate was only 66%. So a third of the students failed to graduate."" \n\n When I graduated from Springfield High School the 'teachers' didn't seem to care how 'affluent' my parents were. They expected me to meet or exceed the same standards as the rich kids. In fact, I think they expected more out of them than they did me, but definitely encouraged me when I did meet or exceed standards. My 8th grade-educ",[]
3,I don't like this new system at all...you can't copy and paste? whats up with that? anyway wow...Wow… This article makes my head hurt and is filled with so many historical inaccuracies I don’t know where to start. \n1.\tM. Reza Behnam – Is a recognized expert on middle east cultures and has written several books on this topic …Google his name and decide for yourself but as far as I am concerned his opinion stands heads and shoulders above Mr. Weinermans\n2.\tMr. W. goes to some trouble to claim vi,[]


In [None]:
learn.fit_one_cycle(1, lr_max=2e-3)

epoch,train_loss,valid_loss,f1_score,accuracy_multi,time
0,0.034387,0.040988,0.173404,0.986714,01:25


  _warn_prf(


In [None]:
# Show up to 5 examples with their targets and predictions, truncating long texts via `trunc_at`
# (the same keyword the other `show_batch`/`show_results` calls in this notebook use).
learn.show_results(learner=learn, trunc_at=500, max_n=5)

  _warn_prf(


Unnamed: 0,text,target,prediction
0,"Everyone tries to hack everyone else. I have no doubt Russia would try to hack even canada. However, the US has been doing the same, if we recall Snowden.\n\nEven Merkel's phone conversations were being tapped by the CIA. \n\nThe real purpose of this issue is political. Trump is upset because people are trying to imply that he didn't deserve his victory, that the Russians helped him. It's an ego thing. Good CEOs sometimes have giant egos. I have no problem with that as long as they produce results, I gladly buy shares in their company.\n\nOtoh, Russia did invade Crimea recently, and their missile brought down a commercial airliner and killed lots of innocent people. The world has a right to be annoyed at the Russians.\n\nIf you want to find evidence of Russians hacking, you will find them. But if you want to find China or some guy in a basement somewhere, I have no doubt you can find the same as well. Whether they succeeded or not, that's hard to prove, but there's lots of blackhats",[],[]
1,"Glad to see, as Canadians we are more and more conscious and aware, of all these abuses from immigration, Trudeau can save travel time and costs if he would have paid attention in the first place, foreign labor and foreign student policies hurting the Canadians (middle-class and families and employees and youth! and disadvantaged disabled, homeless and aboriginals) The bad immigration policy list and abuse, goes on and on and on, it's hard to keep track, write it down people! Now it is high-time for all Canadian politicians to take their heads out of the sand, or wherever they may have them, learn something useful to help Canadians, and put a stop to this abuse, nonsense and madness we have been having with too much immigration, and do their jobs! do what's right. Lot's of attention and resources need to be focused on reducing immigration, fixing our bad policies, limiting foreign labor, etc. etc. it is not even funny! Nothing else matters. 40k per year. McCallum, Barton goodbye thanks",[],[]
2,"""We will stand by the Governor as he searches for answers to the crime wave."" says Senator Kelly.\n Where the heck has Kelly been the last three years as the crime wave grew?. And just what is your job Kelly?..if the Governor is doing the one doing the searching? Oh I remember,..it`s to continue to be against broad-based taxes once again for Alaska, for any reasonable fix he comes up with, as your Senate caucus has said for four special sessions ""there will be no tax bill to raise revenue"" for whatever ""fixes"" you say we need, and that the Governor searches,,..without you apparently. They say in the media your back-pedaling now, that your now willing to have the debate over the need for a tax to put this state on a balanced keel going forward. We`ll see if your oily conflicted fellow senators agree. Voters are watching, and want a plan to fix this crime/budget issue. If it takes more cops and new taxes to get it done then let's do it. We had a tax before and nobody died from it.",[],[]
3,"Apparently the question of whether or not there was a gun will remain unless or until the cops release sufficient video footage to establish one way or the other.\n\nCops merely shouting 'drop the gun!' means nothing in such cases, as they always do that when they've decided to shoot someone, regardless of whether or not their intended victim is armed.\n\nAnd then there's this: I worked for eight years in a 92-bed county-run psychiatric facility. Many, perhaps most, of the people brought there for evaluation were brought by the cops. The rule was the cops couldn't bring their firearms into the intake building. So I had opportunity to repeatedly observe them while they disarmed outside at their patrol cars (without them seeing me).\n\nI last things they always placed in the trunk were their 'throw-downs'--the once-fired, unregistered handguns they strapped to their calves, beneath their trouser legs, to make sure that whoever they shot 'had a gun'--regardless of whether they did or not.",[],[]
4,"By lacking faith in a God or gods I am not putting my faith in the position that a God does not exist. Whether or not I accept the position that ""God or gods does/do not exist"" is a secondary questions. \n\nI've heard it expressed this way by others and it might make it clearer for you. If you have a jar of gumballs of an unknown number and state that there are an even number of gumballs in the jar I can say ""I do not accept your claim."" Doing so does not mean that I'm accepting that their is an odd number of gumballs in the jar as that is a separate question. For that you need to move down a level to gnostic-atheism/hard atheism or anti-theism.\n\nAs for atheism being a religion, you and RolandX appear to be the wriggly ones. Atheism is a position taken on a singular claim - The rejection of the claim ""A God or gods exist."" (Not necessarily the acceptance of the claim No God or gods exist). Nothing else extends from that. A position on a singular issue does not a religion make.",[],[]


## Token Classification

In [None]:
#hide
# Drop the previous example's learner (if one exists) and release its cached GPU memory.
try:
    del learn
except NameError:
    pass  # `learn` has not been created yet (e.g. fresh kernel), so there is nothing to free
else:
    torch.cuda.empty_cache()

In [None]:
raw_datasets = load_dataset('germeval_14') 
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset germ_eval14 (/home/wgilliam/.cache/huggingface/datasets/germ_eval14/germeval_14/2.0.0/0f174b84866aa3b8ebae65c271610520be4422405d7e8467bd24cfd493d325f0)


DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'tokens', 'ner_tags', 'nested_ner_tags'],
        num_rows: 24000
    })
    validation: Dataset({
        features: ['id', 'source', 'tokens', 'ner_tags', 'nested_ner_tags'],
        num_rows: 2200
    })
    test: Dataset({
        features: ['id', 'source', 'tokens', 'ner_tags', 'nested_ner_tags'],
        num_rows: 5100
    })
})

{'id': '0', 'ner_tags': [19, 0, 0, 0, 7, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'nested_ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'source': 'n-tv.de vom 26.02.2005 [2005-02-26] ', 'tokens': ['Schartau', 'sagte', 'dem', '"', 'Tagesspiegel', '"', 'vom', 'Freitag', ',', 'Fischer', 'sei', '"', 'in', 'einer', 'Weise', 'aufgetreten', ',', 'die', 'alles', 'andere', 'als', 'überzeugend', 'war', '"', '.']}

{'id': Value(dtype='string', id=None), 'source': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='s

In [None]:
train_ds = raw_datasets['train']#.select(range(1000))
valid_ds = raw_datasets['validation']#.select(range(500))

In [None]:
n_train, n_valid = train_ds.num_rows, valid_ds.num_rows
train_idxs, valid_idxs = L(range(n_train)), L(range(n_train, n_train + n_valid))
raw_ds = concatenate_datasets([train_ds, valid_ds])

We can grab the "labels" a token can be associated with as we do here or we can let the `BlearnerForTokenClassification` factory methods figure it out for us.

In [None]:
labels = train_ds.features['ner_tags'].feature.names
len(labels)

25

As we need to pass the tag (not the index) for each example's tokens in a list, we use the handy `datasets.map` function to create a new attribute, "token_labels", with that data. This could also be done by passing a `preprocess_func` to a `BlearnerForTokenClassification` factory method, which is especially useful if we need to use one or more of the Hugging Face objects (e.g., tokenizer, model, config, or architecture name).

In [None]:
def get_item_labels(example):
    """Add a 'token_labels' entry holding the tag name for each index in `example['ner_tags']`.

    Tag names are looked up in the notebook-level `labels` list. The (mutated) `example`
    is returned so this can be used with `Dataset.map`.
    """
    tag_names = []
    for tag_idx in example['ner_tags']:
        tag_names.append(labels[tag_idx])
    example['token_labels'] = tag_names
    return example
                         
raw_ds = raw_ds.map(get_item_labels)

  0%|          | 0/26200 [00:00<?, ?ex/s]

In [None]:
learn = BlearnerForTokenClassification.from_dictionaries(raw_ds, 'bert-base-multilingual-cased', 
                                                         tokens_attr='tokens', token_labels_attr='token_labels', 
                                                         labels=labels, dblock_splitter=IndexSplitter(valid_idxs), 
                                                         dl_kwargs={'bs':2})

learn.unfreeze()
fit_cbs = [HF_TokenClassMetricsCallback()]

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, max_n=2)

Unnamed: 0,token / target label
0,"[('Andere', 'O'), ('Albumtitel', 'O'), ('sind', 'O'), ('an', 'O'), ('bekannte', 'O'), ('Begriffe', 'O'), ('angelehnt', 'O'), (':', 'O'), ('Fettes', 'B-OTH'), ('Brot', 'I-OTH'), ('für', 'I-OTH'), ('die', 'I-OTH'), ('Welt', 'I-OTH'), ('(', 'O'), ('Brot', 'O'), ('für', 'O'), ('die', 'O'), ('Welt', 'O'), ('),', 'O'), ('Auf', 'O'), ('einem', 'B-OTH'), ('Auge', 'I-OTH'), ('blöd', 'I-OTH'), ('(', 'I-OTH'), ('„', 'O'), ('Auf', 'O'), ('einem', 'O'), ('Auge', 'O'), ('blind', 'O'), ('),', 'O'), ('Am', 'O'), ('Wasser', 'O'), ('gebaut', 'O'), ('(', 'B-OTH'), ('„', 'I-OTH'), ('Nah', 'I-OTH'), ('am', 'O'), ('Wasser', 'O'), ('gebaut', 'O'), (')', 'O'), ('und', 'O'), ('Strom', 'O'), ('und', 'O'), ('Drang', 'O'), ('(', 'O'), ('„', 'B-OTH'), ('Sturm', 'I-OTH'), ('und', 'I-OTH'), ('Drang', 'O'), (').', 'O')]"
1,"[('Nach', 'O'), ('seiner', 'O'), ('Rückkehr', 'O'), ('hielt', 'O'), ('Strummer', 'B-PER'), ('ein', 'O'), ('Bandmeeting', 'O'), ('ab,', 'O'), ('in', 'O'), ('dem', 'O'), ('er', 'O'), ('Sheppard,', 'O'), ('White', 'B-PER'), ('und', 'O'), ('Howard', 'B-PER'), ('mitteilte,', 'O'), ('dass', 'B-PER'), ('er', 'O'), ('nicht', 'O'), ('mehr', 'O'), ('mit', 'O'), ('ihnen', 'O'), ('arbeiten', 'O'), ('werde,', 'O'), ('da', 'O'), ('er', 'O'), ('weiter', 'O'), ('versuchen', 'O'), ('wolle,', 'O'), ('Mick', 'O'), ('Jones', 'O'), ('wieder', 'O'), ('zurück', 'O'), ('in', 'O'), ('die', 'B-PER'), ('Band', 'I-PER'), ('zu', 'O'), ('holen.', 'O')]"


In [None]:
learn.fit_one_cycle(1, lr_max= 3e-5, moms=(0.8,0.7,0.8), cbs=fit_cbs)

epoch,train_loss,valid_loss,accuracy,precision,recall,f1,time
0,0.072144,0.065398,0.979713,0.858265,0.838203,0.848115,16:06


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
learn.show_results(learner=learn, max_n=2, trunc_at=10)

Unnamed: 0,token / target label / predicted label
0,"[('Darüber', 'O', 'O'), ('hinaus', 'O', 'O'), ('produziert', 'O', 'O'), ('der', 'O', 'O'), ('hr', 'B-ORG', 'O'), ('allein', 'O', 'B-ORG'), ('oder', 'O', 'O'), ('federführend', 'O', 'O'), ('mit', 'O', 'O'), ('anderen', 'O', 'O')]"
1,"[('Papusza', 'B-PER', 'O'), ('erzählt', 'O', 'B-PER'), ('davon', 'O', 'I-PER'), ('in', 'O', 'I-PER'), ('ihrem', 'O', 'O'), ('längsten', 'O', 'O'), ('Gedicht', 'O', 'O'), (':', 'O', 'O'), ('""', 'O', 'O'), ('Ratfale', 'B-OTH', 'O')]"


In [None]:
print(learn.token_classification_report)

              precision    recall  f1-score   support

         LOC       0.91      0.90      0.90       776
    LOCderiv       0.90      0.87      0.88       243
     LOCpart       0.65      0.69      0.67        49
         ORG       0.82      0.74      0.78       553
    ORGderiv       0.00      0.00      0.00         0
     ORGpart       0.84      0.81      0.82        94
         OTH       0.69      0.71      0.70       259
    OTHderiv       0.75      0.52      0.62        23
     OTHpart       0.17      0.60      0.26         5
         PER       0.94      0.91      0.93       731
    PERderiv       0.00      0.00      0.00         0
     PERpart       0.11      0.40      0.17         5

   micro avg       0.86      0.84      0.85      2738
   macro avg       0.56      0.60      0.56      2738
weighted avg       0.87      0.84      0.85      2738



`Learner.blurr_predict_tokens` works here too

In [None]:
txt ="I live in California, but I'd love to travel to Scotland and visit the Macallan distillery."
txt2 = "Jane Doe loves working for ohmeow.com."

In [None]:
res = learn.blurr_predict_tokens([txt.split(), txt2.split()])
for r in res: print(f'{[(tok, lbl) for tok,lbl in zip(r[0],r[1]) ]}\n')

[('I', 'O'), ('live', 'O'), ('in', 'O'), ('California,', 'B-LOC'), ('but', 'O'), ("I'd", 'O'), ('love', 'O'), ('to', 'O'), ('travel', 'O'), ('to', 'O'), ('Scotland', 'B-LOC'), ('and', 'O'), ('visit', 'O'), ('the', 'O'), ('Macallan', 'B-ORG'), ('distillery.', 'I-ORG')]

[('Jane', 'B-PER'), ('Doe', 'I-PER'), ('loves', 'O'), ('working', 'O'), ('for', 'O'), ('ohmeow.com.', 'B-OTH')]



## Question Answering

In [None]:
#hide
# Drop the previous example's learner (if one exists) and release its cached GPU memory.
try:
    del learn
except NameError:
    pass  # `learn` has not been created yet (e.g. fresh kernel), so there is nothing to free
else:
    torch.cuda.empty_cache()

In [None]:
raw_datasets = load_dataset('squad_v2')
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset squad_v2 (/home/wgilliam/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/ba48bc29b974701e9ba8d80ac94f3e3df924aba41b764dcf9851debea7c672e4)


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

{'answers': {'answer_start': [269], 'text': ['in the late 1990s']}, 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-

In [None]:
train_ds = raw_datasets['train'].select(range(1000))

We use the `preprocess_func` here as the preprocessing is dependent upon the Hugging Face tokenizer, which will vary depending on the pretrained model we use for the task.

In [None]:
def preprocess_ds(ds, hf_arch, hf_config, hf_tokenizer, hf_model, max_seq_len,
                  context_attr, question_attr, answer_text_attr, tok_ans_start, tok_ans_end):
    """Add the tokenized input and a token-level answer span to every example in `ds`.

    Each example gains four keys: 'tokenized_input', 'tokenized_input_len',
    'tok_answer_start' and 'tok_answer_end'. The span is left at (0, 0) when:

    * the example has no answer text (an empty `item['answers']['text']` list),
    * the tokenized answer does not occur contiguously in the tokenized input, or
    * the tokenized input has `max_seq_len` or more tokens.

    Only `ds`, `hf_tokenizer`, `max_seq_len`, `context_attr` and `question_attr` are used.
    The remaining parameters are accepted (presumably to match the signature blurr calls a
    `preprocess_func` with) but ignored: the answer is always read from
    `item['answers']['text'][0]` and the span is always written to the fixed keys above.

    Returns whatever `ds.map` returns for the per-example function.
    """

    def _find_answer_span(tok_input, tok_ans):
        # First position where the answer tokens appear contiguously; (0, 0) if there is none.
        n_ans = len(tok_ans)
        if n_ans == 0:
            return 0, 0
        for idx in range(len(tok_input)):
            if tok_input[idx:idx + n_ans] == tok_ans:
                return idx, idx + n_ans
        return 0, 0

    def _preprocess(item):
        # The question goes first when the tokenizer pads on the right, the context otherwise.
        if hf_tokenizer.padding_side == 'right':
            first, second = item[question_attr], item[context_attr]
        else:
            first, second = item[context_attr], item[question_attr]
        tok_input = hf_tokenizer.convert_ids_to_tokens(hf_tokenizer.encode(first, second))

        # Examples without an answer have an empty list here; indexing [0] would raise IndexError.
        answer_texts = item['answers']['text']
        tok_ans = hf_tokenizer.tokenize(str(answer_texts[0])) if len(answer_texts) > 0 else []

        start_idx, end_idx = 0, 0
        if len(tok_input) < max_seq_len:
            start_idx, end_idx = _find_answer_span(tok_input, tok_ans)

        item['tokenized_input'] = tok_input
        item['tokenized_input_len'] = len(tok_input)
        item['tok_answer_start'] = start_idx
        item['tok_answer_end'] = end_idx

        return item

    return ds.map(_preprocess)

In [None]:
# Checkpoint that, going by its name, has already been fine-tuned on SQuAD.
pretrained_model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# NOTE(review): `train_ds` is a Hugging Face `Dataset`, yet this calls `from_dataframe` while every
# other example in this notebook uses `from_dictionaries` — confirm which factory method is intended.
# `preprocess_ds` is handed in as the `preprocess_func`; the validation split is random.
learn = BlearnerForQuestionAnswering.from_dataframe(train_ds, pretrained_model_name,
                                                    preprocess_func=preprocess_ds, max_seq_len=256,
                                                    dblock_splitter=RandomSplitter(), dl_kwargs={ 'bs': 4 })
learn = learn.to_fp16()

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [None]:
learn.dls.show_batch(dataloaders=learn.dls, max_n=2, trunc_at=500)

Unnamed: 0,text,start/end,answer
0,"which prominent star felt the 2009 female video of the year award should have went to beyonce instead of taylor swift? on april 4, 2008, beyonce married jay z. she publicly revealed their marriage in a video montage at the listening party for her third studio album, i am... sasha fierce, in manhattan's sony club on october 22, 2008. i am... sasha fierce was released on november 18, 2008 in the united states. the album formally introduces beyonce's alter ego sasha fierce, conceived during the mak","(0, 0)",
1,"her third album, "" i am... sasha fierce "" was released when? on april 4, 2008, beyonce married jay z. she publicly revealed their marriage in a video montage at the listening party for her third studio album, i am... sasha fierce, in manhattan's sony club on october 22, 2008. i am... sasha fierce was released on november 18, 2008 in the united states. the album formally introduces beyonce's alter ego sasha fierce, conceived during the making of her 2003 single "" crazy in love "", selling 482, 000","(0, 0)",


In [None]:
learn.fit_one_cycle(1, lr_max=1e-3)

epoch,train_loss,valid_loss,time
0,2.387673,1.632258,00:46


In [None]:
learn.show_results(learner=learn, skip_special_tokens=True, max_n=2, trunc_at=500)

Unnamed: 0,text,start/end,answer,pred start/end,pred answer
0,"who did they tie with for six top songs? at the 52nd annual grammy awards, beyonce received ten nominations, including album of the year for i am... sasha fierce, record of the year for "" halo "", and song of the year for "" single ladies ( put a ring on it ) "", among others. she tied with lauryn hill for most grammy nominations in a single year by a female artist. in 2010, beyonce was featured on lady gaga's single "" telephone "" and its music video. the song topped the us pop songs chart, becomin","(131, 134)",mariah carey,"(131, 134)",mariah carey
1,"what familial role was albert grzymała compared to in regards to frederic? two polish friends in paris were also to play important roles in chopin's life there. his fellow student at the warsaw conservatory, julian fontana, had originally tried unsuccessfully to establish himself in england ; albert grzymała, who in paris became a wealthy financier and society figure, often acted as chopin's adviser and "" gradually began to fill the role of elder brother in [ his ] life. "" fontana was to become,","(95, 97)",elder brother,"(95, 97)",elder brother


## Language modeling

In [None]:
#hide
# Drop the previous section's learner (if there is one) so its GPU memory can be reclaimed.
try: del learn
except NameError: pass  # fresh kernel: nothing to delete
# Always release cached CUDA blocks, even when there was no learner to delete.
torch.cuda.empty_cache()

In [None]:
# Load the raw WikiText-2 dataset and print its splits, first training row, and column schema.
raw_datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset wikitext (/home/wgilliam/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20)


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

{'text': ''}

{'text': Value(dtype='string', id=None)}



In [None]:
# Keep only the first 1,000 rows of each split (presumably to keep this example quick to run).
train_ds = raw_datasets['train'].select(range(1000))
valid_ds = raw_datasets['validation'].select(range(1000))

In [None]:
# Stack both subsets into a single dataset: training rows first, validation rows right after.
raw_ds = concatenate_datasets([train_ds, valid_ds])

# Row positions of each subset inside `raw_ds` (the validation positions feed `IndexSplitter`).
n_train = train_ds.num_rows
n_valid = valid_ds.num_rows
train_idxs = L(range(n_train))
valid_idxs = L(range(n_train, n_train + n_valid))

In [None]:
def remove_empty_text(example):
    """Replace a blank `text` value with a two-space placeholder.

    Despite the name, nothing is removed: an example whose `text` is empty or
    whitespace-only has it overwritten (in place) with `'  '`; any other example
    is left untouched. The same `example` dict is returned either way.
    """
    if not example['text'].strip():
        example['text'] = '  '
    return example
# Apply the placeholder substitution to every row; `raw_ds` is rebound to the mapped dataset.
raw_ds = raw_ds.map(remove_empty_text)

  0%|          | 0/2000 [00:00<?, ?ex/s]

### Causal language modeling

In [None]:
# Build a learner for causal language modeling with GPT-2 on the `text` column.
# `IndexSplitter(valid_idxs)` validates on the rows that came from the validation split;
# `bs=2` sets the batch size and `to_fp16()` switches training to mixed precision.
learn = BlearnerForLM.from_dictionaries(raw_ds, 'gpt2', text_attr='text', 
                                        lm_strategy_cls=CausalLMStrategy,
                                        dblock_splitter=IndexSplitter(valid_idxs), 
                                        dl_kwargs={'bs':2}).to_fp16()

Using pad_token, but it is not set yet.


In [None]:
# Preview two items from a batch; for causal LM the target is the input shifted by one token
# (visible in the displayed output). `trunc_at=250` shortens the displayed text.
learn.dls.show_batch(dataloaders=learn.dls, max_n=2, trunc_at=250)

Unnamed: 0,text,target
0,"A lookout aboard Weehawken spotted Atlanta at 04 : 10 on the morning of 17 June. When the latter ship closed to within about 1 @.@ 5 miles ( 2 @.@ 4 km ) of the two Union ships, she fired one round from her bow gun that passed over Weehawken and lan","lookout aboard Weehawken spotted Atlanta at 04 : 10 on the morning of 17 June. When the latter ship closed to within about 1 @.@ 5 miles ( 2 @.@ 4 km ) of the two Union ships, she fired one round from her bow gun that passed over Weehawken and lande"
1,"After the Austro @-@ Hungarian Empire collapsed in 1918, the Austrians wanted to turn the fleet over to the newly created State of Slovenes, Croats and Serbs ( later to become a part of the Kingdom of Yugoslavia ) in order to prevent the Italians fr","the Austro @-@ Hungarian Empire collapsed in 1918, the Austrians wanted to turn the fleet over to the newly created State of Slovenes, Croats and Serbs ( later to become a part of the Kingdom of Yugoslavia ) in order to prevent the Italians from cla"


In [None]:
# Train for one epoch; the metrics callback adds the `perplexity` and `lm_accuracy` columns.
learn.fit_one_cycle(1, lr_max=3e-4, cbs=[BlearnerForLM.get_metrics_cb()])

epoch,train_loss,valid_loss,perplexity,lm_accuracy,time
0,4.224899,4.506542,90.607933,0.270709,00:41


In [None]:
# Show up to two inputs with their targets and the model's predictions (`trunc_at=500` shortens the text).
learn.show_results(learner=learn, max_n=2, trunc_at=500)

Unnamed: 0,text,target,prediction
0,"Meridian is rightly considered an architectural treasure trove being one the nations most intact cities from the turn of the last century. Architecture students from around the nation and Canada are known to visit Meridian in groups as part of their coursework due to numerous structures in the city having been designed by noted architects. The only home in the US south designed by noted Canadian born Architect Louis S. Curtiss, famous for inventing the glass curtain wall skyscraper, is extant o","is rightly considered an architectural treasure trove being one the nations most intact cities from the turn of the last century. Architecture students from around the nation and Canada are known to visit Meridian in groups as part of their coursework due to numerous structures in the city having been designed by noted architects. The only home in the US south designed by noted Canadian born Architect Louis S. Curtiss, famous for inventing the glass curtain wall skyscraper, is extant on Highlan","\n a to.. of of and in the ancient of the century century.\n is are the the world are around are for be the. their of well of the of. to the and the city. been built by and architectural and\n city of the world of of by renowned architects architect and.a.Siss. is for hising the world and,,raper, famous well in the Park, The onlyfort, by Mile,, well considered an of the most buildingsistico buildingsrapers in the United. is generally considered to the'ss Three Threeman. The only Canadian architect"
1,"Ben Williams of New York magazine, wrote that the song is "" propelled by a catchy bass melody "". Joan Morgan of The Village Voice, in review of Confessions on a Dance Floor, wrote : "" The party continues admirably with the multilingual, kick @-@ your @-@ man @-@ to @-@ the @-@ curb'Sorry '. "" Stephen M. Deusner of Pitchfork Media wrote, "" The cascades of sound wash directly into'Sorry ', setting up the song's panlingual apologies and shifting bass tectonics. "" Jon Pareles of The New York Times","Williams of New York magazine, wrote that the song is "" propelled by a catchy bass melody "". Joan Morgan of The Village Voice, in review of Confessions on a Dance Floor, wrote : "" The party continues admirably with the multilingual, kick @-@ your @-@ man @-@ to @-@ the @-@ curb'Sorry '. "" Stephen M. Deusner of Pitchfork Media wrote, "" The cascades of sound wash directly into'Sorry ', setting up the song's panlingual apologies and shifting bass tectonics. "" Jon Pareles of The New York Times wrot",",, York City, and a ""a by the melodyline and\n\nie of New New Voice, wrote a of theessions of the Hotfloor, wrote that "" The song isiringably theitud the- -,-@,,-@ your @-@ your @-@ to @.,,\n\n King. Cohen of of Thefork,, : in The partyading of the and over theThe '., the the perfect.. casc-. '.'the.ributesonics. ""\nathon.to of The New York Times,, the song song of the song is of "" songs, and songs sad, and, a The ', the song.\n Bitis of The New, the song "" "", "" M of The New, wrote, the The The ', th"


`Learner.blurr_generate` works here too

In [None]:
# Continue a prompt with the fine-tuned model. `do_sample=True` means the text is sampled, so the
# output will differ between runs; `max_length`/`top_k` are presumably forwarded to Hugging Face's
# `generate` — confirm against the Blurr docs.
learn.blurr_generate('Blurr is fun to work with because', max_length=50, do_sample=True, top_k=25)

[' Blurr is fun to work with because is(e) – the number of things being the number of elements that are the number of elements that are the numbers of words that are words that are the integers in the number of words in']

### Masked language modeling

In [None]:
#hide
# Drop the previous section's learner (if there is one) so its GPU memory can be reclaimed.
try: del learn
except NameError: pass  # fresh kernel: nothing to delete
# Always release cached CUDA blocks, even when there was no learner to delete.
torch.cuda.empty_cache()

In [None]:
# Same data as the causal example, but with BERT and the masked-LM strategy.
# `IndexSplitter(valid_idxs)` validates on the rows that came from the validation split.
learn = BlearnerForLM.from_dictionaries(raw_ds, 'bert-base-uncased', text_attr='text', 
                                        lm_strategy_cls=BertMLMStrategy,
                                        dblock_splitter=IndexSplitter(valid_idxs), 
                                        dl_kwargs={'bs':2}).to_fp16()

In [None]:
# Train for one epoch; the metrics callback adds the `perplexity` and `lm_accuracy` columns.
learn.fit_one_cycle(1, lr_max=3e-4, cbs=[BlearnerForLM.get_metrics_cb()])

epoch,train_loss,valid_loss,perplexity,lm_accuracy,time
0,0.985024,0.800326,2.226267,0.664875,00:38


In [None]:
# Show up to two masked inputs with the original tokens (target) and the model's fill-ins (prediction).
learn.show_results(learner=learn, max_n=2, trunc_at=500)

Unnamed: 0,text,target,prediction
0,"meridian is right ##ly considered [MASK] [MASK] treasure tr [MASK] being one [MASK] nations most intact cities from the [MASK] of the last [catchment] [MASK] [architecture] students from around the [MASK] [MASK] canada are known to [MASK] meridian in groups as part of their course ##work due to numerous structures in the city [MASK] been designed by noted architects [MASK] the only [MASK] in the us south designed by noted canadian born architect [MASK] s [MASK] [MASK] , famous for in ##venting the glass curtain wall skyscraper , is extant on [MASK] park . the [MASK] fort designed three ##foot building is generally [MASK] [sprayed] of the best art deco skyscraper ##s in the us and is often compared to detroit ' [MASK] famed fisher building . noted california architect wallace ne ##ff [MASK] a number of homes in meridian as [well] as [MASK] the alabama black belt which ad ##jo ##ins [MASK] city across the nearby [alabama] state line . he had relatives [MASK] meridian and selma who were executives in the then thriving railroad industry and would take commissions in the area when commissions in california were lean . his work is mostly concentrated in the lower numbered blocks of [MASK] [MASK] springs drive where his 251 ##6 pop ##lar springs drive is often compared to [textiles] similarly designed falcon lair , [MASK] beverly hills home in [MASK] canyon [MASK] rudolph [MASK] ##o . one ne ##ff work was lost to [MASK] expansion of anderson [MASK] in 1990 and another in [infused] park [burned] in the 1950s . 
the meridian post office [MASK] its interior done entirely of [bronze] and verde marble is also noteworthy as a very fine [MASK] of the type of post office structures built in thriving [MASK] well to do cities in the 1920s and originally had lal ##ique lighting which was [MASK] [MASK] during a 1960s re [MASK] ##del ##ing and which are now [MASK] [MASK] residences on pop ##lar springs drive and [ric] north hills [MASK]","meridian is right ##ly considered [an] [architectural] treasure tr [##ove] being one [the] nations most intact cities from the [turn] of the last [century] [.] [architecture] students from around the [nation] [and] canada are known to [visit] meridian in groups as part of their course ##work due to numerous structures in the city [having] been designed by noted architects [.] the only [home] in the us south designed by noted canadian born architect [louis] s [.] [curtiss] , famous for in ##venting the glass curtain wall skyscraper , is extant on [highland] park . the [frank] fort designed three ##foot building is generally [considered] [one] of the best art deco skyscraper ##s in the us and is often compared to detroit ' [s] famed fisher building . noted california architect wallace ne ##ff [designed] a number of homes in meridian as [well] as [in] the alabama black belt which ad ##jo ##ins [the] city across the nearby [alabama] state line . he had relatives [in] meridian and selma who were executives in the then thriving railroad industry and would take commissions in the area when commissions in california were lean . his work is mostly concentrated in the lower numbered blocks of [pop] [##lar] springs drive where his 251 ##6 pop ##lar springs drive is often compared to [the] similarly designed falcon lair , [the] beverly hills home in [benedict] canyon [of] rudolph [valentin] ##o . one ne ##ff work was lost to [an] expansion of anderson [hospital] in 1990 and another in [marion] park [burned] in the 1950s . 
the meridian post office [with] its interior done entirely of [bronze] and verde marble is also noteworthy as a very fine [example] of the type of post office structures built in thriving [and] well to do cities in the 1920s and originally had lal ##ique lighting which was [removed] [sadly] during a 1960s re [##mo] ##del ##ing and which are now [in] [private] residences on pop ##lar springs drive and [in] north hills [.]","meridian is right ##ly considered [a] [a] treasure tr [##ove] being one [of] nations most intact cities from the [depths] of the last [catchment] [.] [architecture] students from around the [world] [of] canada are known to [visit] meridian in groups as part of their course ##work due to numerous structures in the city [having] been designed by noted architects [.] the only [building] in the us south designed by noted canadian born architect [william] s [.] [reid] , famous for in ##venting the glass curtain wall skyscraper , is extant on [anderson] park . the [george] fort designed three ##foot building is generally [the] [one] of the best art deco skyscraper ##s in the us and is often compared to detroit ' [s] famed fisher building . noted california architect wallace ne ##ff [designed] a number of homes in meridian as [well] as [in] the alabama black belt which ad ##jo ##ins [the] city across the nearby [alabama] state line . he had relatives [in] meridian and selma who were executives in the then thriving railroad industry and would take commissions in the area when commissions in california were lean . his work is mostly concentrated in the lower numbered blocks of [nearby] [oak] springs drive where his 251 ##6 pop ##lar springs drive is often compared to [his] similarly designed falcon lair , [a] beverly hills home in [the] canyon [by] rudolph [valentin] ##o . one ne ##ff work was lost to [the] expansion of anderson [park] in 1990 and another in [anderson] park [burned] in the 1950s . 
the meridian post office [with] its interior done entirely of [bronze] and verde marble is also noteworthy as a very fine [example] of the type of post office structures built in thriving [and] well to do cities in the 1920s and originally had lal ##ique lighting which was [also] [removed] during a 1960s re [##mo] ##del ##ing and which are now [the] [in] residences on pop ##lar springs drive and [the] north hills [.]"
1,"meridian is served by the meridian @ - @ lauderdale county public library , located at the [MASK] of 7th street and 26th avenue . the city originally [virus] two [MASK] libraries , both built in [MASK] [MASK] one for blacks [MASK] [MASK] for whites [cases] a group of [MASK] had formed the fort ##night ##ly [book] and magazine [MASK] [MASK] the 1880s and began raising money to build a library for the city . the books they collected and shared within the [MASK] were later the basis of the library collection for [MASK] . [MASK] wide [MASK] for the library [MASK] the club enlisted israel marks , a city leader , to approach the [MASK] [philanthropist] andrew [MASK] for funding assistance . the library for blacks was built at 13th street and 28th avenue on land donated by st . paul methodist [MASK] [MASK] and the library for whites was [MASK] in a building originally [MASK] by members of [MASK] first presbyterian [MASK] of meridian , who sold it [MASK] [MASK] city on [MASK] 25 [,] 1911 [MASK] the african [MASK] library was the only library for blacks in [MASK] state until after world war i [MASK] is the only carnegie library [MASK] [MASK] for [snooker] americans in the country . [MASK] two libraries served the city until 1967 , when the [institutions] became integrated because of the civil rights act of 1964 , combined their [hull] [MASK] and moved [liquid] materials [MASK] their current location . the former [MASK] library [was] renovated and converted into the meridian museum of art in 1970 , and the [MASK] [MASK] @ - @ american library [MASK] demolished [MASK] may 28 , 2008 [MASK]","meridian is served by the meridian @ - @ lauderdale county public library , located at the [corner] of 7th street and 26th avenue . the city originally [had] two [carnegie] libraries , both built in [1913] [–] one for blacks [and] [one] for whites [.] 
a group of [women] had formed the fort ##night ##ly [book] and magazine [club] [in] the 1880s and began raising money to build a library for the city . the books they collected and shared within the [club] were later the basis of the library collection for [meridian] . [with] wide [support] for the library [,] the club enlisted israel marks , a city leader , to approach the [national] [philanthropist] andrew [carnegie] for funding assistance . the library for blacks was built at 13th street and 28th avenue on land donated by st . paul methodist [church] [,] and the library for whites was [established] in a building originally [owned] by members of [the] first presbyterian [church] of meridian , who sold it [to] [the] city on [september] 25 [,] 1911 [.] the african [american] library was the only library for blacks in [the] state until after world war i [and] is the only carnegie library [ever] [built] for [african] americans in the country . [the] two libraries served the city until 1967 , when the [institutions] became integrated because of the civil rights act of 1964 , combined their [collections] [,] and moved [all] materials [to] their current location . the former [white] library [was] renovated and converted into the meridian museum of art in 1970 , and the [former] [african] @ - @ american library [was] demolished [on] may 28 , 2008 [.]","meridian is served by the meridian @ - @ lauderdale county public library , located at the [corner] of 7th street and 26th avenue . the city originally [had] two [carnegie] libraries , both built in [1886] [,] one for blacks [and] [one] for whites [.] a group of [members] had formed the fort ##night ##ly [book] and magazine [club] [in] the 1880s and began raising money to build a library for the city . the books they collected and shared within the [library] were later the basis of the library collection for [blacks] . 
[a] wide [support] for the library [,] the club enlisted israel marks , a city leader , to approach the [local] [philanthropist] andrew [carnegie] for funding assistance . the library for blacks was built at 13th street and 28th avenue on land donated by st . paul methodist [church] [church] and the library for whites was [located] in a building originally [owned] by members of [the] first presbyterian [church] of meridian , who sold it [to] [the] city on [june] 25 [,] 1911 [.] the african [american] library was the only library for blacks in [the] state until after world war i [and] is the only carnegie library [,] [only] for [african] americans in the country . [the] two libraries served the city until 1967 , when the [institutions] became integrated because of the civil rights act of 1964 , combined their [hull] [##s] and moved [their] materials [to] their current location . the former [city] library [was] renovated and converted into the meridian museum of art in 1970 , and the [meridian] [meridian] @ - @ american library [was] demolished [on] may 28 , 2008 [.]"


In [None]:
# Grab the Blurr transform from the DataLoaders; it is used below for its `hf_tokenizer`.
tfm = first_blurr_tfm(learn.dls)

`Learner.blurr_fill_mask` works here too

In [None]:
# Take the mask token from the tokenizer instead of hard-coding it, then request five candidate fills.
learn.blurr_fill_mask(f'Blurr is a {tfm.hf_tokenizer.mask_token}.', n_preds=5)

['Blurr is a word.',
 'Blurr is a book.',
 'Blurr is a game.',
 'Blurr is a problem.',
 'Blurr is a concept.']

## Summarization

In [None]:
#hide
# Drop the previous section's learner (if there is one) so its GPU memory can be reclaimed.
try: del learn
except NameError: pass  # fresh kernel: nothing to delete
# Always release cached CUDA blocks, even when there was no learner to delete.
torch.cuda.empty_cache()

In [None]:
# Load CNN/DailyMail (config 3.0.0) and print its splits, first training row, and column schema.
raw_datasets = load_dataset("cnn_dailymail", '3.0.0')
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset cnn_dailymail (/home/wgilliam/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

{'article': 'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. The

In [None]:
# Keep only the first 1,000 rows of each split (presumably to keep this example quick to run).
train_ds = raw_datasets['train'].select(range(1000))
valid_ds = raw_datasets['validation'].select(range(1000))

In [None]:
# Stack both subsets into a single dataset: training rows first, validation rows right after.
raw_ds = concatenate_datasets([train_ds, valid_ds])

# Row positions of each subset inside `raw_ds` (the validation positions feed `IndexSplitter`).
n_train = train_ds.num_rows
n_valid = valid_ds.num_rows
train_idxs = L(range(n_train))
valid_idxs = L(range(n_train, n_train + n_valid))

In [None]:
# Build a summarization learner on BART: `article` is the input and `highlights` the target.
# NOTE(review): `max_length`/`max_target_length` presumably cap the tokenized input and summary
# lengths — confirm against the Blurr docs.
# `IndexSplitter(valid_idxs)` validates on the rows that came from the validation split.
learn = BlearnerForSummarization.from_dictionaries(raw_ds, 'facebook/bart-large-cnn', 
                                                   text_attr='article', summary_attr='highlights', 
                                                   max_length=256, max_target_length=130,
                                                   dblock_splitter=IndexSplitter(valid_idxs),
                                                   dl_kwargs={'bs':2}).to_fp16()

In [None]:
# Preview two article/summary pairs; inputs and targets get separate display truncation limits.
learn.dls.show_batch(dataloaders=learn.dls, max_n=2, input_trunc_at=500, target_trunc_at=250)

Unnamed: 0,text,target
0,"<s> (CNN) -- When Ji Yeqing awakened, she was already in the recovery room. Chinese authorities had dragged her out of her home and down four flights of stairs, she said, restraining and beating her husband as he tried to come to her aid. They whisked her into a clinic, held her down on a bed and forced her to undergo an abortion. Her offense? Becoming pregnant with a second child, in violation of China's one-child policy. ""After the abortion, I felt empty, as if something was scooped out of me,","China's one-child policy results in forced abortions and sterilizations, activists say.\nWomen tell of emotional and physical consequences from the procedures.\nActivist Chen Guangcheng works to advocate for victims of such practices."
1,"<s> (CNN) -- Gabriel García Márquez, the influential, Nobel Prize-winning author of ""One Hundred Years of Solitude"" and ""Love in the Time of Cholera,"" has died, his family and officials said. He was 87. The literary giant was treated in April for infections and dehydration at a Mexican hospital. García Márquez, a native of Colombia, is widely credited with helping to popularize ""magical realism,"" a genre ""in which the fantastic and the realistic are combined in a richly composed world of imagina","NEW: Colombia's President declares three days of national mourning.\nThe 87-year-old is widely credited with helping to popularize ""magical realism""\nGarcía Márquez stands as one of the most honored authors on Earth.\nThe Colombian author died in Mexic"


In [None]:
# The metrics callback reports ROUGE and BERTScore columns alongside the losses.
metrics_cb = BlearnerForSummarization.get_metrics_cb()
learn.fit_one_cycle(1, lr_max=4e-5, cbs=[metrics_cb])

epoch,train_loss,valid_loss,rouge1,rouge2,rougeL,bertscore_precision,bertscore_recall,bertscore_f1,time
0,1.694605,1.82829,0.345469,0.143262,0.244784,0.871203,0.892175,0.881458,11:44


In [None]:
# Show up to two articles with their reference summaries and the generated ones.
learn.show_results(learner=learn, max_n=2, input_trunc_at=500, target_trunc_at=250)

Unnamed: 0,text,target,prediction
0,"(CNN)Reading the headlines out of Madison, Wisconsin, it's hard not to think about Ferguson, Missouri. But law enforcement's response to the shooting of 19-year-old Tony Robinson will not unfold in the same chaotic, violent and distrusting way as the shooting of 18-year-old Michael Brown, Madison's top police leaders vowed. ""I think it's very clear that Madison, Wisconsin, is not Ferguson, Missouri,"" said Jim Palmer, the executive director of the Wisconsin Professional Police Association. The h",Police officials in Madison say their responses to shooting by officer reflect their role in community.\nOne example: Madison chief talked to teen's family soon after shooting.\nA month went by before Ferguson chief apologized to Brown's family.,"Madison, Wisconsin, police chief has been outspoken about the shooting of 19-year-old Tony Robinson .\nChief Mike Koval has been out front and outspoken about Robinson's shooting since it happened late Friday night .\n""We have to say we are sorry at t"
1,"Manila (CNN)Forty years ago, one of the greatest boxing matches in history took place in an unlikely setting: the capital of the Philippines. Muhammad Ali's epic win over great rival Joe Frazier in 1975 became known as the ""Thrilla in Manila."" Four decades later, and a crowd is gathered at the same venue in this sweating, sprawling city -- known as the Araneta Coliseum -- for a very different contest: a basketball game between two of the country's pro teams. And yet, the Coliseum cannot seem to","CNN meets the Philippines fighter on court in Manila, where he coaches a basketball team.\nThe 36-year-old ring legend has also dabbled in politics in his homeland.\nPacquaio says his two oldest children are desperate for him to fight Mayweather.","Muhammad Ali's epic win over Joe Frazier in 1975 became known as the ""Thrilla in Manila"" Four decades later, a crowd is gathered at the same venue in this sweating, sprawling city .\nManny ""the Pacman"" Pacquiao is the coach, as well as one of the sho"


`Learner.blurr_generate` works here too

In [None]:
# Sample news article used below to demo summary generation.
test_article = """
About 10 men armed with pistols and small machine guns raided a casino in Switzerland and made off 
into France with several hundred thousand Swiss francs in the early hours of Sunday morning, police said. 
The men, dressed in black clothes and black ski masks, split into two groups during the raid on the Grand Casino 
Basel, Chief Inspector Peter Gill told CNN. One group tried to break into the casino's vault on the lower level 
but could not get in, but they did rob the cashier of the money that was not secured, he said. The second group 
of armed robbers entered the upper level where the roulette and blackjack tables are located and robbed the 
cashier there, he said. As the thieves were leaving the casino, a woman driving by and unaware of what was 
occurring unknowingly blocked the armed robbers' vehicles. A gunman pulled the woman from her vehicle, beat 
her, and took off for the French border. The other gunmen followed into France, which is only about 100 
meters (yards) from the casino, Gill said. There were about 600 people in the casino at the time of the robbery. 
There were no serious injuries, although one guest on the Casino floor was kicked in the head by one of the 
robbers when he moved, the police officer said. Swiss authorities are working closely with French authorities, 
Gill said. The robbers spoke French and drove vehicles with French license plates. CNN's Andreena Narayan 
contributed to this report.
"""

In [None]:
# Ask for three candidate summaries of the same article and print them, numbered from 1.
outputs = learn.blurr_generate(test_article, num_return_sequences=3)

for n, summary in enumerate(outputs, start=1):
    print(f'=== Prediction {n} ===\n{summary}\n')

=== Prediction 1 ===
 About 10 armed with pistols and small machine guns raided a casino in Switzerland .
The robbers made off with several hundred thousand Swiss francs, police said .
A woman driving by unknowingly blocked the robbers' vehicles .
There were no serious injuries, although one guest was kicked in the head by one of the robbers .

=== Prediction 2 ===
 About 10 armed with pistols and small machine guns raided a casino in Switzerland .
The robbers made off with several hundred thousand Swiss francs, police say .
A woman driving by unknowingly blocked the robbers' vehicles .
There were no serious injuries, although one guest was kicked in the head by one of the robbers .

=== Prediction 3 ===
 About 10 armed with pistols and small machine guns raided a casino in Switzerland .
The robbers made off with several hundred thousand Swiss francs, police said .
A woman driving by unknowingly blocked the robbers' vehicles .
There were no serious injuries, although one guest was kick

## Translation

In [None]:
#hide
# Drop the previous section's learner (if there is one) so its GPU memory can be reclaimed.
try: del learn
except NameError: pass  # fresh kernel: nothing to delete
# Always release cached CUDA blocks, even when there was no learner to delete.
torch.cuda.empty_cache()

In [None]:
# Load the German-English WMT16 pairs and print the splits, first training row, and column schema.
raw_datasets = load_dataset('wmt16', 'de-en')
print(f'{raw_datasets}\n')
print(f'{raw_datasets["train"][0]}\n')
print(f'{raw_datasets["train"].features}\n')

Reusing dataset wmt16 (/home/wgilliam/.cache/huggingface/datasets/wmt16/de-en/1.0.0/0d9fb3e814712c785176ad8cdb9f465fbe6479000ee6546725db30ad8a8b5f8a)


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4548885
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2169
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2999
    })
})

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}

{'translation': Translation(languages=['de', 'en'], id=None)}



In [None]:
# Keep only the first 1,000 rows of each split (presumably to keep this example quick to run).
train_ds = raw_datasets['train'].select(range(1000))
valid_ds = raw_datasets['validation'].select(range(1000))

In [None]:
# Stack both subsets into a single dataset: training rows first, validation rows right after.
raw_ds = concatenate_datasets([train_ds, valid_ds])

# Row positions of each subset inside `raw_ds`.
n_train = train_ds.num_rows
n_valid = valid_ds.num_rows
train_idxs = L(range(n_train))
valid_idxs = L(range(n_train, n_train + n_valid))

In [None]:
def make_dict(item):
    """Return the nested `translation` mapping of a row (for WMT16: the `de` and `en` texts)."""
    translation = item['translation']
    return translation

# `Dataset.map` merges the returned dict into each row, so every row gains top-level
# `de` and `en` columns (the attributes the learner is pointed at below).
raw_ds = raw_ds.map(make_dict)

Loading cached processed dataset at /home/wgilliam/.cache/huggingface/datasets/wmt16/de-en/1.0.0/0d9fb3e814712c785176ad8cdb9f465fbe6479000ee6546725db30ad8a8b5f8a/cache-2d203da04becbf79.arrow


In [None]:
# Build the learner straight from the Hugging Face dataset with `from_dictionaries`, as the
# other sections do (`raw_ds` is not a DataFrame).
# `IndexSplitter(valid_idxs)` keeps the rows taken from the WMT16 validation split as the
# validation set; a random split would mix them into training and leave `valid_idxs` unused.
# Re-run the cells below to refresh their recorded outputs after this change.
learn = BlearnerForTranslation.from_dictionaries(raw_ds, 'Helsinki-NLP/opus-mt-de-en',
                                                 src_lang_name='German', src_lang_attr='de',
                                                 trg_lang_name='English', trg_lang_attr='en',
                                                 dblock_splitter=IndexSplitter(valid_idxs),
                                                 dl_kwargs={'bs':2}).to_fp16()

In [None]:
# Preview two German/English pairs; inputs and targets get separate display truncation limits.
learn.dls.show_batch(dataloaders=learn.dls, max_n=2, input_trunc_at=500, target_trunc_at=250)

Unnamed: 0,text,target
0,"▁Angesichts▁dieser Situation▁muß▁aus dem▁Bericht, den das▁Parlament annimmt,▁klar▁hervorgehen,▁daß▁Maßnahmen▁notwendig▁sind, die▁eindeutig auf die▁Bekämpfung der relativen▁Armut und der Arbeitslosigkeit▁gerichtet▁sind.▁Maßnahmen▁wie die für diese▁Zwecke▁angemessene▁Verwendung der▁Strukturfonds, die▁häufig▁unsachgemäß▁eingesetzt▁werden, und▁zwar mit▁zentralen▁staatlichen▁Politiken, die▁Modernisierung der▁Bereiche Telekommunikation und▁Kommunikation,▁indem man vor▁allem die am▁wenigsten▁entwickelt","Given this situation, the report approved by Parliament must highlight the need for measures that aim unequivocally to fight relative poverty and unemployment: measures such as the appropriate use of structural funds for these purposes, which are oft"
1,"Sie▁wird▁aber auf▁Seite 5▁dieser▁Leitlinien▁ganz▁eindeutig▁genannt, und▁ich▁möchte▁darauf▁verweisen -▁weil▁sie▁mich▁dazu▁aufgefordert▁haben -,▁daß diese▁Partnerschaft für▁mich - und▁ich▁habe▁lange▁genug eine Region▁betreut, um dies▁beurteilen zu▁können - ein▁sehr▁wirkungsvolles Instrument zur▁Mobilisierung der▁geistigen▁Ressourcen auf▁lokaler▁Ebene▁ist -▁sowohl derer im▁öffentlichen▁Sektor - die Stadt- und▁Gemeinderäte, den▁schulischen und▁gesellschaftlichen▁Bereich, die▁Vereine und▁Verbände -▁a","However, I do wish to mention - since you have asked me to do so - that, as far as I am concerned, this partnership - and I spent long enough as a regional administrator within my own country to be able to say this most sincerely - is a tool, one use"


In [None]:
# The metrics callback reports BLEU, METEOR and SacreBLEU columns alongside the losses.
metrics_cb = BlearnerForTranslation.get_metrics_cb()
learn.fit_one_cycle(1, lr_max=4e-5, cbs=[metrics_cb])

[nltk_data] Downloading package wordnet to /home/wgilliam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


epoch,train_loss,valid_loss,bleu,meteor,sacrebleu,time
0,1.278377,1.21033,0.347698,0.581037,33.268828,02:02


In [None]:
# Show up to two source sentences with their reference and generated translations.
learn.show_results(learner=learn, max_n=2, input_trunc_at=500, target_trunc_at=250)

Unnamed: 0,text,target,prediction
0,"▁Deshalb▁besteht der▁Vorschlag der▁Fraktion der▁Sozialdemokratischen▁Partei▁Europas, den Sie▁erwähnt▁haben,▁darin, den▁Mittwoch▁als▁Termin der▁Vorstellung des▁Programms der▁Kommission Prodi für die▁Wahlperiode▁beizubehalten, und in▁dieses▁Programm▁auch das▁Verwaltungsreformprojekt▁einzubeziehen, da wir▁andernfalls in eine paradoxe Situation▁geraten▁könnten: Mit der Ausrede, der▁Wortlaut liege nicht vor,▁wird▁einerseits dem▁Präsidenten der▁Kommission das▁Recht▁abgesprochen, in▁diesem▁Parlament zu","Therefore, the proposal of the Group of the Party of European Socialists, and which you have mentioned, is that the Prodi Commission present its legislative programme on Wednesday, including its proposed administrative reform, because, otherwise, we","That is why the proposal of the Group of the Party of European Socialists, which you have mentioned, is to maintain Wednesday as the date for the presentation of the Prodi Commission programme for the parliamentary term, and to include in this progra"
1,"▁Ich▁möchte▁daher die▁Kommission auf▁zwei▁Punkte▁hinweisen:▁Erstens▁muß die▁Konzertierung▁als Instrument der▁Koordinierung und der▁Beteiligung▁sämtlicher▁lokaler und▁regionaler▁Marktteilnehmer an den▁Entscheidungen optimal▁genutzt▁werden, um▁speziell▁Ungleichgewichte und▁Ungleichheiten zu▁vermeiden;▁zweitens▁bedarf es▁einer▁Vereinfachung und▁transparenteren▁Gestaltung der▁Verwaltungsprozesse, die sich▁allzu▁häufig▁unnötig in die▁Länge▁ziehen und derart▁kompliziert▁sind,▁daß▁sie, was vor▁allem vo","Firstly, we need to make the best possible use of consultation as a means of ensuring proper coordination and participation by all local and regional operators in decision-making, precisely so that imbalances and inequalities can be avoided. Secondly","I would therefore like to draw the Commission' s attention to two points: firstly, consultation as an instrument of coordination and the participation of all local and regional operators in decisions must be used to the best extent possible in order"


`Learner.blurr_generate` works here too

In [None]:
# German input sentence ("I like to drink beer") for the generation demo below.
test_de = "Ich trinke gerne Bier"

In [None]:
# Translate the sentence with the fine-tuned model; the result is a list of generated strings.
learn.blurr_generate(test_de)

['I like to drink beer']

## Summary

In summary, whether you want to work with Blurr's low-, mid-, or high-level API, we've got you covered :)

In [None]:
#hide
# Export the project's notebooks with nbdev; the output lists each notebook that was converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
