In [104]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

## Creating examples from existing datasets via perturbations 

In [105]:
editor = Editor()

Let's start by creating a fictitious dataset:

In [213]:
data = ['John is a very smart person, he lives in Ireland.',
        'Mark Stewart was born and raised in Chicago',
        'Luke Smith has 3 sisters.',
        'Mary is not a nurse.',
        'Julianne is an engineer.',
        'My brother Andrew used to be a lawyer.']

### Writing your own perturbations

Let's say we want to write a perturbation function to replace some professions with other professions:

In [214]:
import re
def change_professions(x, *args, **kwargs):
    """Swap every known profession found in `x` for each of the other ones.

    Args:
        x: The input sentence (a plain string).
        *args, **kwargs: Accepted and ignored.

    Returns:
        A list of perturbed strings: for each known profession that occurs
        in `x` as a whole word, one copy of `x` per alternative profession.
        The list is empty when no known profession occurs in `x`.

    Note:
        Only the profession word itself is replaced; the preceding article
        is left untouched, so results such as "a engineer" are possible.
    """
    known = ['doctor', 'nurse', 'engineer', 'lawyer']
    variants = []
    for current in known:
        # \b anchors keep us from matching inside longer words (e.g. "nurses").
        pattern = r'\b%s\b' % current
        if not re.search(pattern, x):
            continue
        for other in known:
            if other != current:
                variants.append(re.sub(pattern, other, x))
    return variants
            

In [215]:
change_professions(data[2])

[]

We could use this function on every example in `data`, and keep only cases where it applies.
There is an auxiliary function that does this (and more) for us, called `Perturb.perturb`:

In [216]:
ret = Perturb.perturb(data, change_professions, keep_original=True)
ret.data

[['Mary is not a nurse.',
  'Mary is not a doctor.',
  'Mary is not a engineer.',
  'Mary is not a lawyer.'],
 ['Julianne is an engineer.',
  'Julianne is an doctor.',
  'Julianne is an nurse.',
  'Julianne is an lawyer.'],
 ['My brother Andrew used to be a lawyer.',
  'My brother Andrew used to be a doctor.',
  'My brother Andrew used to be a nurse.',
  'My brother Andrew used to be a engineer.']]

Notice how `Perturb.perturb` automatically ignored examples in our dataset where the perturbation didn't return anything, e.g. 'John is a very smart person'.  
We set `keep_original=True`, and therefore the original data point is kept as the first in every example list. This is typically what we want to do in perturbation tests. This is what we would get if we had set it to `False`:

In [217]:
ret = Perturb.perturb(data, change_professions, keep_original=False)
ret.data

[['Mary is not a doctor.', 'Mary is not a engineer.', 'Mary is not a lawyer.'],
 ['Julianne is an doctor.', 'Julianne is an nurse.', 'Julianne is an lawyer.'],
 ['My brother Andrew used to be a doctor.',
  'My brother Andrew used to be a nurse.',
  'My brother Andrew used to be a engineer.']]

We can also specify a number of samples if our dataset is too large:

In [218]:
ret = Perturb.perturb(data, change_professions, keep_original=False, nsamples=1)
ret.data

[['My brother Andrew used to be a doctor.',
  'My brother Andrew used to be a nurse.',
  'My brother Andrew used to be a engineer.']]

Finally, we may want our perturbation function to return some metadata. In our case, maybe we want to remember which profession was swapped into which profession. To do so, let's rewrite `change_professions` so that it returns an additional list with metadata:

In [219]:
def change_professions(x, meta=False, *args, **kwargs):
    """Swap every known profession found in `x` for each of the other ones.

    Args:
        x: The input sentence (a plain string).
        meta: When true, also return which swap produced each perturbation.
        *args, **kwargs: Accepted and ignored.

    Returns:
        If `meta` is false, a list of perturbed strings (empty when no known
        profession occurs in `x` as a whole word).
        If `meta` is true, a tuple `(perturbed, swaps)` where `swaps` is a
        parallel list of `(original_profession, new_profession)` pairs.

    Note:
        Only the profession word itself is replaced; the preceding article
        is left untouched, so results such as "a engineer" are possible.
    """
    known = ['doctor', 'nurse', 'engineer', 'lawyer']
    variants = []
    swaps = []
    for current in known:
        # \b anchors keep us from matching inside longer words (e.g. "nurses").
        pattern = r'\b%s\b' % current
        if re.search(pattern, x) is None:
            continue
        for other in known:
            if other == current:
                continue
            variants.append(re.sub(pattern, other, x))
            # Keep the metadata aligned index-for-index with `variants`.
            swaps.append((current, other))
    return (variants, swaps) if meta else variants
            

In [220]:
change_professions(data[2], meta=True)

([], [])

We can now call `Perturb.perturb` with `meta=True`. Whatever keyword arguments we use in `Perturb.perturb` get passed along to the perturbation function. In addition, if we set `meta=True`, `ret.meta` will have the metadata.

In [221]:
ret = Perturb.perturb(data, change_professions, keep_original=True, nsamples=1, meta=True)
print('Data')
print(ret.data)
print('Metadata')
print(ret.meta)

Data
[['Mary is not a nurse.', 'Mary is not a doctor.', 'Mary is not a engineer.', 'Mary is not a lawyer.']]
Metadata
[[None, ('nurse', 'doctor'), ('nurse', 'engineer'), ('nurse', 'lawyer')]]


### General-purpose perturbations

We provide some general-purpose perturbation functions. Some assume you have preprocessed the data with spacy, so let's do that:

In [222]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [223]:
pdata = list(nlp.pipe(data))

#### Punctuation

`Perturb.strip_punctuation` removes punctuation from the end of the text (notice that the comma in the middle of the sentence is kept):

In [224]:
pdata[0], Perturb.strip_punctuation(pdata[0])

(John is a very smart person, he lives in Ireland.,
 'John is a very smart person, he lives in Ireland')

`Perturb.punctuation` adds or removes final punctuation (notice that we add it when it's not there and remove it when it is):

In [225]:
ret = Perturb.perturb(pdata, Perturb.punctuation)
ret.data[:4]

[['John is a very smart person, he lives in Ireland.',
  'John is a very smart person, he lives in Ireland'],
 ['Mark Stewart was born and raised in Chicago',
  'Mark Stewart was born and raised in Chicago.'],
 ['Luke Smith has 3 sisters.', 'Luke Smith has 3 sisters'],
 ['Mary is not a nurse.', 'Mary is not a nurse']]

#### Typos

In [226]:
data[0], Perturb.add_typos(data[0])

('John is a very smart person, he lives in Ireland.',
 'John is a very smartp erson, he lives in Ireland.')

In [227]:
ret = Perturb.perturb(data, Perturb.add_typos, nsamples=1)
ret.data

[['Julianne is an engineer.', 'Julianne si an engineer.']]

#### Contractions

`Perturb.expand_contractions` and `Perturb.contract` act on a single string:

In [228]:
data[2], Perturb.contract(data[2])

('Luke Smith has 3 sisters.', 'Luke Smith has 3 sisters.')

In [229]:
Perturb.expand_contractions('What\'s going on?')

'What is going on?'

`Perturb.contractions` contracts AND expands contractions if possible:

In [230]:
Perturb.contractions('What\'s going on? I am not happy')

['What is going on? I am not happy', "What's going on? I'm not happy"]

In [231]:
ret = Perturb.perturb(data, Perturb.contractions)
ret.data

[['Mary is not a nurse.', "Mary isn't a nurse."]]

#### Changing named entities

The following functions all assume you have parsed the input with spacy.  

`Perturb.change_names` allows you to replace person names automatically.  
You can specify if you only want first names, first and last names, etc.

In [250]:
ret = Perturb.perturb(pdata[2:3], Perturb.change_names, nsamples=1)
ret.data

[['Luke Smith has 3 sisters.',
  'Michael Morgan has 3 sisters.',
  'Christopher Anderson has 3 sisters.',
  'Matthew Jones has 3 sisters.',
  'David Bennett has 3 sisters.',
  'James Cruz has 3 sisters.',
  'John James has 3 sisters.',
  'Joshua Anderson has 3 sisters.',
  'Daniel Cooper has 3 sisters.',
  'Joseph Johnson has 3 sisters.',
  'William Scott has 3 sisters.']]

You can also specify how many replacements to generate with `n` (default is 10):

In [233]:
ret = Perturb.perturb(pdata, Perturb.change_names, nsamples=1, n=3)
ret.data

[['Julianne is an engineer.',
  'Sydney is an engineer.',
  'Hannah is an engineer.',
  'Isabella is an engineer.']]

In [234]:
ret = Perturb.perturb(pdata, Perturb.change_names, nsamples=1, first_only=True, n=3)
ret.data

[['John is a very smart person, he lives in Ireland.',
  'Jeffrey is a very smart person, he lives in Ireland.',
  'Jeffrey is a very smart person, he lives in Ireland.',
  'William is a very smart person, he lives in Ireland.']]

In [235]:
ret = Perturb.perturb(pdata, Perturb.change_names, nsamples=1, last_only=True, n=3)
ret.data

[['Luke Smith has 3 sisters.',
  'Luke Edwards has 3 sisters.',
  'Luke Bennett has 3 sisters.',
  'Luke Murphy has 3 sisters.']]

You can also set `meta=True` if you want to save the change in the metadata:

In [236]:
ret = Perturb.perturb(pdata, Perturb.change_names, nsamples=1, n=3, last_only=True, meta=True)
ret.data[0][1:], ret.meta[0][1:]

(['Luke Richardson has 3 sisters.',
  'Luke Parker has 3 sisters.',
  'Luke Smith has 3 sisters.'],
 [('Smith', 'Richardson'), ('Smith', 'Parker'), ('Smith', 'Smith')])

Similarly, you can change locations with `Perturb.change_location`:

In [237]:
ret = Perturb.perturb(pdata, Perturb.change_location, nsamples=1, n=3, meta=True)
ret.data[0], ret.meta[0]

(['Mark Stewart was born and raised in Chicago',
  'Mark Stewart was born and raised in Austin',
  'Mark Stewart was born and raised in Miami',
  'Mark Stewart was born and raised in Glendale'],
 [None, ('Chicago', 'Austin'), ('Chicago', 'Miami'), ('Chicago', 'Glendale')])

And numbers, with `Perturb.change_number`:

In [238]:
ret = Perturb.perturb(pdata, Perturb.change_number, nsamples=1, n=3, meta=True)
ret.data[0], ret.meta[0]

(['Luke Smith has 3 sisters.',
  'Luke Smith has 4 sisters.',
  'Luke Smith has 2 sisters.',
  'Luke Smith has 2 sisters.'],
 [None, ('3', '4'), ('3', '2'), ('3', '2')])