This is a notebook which will help you get started with the dataset '300 Years of British Patents'!  :)

## Load the dataset

The dataset consists of two gzip-compressed JSONL files.

The easiest way to load the dataset is by using the 'load_dataset' function from the HuggingFace 'datasets' library.

In [None]:
%%capture
!pip install datasets
!pip install transformers

In [None]:
from datasets import load_dataset
import pandas as pd
from transformers import pipeline, AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoTokenizer

#### Download data from HuggingFace

In [None]:
# Download the patent texts (texts.jsonl.gz, about 1.69 GB) as a HF DatasetDict.
# No split is named here, and the rows end up under the "train" key.
dataset_all_years = load_dataset(
    "matthewleechen/300YearsOfBritishPatents",
    data_files="texts.jsonl.gz"
)

README.md:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

texts.jsonl.gz:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Download the front-page entities (entities.jsonl.gz, about 32 MB) as a HF DatasetDict.
# As with the texts, the rows end up under the "train" key.
dataset_all_entities = load_dataset(
    "matthewleechen/300YearsOfBritishPatents",
    data_files="entities.jsonl.gz"
)

entities.jsonl.gz:   0%|          | 0.00/32.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
### you can subset the dataset to the years you want in a couple of ways

# input_columns=["year"] hands only the `year` value to the predicate, so the
# large text columns are not turned into Python objects for every row scanned

# subset to a list of specific years
years_of_interest = [1784, 1826]

# get text data for only those years
dataset_subset_specified = dataset_all_years["train"].filter(
    lambda year: year in years_of_interest,
    input_columns=["year"],
)

# or get text data for an inclusive range of years
dataset_subset_range = dataset_all_years["train"].filter(
    lambda year: 1789 <= year <= 1792,
    input_columns=["year"],
)

Filter:   0%|          | 0/322874 [00:00<?, ? examples/s]

Exception ignored in: <function _xla_gc_callback at 0x7d8dbb224790>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


Filter:   0%|          | 0/322874 [00:00<?, ? examples/s]

#### Inspect the dataset structure

In [None]:
# inspect the DatasetDict of patent texts: its splits, column names and row count
print(dataset_all_years)

DatasetDict({
    train: Dataset({
        features: ['patent_id', 'year', 'patent_title', 'full_text', 'word_tokens', 'predicted_BPO_classes'],
        num_rows: 322874
    })
})


In [None]:
# inspect the DatasetDict of front-page entities: its splits, column names and row count
print(dataset_all_entities)

DatasetDict({
    train: Dataset({
        features: ['patent_id', 'year', 'front_page_entities'],
        num_rows: 322874
    })
})


In [None]:
# columns of the text dataset and their data types
dataset_all_years['train'].features

{'patent_id': Value(dtype='string', id=None),
 'year': Value(dtype='int64', id=None),
 'patent_title': Value(dtype='string', id=None),
 'full_text': [{'page_num': Value(dtype='int64', id=None),
   'page_text': Value(dtype='string', id=None)}],
 'word_tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'predicted_BPO_classes': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [None]:
# each row of the text data is a dictionary; here we display the row at index 123456
dataset_all_years['train'][123456]

{'patent_id': 'GB188100826A',
 'year': 1881,
 'patent_title': 'IMPROVE- MENTS IN TOBACCO-POUCHES',
 'full_text': [{'page_num': 1,
   'page_text': 'LETTERS PATENT to James Burbridge, Manager at Messrs. W. Warne and Co.\'s\nIndia Rubber Works, Tottenham, Middlesex, for an Invention of "IMPROVE-\nMENTS IN TOBACCO-POUCHES."\nPROVISIONAL SPECIFICATION left by the said James Burbridge at the\nOffice of the Commissioners of Patents on the 26th February 1881.\nJAMES BURBRIDGE, Manager at Messrs. W. Warne & Co.\'s India Rubber Works,\nTottenham, Middlesex. "IMPROVEMENTS IN TOBACCO POUCHES."\nThis Invention relates to a novel construction of pouch to be used for the\nreception of tobacco, fusees, and other materials used by smokers, the object being\nto obtain a compact and secure receptacle for such articles.\nTo this end we cut from a sheet of plastic rubber compound 2 pieces of any\ndesired form, say semi elliptical, and of corresponding size, and place them together\nback to back so as to fo

In [None]:
# here we display the dictionary of entities stored at index 123456
# NOTE(review): in the saved outputs this index holds a different patent_id than the same
# index in the text dataset, so rows do not appear to be aligned by position across the
# two files -- match records on `patent_id`, not on row index.
dataset_all_entities['train'][123456]

{'patent_id': 'GB186100760A',
 'year': 1861,
 'front_page_entities': [{'class': 'DATE',
   'entity_text': '26th March 1861.',
   'start': 119,
   'end': 135,
   'person_id': None,
   'inventor_id': None,
   'latitude': None,
   'longitude': None},
  {'class': 'PER',
   'entity_text': 'Hannah Emes',
   'start': 34,
   'end': 45,
   'person_id': [1],
   'inventor_id': 82258,
   'latitude': None,
   'longitude': None},
  {'class': 'ADD',
   'entity_text': "St. John's Villas, Adelaide Road, Hampstead, in the County of Middlesex",
   'start': 155,
   'end': 226,
   'person_id': [1],
   'inventor_id': None,
   'latitude': 51.5545,
   'longitude': -0.174}]}

#### HF datasets --> pandas dataframes

In [None]:
# convert the "train" split of each HF dataset to a pandas DataFrame
df_years = dataset_all_years["train"].to_pandas()
df_entities = dataset_all_entities["train"].to_pandas()

In [None]:
# merge texts and entities on patent_id
# both tables carry a `year` column; drop it from the entities side so the merged frame
# keeps a single `year` instead of pandas' suffixed `year_x` / `year_y` pair
df_merged = pd.merge(
    df_years,
    df_entities.drop(columns="year"),
    on="patent_id",
    how="inner",
)

#### Build a CSV of unique patent–inventor pairs

In [None]:
# get entities dataframe
def process_huggingface_dataset(hf_dataset):
    """Build one record per (patent, person) from the front-page entities.

    Args:
        hf_dataset: mapping whose "train" entry iterates over rows. Each row provides
            "patent_id", "year" and "front_page_entities", a list of dicts with
            "class", "entity_text", "person_id" and (for people) "inventor_id".

    Returns:
        pandas.DataFrame with the columns patent_id, inventor_id, year, name,
        occupation, address and firm. Within a patent, only the first 'PER' mention
        of each person id is kept, and only the first 'OCC' / 'ADD' / 'FIRM' entity
        linked to that person fills the matching column. An input without any
        people yields an empty frame that still has these columns.
    """
    columns = ["patent_id", "inventor_id", "year", "name", "occupation", "address", "firm"]

    # entity class -> column it fills on the linked person's record
    column_for_class = {"OCC": "occupation", "ADD": "address", "FIRM": "firm"}

    all_records = []

    for row in hf_dataset["train"]:

        patent_id = row["patent_id"]
        year = row["year"]

        entities = row["front_page_entities"]
        if entities is None:
            entities = []

        person_data = {}

        # first pass: one record per person id, taken from the 'PER' entities
        for entity in entities:

            if entity["class"] != "PER":
                continue

            # `person_id` can be None (entity not linked to anyone); skip it
            # instead of failing when trying to iterate over None
            person_ids = entity["person_id"]
            if person_ids is None:
                continue

            for pid in person_ids:
                if pid not in person_data:
                    person_data[pid] = {
                        "patent_id": patent_id,
                        "inventor_id": entity.get("inventor_id"),
                        "year": year,
                        "name": entity["entity_text"],
                        "occupation": None,
                        "address": None,
                        "firm": None,
                    }

        # second pass: attach 'OCC', 'ADD', 'FIRM' to people through 'person_id'
        # (kept separate because an attribute may precede its person in the list)
        for entity in entities:

            column = column_for_class.get(entity["class"])
            if column is None:
                continue

            person_ids = entity["person_id"]
            if person_ids is None:
                continue

            for pid in person_ids:
                record = person_data.get(pid)
                # first value wins; later entities of the same class are ignored
                if record is not None and record[column] is None:
                    record[column] = entity["entity_text"]

        all_records.extend(person_data.values())

    return pd.DataFrame(all_records, columns=columns)


# build the patent-inventor table from the entities DatasetDict, then display it
entities_df = process_huggingface_dataset(dataset_all_entities)
entities_df

Unnamed: 0,patent_id,inventor_id,year,name,occupation,address,firm
0,GB167800201A,24518,1678,CHARLES HOWARD,Esquire,,
1,GB167800203A,106090,1678,JOHN ROBERTS,Esquire,,
2,GB167800206A,59762,1678,GEORGE,,,
3,GB167800206A,136363,1678,Lord Viscount GRANDISON,,,
4,GB167800205A,153725,1678,ROBERT LEDGINGHAM,Merchant,Citty of London,
...,...,...,...,...,...,...,...
395936,GB179702163A,100315,1797,JOHN FALCONER ATLEE,Distiller,"Wandsworth, in the County of Surrey",
395937,GB179702176A,110304,1797,JOSEPH BARTON,Chymist,"Parish of Saint Botolpli, Bishopsgate, in tlie...",
395938,GB179702205A,94617,1797,JAMES WELDON,Engineer,"Litchfield, in the County of Stafford",
395939,GB179702188A,84166,1797,Henry Johnson,Gentleman,London,


In [None]:
# save as csv; index=False keeps the positional row index out of the file, so it does
# not come back as an extra "Unnamed: 0" column when the CSV is read again
entities_df.to_csv("patent_inventors.csv", index=False)

In [None]:
# count the distinct inventor ids (missing ids are not counted)
unique_inventors_count = len(entities_df["inventor_id"].dropna().unique())
print("Number of unique inventors: {}".format(unique_inventors_count))

Number of unique inventors: 205082


## Try our named entity recognition models

Our NER models are available here:
- For entities: <https://huggingface.co/matthewleechen/patent_entities_ner>
- For titles: <https://huggingface.co/matthewleechen/patent_titles_ner>

In [None]:
# Hugging Face Hub repositories of the two token-classification (NER) models
ent_model_repo = "matthewleechen/patent_entities_ner" # for extracting named entities from the front page
title_model_repo = "matthewleechen/patent_titles_ner" # for extracting titles from the front page

# load the entity tokenizer + model from the entity repo
ent_tokenizer = AutoTokenizer.from_pretrained(ent_model_repo)
ent_model = AutoModelForTokenClassification.from_pretrained(ent_model_repo)

# load the title tokenizer + model from the title repo
title_tokenizer = AutoTokenizer.from_pretrained(title_model_repo)
title_model = AutoModelForTokenClassification.from_pretrained(title_model_repo)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# a custom recognizer that rebuilds whole entities from the token-level predictions of the HF pipeline

def recognizer(text, model, tokenizer):
    """Extract whole entities from `text` with a token-classification model.

    Runs the HF "ner" pipeline, which yields one prediction per sub-word token, and
    stitches the B-/I- tagged tokens back together into entity spans.

    Args:
        text: the string to tag.
        model: token-classification model handed to `pipeline`.
        tokenizer: tokenizer handed to `pipeline` together with the model.

    Returns:
        A list of dicts with the keys 'class', 'entity_text', 'start' and 'end'.
        'start' / 'end' are the offsets the pipeline reported for the first and the
        last token of the entity.
    """

    # HF ner pipeline (token-level predictions)
    token_level_results = pipeline("ner", model=model, tokenizer=tokenizer)(text)

    # keep entities tracked
    entities = []
    current_entity = None

    for item in token_level_results:

        tag = item['entity']
        label = tag[2:]

        # replace '▁' with space for easier reading (▁ is created by the XLM-RoBERTa tokenizer)
        word = item['word'].replace('▁', ' ')

        continues_current = False
        if tag.startswith('I-') and current_entity is not None and label == current_entity['type']:
            # By default the pipeline leaves out tokens labelled 'O', so an I- token can
            # directly follow an entity that ended many characters earlier. Only treat it
            # as a continuation when nothing but whitespace lies between the two tokens.
            # Without offsets (None) this cannot be checked, so the token is appended.
            prev_end, start = current_entity['end'], item['start']
            continues_current = (
                prev_end is None
                or start is None
                or not text[prev_end:start].strip()
            )

        if continues_current:
            current_entity['text'] += word
            current_entity['end'] = item['end']

        elif tag.startswith('B-') or tag.startswith('I-'):
            # a B- tag, or an I- tag that cannot extend the open entity, starts a new one
            if current_entity is not None:
                entities.append(current_entity)
            current_entity = {'type': label, 'text': word.strip(), 'start': item['start'], 'end': item['end']}

        else:
            # any other tag (e.g. 'O') closes the open entity
            if current_entity is not None:
                entities.append(current_entity)
            current_entity = None

    if current_entity is not None:
        entities.append(current_entity)

    # merge same-type entities that touch each other (end offset == next start offset)
    merged_entities = []

    for entity in entities:

        previous = merged_entities[-1] if merged_entities else None

        if previous is not None and previous['type'] == entity['type'] and previous['end'] == entity['start']:
            previous['text'] += entity['text']
            previous['end'] = entity['end']

        else:
            merged_entities.append(entity)

    # collapse runs of whitespace (spaces, line breaks) inside the entity text
    for entity in merged_entities:
        entity['text'] = ' '.join(entity['text'].split())

    # convert to list of dicts
    return [{'class': entity['type'],
             'entity_text': entity['text'],
             'start': entity['start'],
             'end': entity['end']} for entity in merged_entities]


In [None]:
# let's try out an example
# NOTE: the literal starts with a line break right after the opening quotes, so that
# newline is the first character of `example`; character offsets into `example`
# count it as position 0.
example = """
LETTERS PATENT to James Burbridge, Manager at Messrs. W. Warne and Co.\'s\nIndia Rubber Works, Tottenham, Middlesex, for an Invention of "IMPROVE-\nMENTS IN TOBACCO-POUCHES."\nPROVISIONAL SPECIFICATION left by the said James Burbridge at the\nOffice of the Commissioners of Patents on the 26th February 1881.\nJAMES BURBRIDGE, Manager at Messrs. W. Warne & Co.\'s India Rubber Works,\nTottenham, Middlesex. "IMPROVEMENTS IN TOBACCO POUCHES."\nThis Invention relates to a novel construction of pouch to be used for the\nreception of tobacco, fusees, and other materials used by smokers, the object being\nto obtain a compact and secure receptacle for such articles.\nTo this end we cut from a sheet of plastic rubber compound 2 pieces of any\ndesired form, say semi elliptical, and of corresponding size, and place them together\nback to back so as to form a foundation or stiffening piece for the pouch. If\ndesired, we may insert at different places between these two pieces of rubber metal\nor other plates which will prevent the rubber from adhering together when pressure\nis applied, and thus one, two, or more pockets will be formed in which to place\ncigarette paper books or matches. On either side of this foundation piece a\nsimilarly shaped piece of plastic rubber compound is attached by its edges, and\ninto the spaces between the external pieces and the foundation plate a suitable\nquantity of water or other liquid is injected, the opening left for this purpose being\nimmediately closed up.\nA cap piece of sheet rubber of suitable form is next applied to the pouch, and is\nsecured to the straight edge thereof by solution, which, when pressure is applied,\ncauses the cap piece to adhere to the body of the pouch.\nSome powdered talc or other suitable material is inserted under the cap piece\nto prevent it adhering to the body of the pouch, except at those parts which have\nreceived the solution.
"""

In [None]:
# return the entities extracted from the example text by the entity model
recognizer(example, ent_model, ent_tokenizer)

Device set to use cpu


[{'class': 'PER', 'entity_text': 'James Burbridge', 'start': 19, 'end': 34},
 {'class': 'OCC', 'entity_text': 'Manager', 'start': 36, 'end': 43},
 {'class': 'FIRM',
  'entity_text': "Messrs. W. Warne and Co.'s",
  'start': 47,
  'end': 73},
 {'class': 'FIRM',
  'entity_text': 'India Rubber Works',
  'start': 74,
  'end': 92},
 {'class': 'ADD',
  'entity_text': 'Tottenham, Middlesex',
  'start': 94,
  'end': 114},
 {'class': 'PER', 'entity_text': 'James Burbridge', 'start': 216, 'end': 231},
 {'class': 'DATE',
  'entity_text': '26th February 1881.',
  'start': 285,
  'end': 304},
 {'class': 'PER', 'entity_text': 'JAMES BURBRIDGE', 'start': 305, 'end': 320},
 {'class': 'OCC', 'entity_text': 'Manager', 'start': 322, 'end': 329},
 {'class': 'FIRM',
  'entity_text': "Messrs. W. Warne & Co.'s",
  'start': 333,
  'end': 357},
 {'class': 'ADD',
  'entity_text': 'Tottenham, Middlesex.',
  'start': 378,
  'end': 399}]

In [None]:
# return the title extracted from the example text (same recognizer, title model)
recognizer(example, title_model, title_tokenizer)

Device set to use cpu


[{'class': 'TITLE',
  'entity_text': 'IMPROVE- MENTS IN TOBACCO-POUCHES',
  'start': 137,
  'end': 170}]

## Try our patent classification model

Our patent classification model is available at: <https://huggingface.co/matthewleechen/multilabel_patent_classifier>

In [None]:
class_model_repo = "matthewleechen/multilabel_patent_classifier" # multilabel patent classification model

# set classifier tokenizer + model
class_tokenizer = AutoTokenizer.from_pretrained(class_model_repo)
class_model = AutoModelForSequenceClassification.from_pretrained(class_model_repo)

In [None]:
# helper that returns the class labels predicted for a piece of text
def get_class_labels(text, threshold=0.5):
    """Return the labels whose classifier score for `text` is at least `threshold`.

    Uses the module-level `class_model` and `class_tokenizer` in a
    "text-classification" pipeline configured with top_k=None.
    """
    classifier = pipeline(task="text-classification", model=class_model, tokenizer=class_tokenizer, top_k=None)

    # the call yields a list of lists, e.g. [[{'label': 'LABEL1', 'score': 0.8}, ...]];
    # the first (and only) entry holds the label/score dicts for this text
    scored_labels = classifier(text)[0]

    # one score per label (a repeated label keeps the score seen last)
    score_by_label = {}
    for entry in scored_labels:
        label = entry['label']
        score = entry['score']
        score_by_label[label] = score

    # keep the labels that reach the threshold, in the order they were first seen
    selected = []
    for label in score_by_label:
        if score_by_label[label] >= threshold:
            selected.append(label)
    return selected

In [None]:
# feed a title as an example: the string is the title exactly as the title model
# returned it above, including the 'IMPROVE- MENTS' line-break hyphenation
example_title = "IMPROVE- MENTS IN TOBACCO-POUCHES"
labels = get_class_labels(example_title)
print(labels)

['Tobacco']
