# Sentence Boilerplate

In [3]:
# Install spaCy and Hugging Face `datasets` into the kernel's environment (-q keeps pip quiet).
# NOTE(review): versions are unpinned; pin them if this notebook needs to be reproducible.
%pip install spacy datasets -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
from datasets import load_dataset
from collections import Counter
import re
import spacy

## Load spaCy

In [6]:
# Fetch the small English pipeline only when it is not installed yet.
# Going through spaCy's Python API (instead of `!python -m spacy download`)
# installs into the interpreter that runs this kernel, and the guard keeps
# Restart & Run All from re-downloading the wheel every time.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m497.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
# Load the small English pipeline with its default set of components.
nlp = spacy.load("en_core_web_sm")

## Load dataset

In [14]:
# Load the patent-description dataset by its identifier and keep a handle
# on the "train" split, which is what the rest of the notebook iterates over.
ds = load_dataset("mhurhangee/us-patent-descriptions")
train = ds["train"]

## Helper: normalize a sentence

In [18]:
# Anything that is neither a word character nor whitespace counts as punctuation.
punct_re = re.compile(r"[^\w\s]")
# One or more consecutive whitespace characters (spaces, tabs, newlines).
space_re = re.compile(r"\s+")

def normalize(sent: str) -> str:
    """Return a canonical form of ``sent`` suitable for exact-match counting.

    The text is lowercased, every character matched by ``punct_re`` is
    deleted, runs of whitespace are collapsed to a single space, and leading
    and trailing whitespace is trimmed.

    Punctuation is deleted *before* whitespace is collapsed and trimmed.
    Doing it the other way round leaves a double space where a free-standing
    mark used to be ("a - b" -> "a  b") and a trailing space after a detached
    full stop ("end ." -> "end "), so otherwise identical sentences would end
    up under different keys.

    Args:
        sent: Raw sentence text.

    Returns:
        The normalized sentence; an empty string if nothing but punctuation
        and whitespace was present.
    """
    sent = punct_re.sub("", sent.lower())
    return space_re.sub(" ", sent).strip()

## Collect all sentences in train

In [23]:
from tqdm.auto import tqdm

# Baseline pass: run every description through the pipeline one document at
# a time and keep the normalized sentences that have more than three tokens.
all_sents = []
for row in tqdm(train, total=len(train)):
    parsed = nlp(row["description_text"])
    normalized = (normalize(span.text) for span in parsed.sents)
    all_sents.extend(text for text in normalized if len(text.split()) > 3)


  0%|          | 0/10000 [00:00<?, ?it/s]

KeyboardInterrupt: 

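Very slow: running each document through the full pipeline would take 3 hours or more to analyse the dataset, so the run above was interrupted.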

#  Rewriting to improve speed

In [24]:
# Sentence splitting here relies only on the tokenizer plus the rule-based
# sentencizer, so none of the trained components are needed. Disabling just
# tagger/parser/ner would still leave tok2vec, attribute_ruler and the
# lemmatizer running on every document; excluding all of them means they are
# not even loaded.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
)
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7b724cb70310>

In [27]:
# Pull the raw descriptions into a list once so they can be streamed through
# nlp.pipe in batches spread over several worker processes.
texts = [row["description_text"] for row in train]

all_sents = []
docs = nlp.pipe(texts, batch_size=8, n_process=8)
for doc in tqdm(docs, total=len(texts)):
    candidates = map(normalize, (span.text for span in doc.sents))
    # ignore very short fragments (three tokens or fewer)
    all_sents.extend(text for text in candidates if len(text.split()) > 3)

  0%|          | 0/10000 [00:00<?, ?it/s]



In [28]:
# Tally how many times each normalized sentence occurs.
counter = Counter()
counter.update(all_sents)

In [32]:
# Show the ten most frequent normalized sentences as "count : sentence".
for sent, freq in counter.most_common(n=10):
    print(f"{freq} : {sent}")

349 : details are not described herein again
246 : as used herein the singular forms a an and the are intended to include the plural forms as well unless the context clearly indicates otherwise
189 : as used herein the term andor includes any and all combinations of one or more of the associated listed items
181 : these are of course merely examples and are not intended to be limiting
168 : this repetition is for the purpose of simplicity and clarity and does not in itself dictate a relationship between the various embodiments andor configurations discussed
158 : the spatially relative terms are intended to encompass different orientations of the device in use or operation in addition to the orientation depicted in the figures
157 : in addition the present disclosure may repeat reference numerals andor letters in the various examples
149 : the apparatus may be otherwise oriented rotated 90 degrees or at other orientations and the spatially relative descriptors used herein may likewise 

Good, but the normalisation is too aggressive (e.g. "and/or" collapses to "andor"). Instead, let's try paragraphs -> sentences -> n-grams, keeping normalisation minimal (or mapping the normalised text back to the original).

In [43]:
import random

# Eyeball one raw description picked at random from `texts`.
# NOTE(review): no seed is set in the visible cells, so the sample will differ
# between runs, and the whole document is echoed as the cell output.
random.choice(texts)

"DETAILED DESCRIPTION\n\nOverview\n\nThere can be a tradeoff between running workloads on cloud computing resources compared to edge computing resources, such as available processing power and latency. In some examples where a required response time is not very short or an amount of calculation required is extensively high, it can be clear that a given type of calculation should be performed on the cloud instead of the edge. However, in some examples, there can be a preference to running workloads on the edge to reduce networking bottlenecks and to distribute computation usage across edge nodes.\n\nWith respect to image processing systems, deciding whether to perform certain processing on the edge or on the cloud can vary according to specifics of a given image (e.g., a number of details in the image, or a number of areas of interest in the image). In some examples, a relatively high complexity of processing associated with an image can indicate that cloud processing is preferable to e