In [1]:
from transformers.pipelines import pipeline
from keybert import KeyBERT
from onclusiveml.ml_compile import CompiledPipeline
from onclusiveml.ml_models.keywords import CompiledKeyBERT
import torch_neuron

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Uncompiled Hugging Face feature-extraction pipeline for the multilingual MiniLM sentence-transformer checkpoint.
hf_pipeline = pipeline(
    task='feature-extraction',
    model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

In [3]:
# KeyBERT instance that embeds with the Hugging Face pipeline held in `hf_pipeline`.
keybert = KeyBERT(model=hf_pipeline)

In [4]:
# Reuse a previously exported 'compiled_pipeline_short' artifact when it can be loaded;
# otherwise compile it from `hf_pipeline` (max_length=25, batch_size=1) and export it.
try:
    compiled_pipeline_short = CompiledPipeline.from_pretrained('compiled_pipeline_short')
    print('Reloaded')
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt/SystemExit
    # propagate: interrupting the cell must not trigger a recompilation.
    # Surface the reason for the fallback instead of silently swallowing it.
    print(f'Reload failed: {load_error!r}')
    print('Recompiling')
    compiled_pipeline_short = CompiledPipeline.from_pipeline(
        pipeline=hf_pipeline,
        max_length=25,
        batch_size=1,
        neuron=True,
        validate_compilation=False,
    )
    compiled_pipeline_short.save_pretrained('compiled_pipeline_short')
    print('Recompiled and exported')

Reloaded


In [5]:
# Reuse a previously exported 'compiled_pipeline_medium' artifact when it can be loaded;
# otherwise compile it from `hf_pipeline` (max_length=250, batch_size=1) and export it.
try:
    compiled_pipeline_medium = CompiledPipeline.from_pretrained('compiled_pipeline_medium')
    print('Reloaded')
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt/SystemExit
    # propagate: interrupting the cell must not trigger a recompilation.
    # Surface the reason for the fallback instead of silently swallowing it.
    print(f'Reload failed: {load_error!r}')
    print('Recompiling')
    compiled_pipeline_medium = CompiledPipeline.from_pipeline(
        pipeline=hf_pipeline,
        max_length=250,
        batch_size=1,
        neuron=True,
        validate_compilation=False,
    )
    compiled_pipeline_medium.save_pretrained('compiled_pipeline_medium')
    print('Recompiled and exported')

Reloaded


In [6]:
# Reuse a previously exported 'compiled_pipeline_long' artifact when it can be loaded;
# otherwise compile it from `hf_pipeline` (max_length=500, batch_size=1) and export it.
try:
    compiled_pipeline_long = CompiledPipeline.from_pretrained('compiled_pipeline_long')
    print('Reloaded')
except Exception as load_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt/SystemExit
    # propagate: interrupting the cell must not trigger a recompilation.
    # Surface the reason for the fallback instead of silently swallowing it.
    print(f'Reload failed: {load_error!r}')
    print('Recompiling')
    compiled_pipeline_long = CompiledPipeline.from_pipeline(
        pipeline=hf_pipeline,
        max_length=500,
        batch_size=1,
        neuron=True,
        validate_compilation=False,
    )
    compiled_pipeline_long.save_pretrained('compiled_pipeline_long')
    print('Recompiled and exported')

Reloaded


In [7]:
# Sample documents of increasing length, used as inputs for the keyword-extraction benchmark.
# NOTE(review): `short_medium` lacks the `_doc` suffix the other four variables carry.
short_doc = "I'm reading a book on the history of glue. I just can't seem to put it down."
short_medium = "Flowers are not only visually pleasing, but they are also used to enhance the aesthetic appeal of homes and gardens. Flowers are a symbol of love and affection and are often given as gifts to loved ones. The fragrance of flowers is known to lift moods and calm anxiety. In addition, flowers are essential to pollination, a process that helps in the reproduction of plants. Overall, flowers are beautiful, therapeutic, and essential to the ecosystem."
medium_doc = """Flowers come in different shapes, sizes, and colors, and their beauty has captivated humans for centuries. Apart from their aesthetic appeal, flowers are also essential to the ecosystem, playing a vital role in pollination necessary for the growth of crops and maintaining genetic diversity of plant species.
Flowers have therapeutic properties and are used in traditional medicine for centuries. Chamomile, lavender, and marigold have soothing effects on the mind and body. The fragrance of flowers is calming, and they are used in aromatherapy worldwide.
Flowers have cultural and emotional significance, often used in religious ceremonies, weddings, and funerals. Different flowers have symbolic meanings, such as lilies representing purity and innocence. Flowers are also a significant part of home decor, adding charm and personality to homes and gardens.
In conclusion, flowers have multiple dimensions of significance."""
medium_long_doc = """Flowers are nature's way of displaying beauty and elegance. They come in various colors, shapes, and sizes, and their attractiveness has fascinated humans for centuries. However, flowers have more to offer than their aesthetic value, and they play a crucial role in the ecosystem.
Flowers are the reproductive organs of plants, and they play a vital role in pollination. They produce nectar and pollen that attract pollinators such as bees, butterflies, and hummingbirds, who transfer pollen from the stamen to the stigma of the flowers. This process helps in the fertilization of plants and is essential for the growth of crops and other vegetation. Flowers also ensure the genetic diversity of plant species.
For example, roses are associated with love and passion, while lilies represent purity and innocence. Flowers are also used to express emotions like joy, gratitude, and sympathy.
Moreover, flowers have an essential aesthetic value. They add color, charm, and personality to homes and gardens. The beauty of flowers is mesmerizing and captivating, and it has a calming effect on the mind and soul. Flowers arrangements, whether simple or elaborate, have a significant impact on the overall aesthetic appeal of a space."""
long_doc = """Flowers are nature's way of displaying beauty and elegance. They come in various colors, shapes, and sizes, and their attractiveness has fascinated humans for centuries. However, flowers have more to offer than their aesthetic value, and they play a crucial role in the ecosystem.
Flowers are the reproductive organs of plants, and they play a vital role in pollination. They produce nectar and pollen that attract pollinators such as bees, butterflies, and hummingbirds, who transfer pollen from the stamen to the stigma of the flowers. This process helps in the fertilization of plants and is essential for the growth of crops and other vegetation. Flowers also ensure the genetic diversity of plant species.
Apart from their ecological significance, flowers have numerous medicinal benefits. Many flowers have therapeutic properties that have been used in traditional medicine for centuries. For instance, chamomile is known for its calming effects and is used to treat anxiety and insomnia. Lavender is also used to promote relaxation, relieve pain, and improve sleep quality. Flowers like calendula have anti-inflammatory and antiseptic properties and can be used to treat skin infections and wounds.
Moreover, flowers have an essential aesthetic value. They add color, charm, and personality to homes and gardens. The beauty of flowers is mesmerizing and captivating, and it has a calming effect on the mind and soul. Flowers arrangements, whether simple or elaborate, have a significant impact on the overall aesthetic appeal of a space.
In conclusion, flowers are not just pretty things to look at. They have significant ecological, medicinal, cultural, and aesthetic values. They play a vital role in the ecosystem, ensuring the growth of crops and maintaining genetic diversity of plant species. Flowers also have therapeutic properties and are used in traditional medicine worldwide. They are an essential part of cultural traditions and are used to express emotions. The beauty and charm of flowers are mesmerizing and captivating, and it adds personality and charm to homes and gardens. Flowers are more than just plants; they are essential to human life and bring happiness and joy to those who admire them."""

# Each benchmark scenario maps to a batch holding three copies of the same document.
docs = {
    scenario: [text] * 3
    for scenario, text in (
        ('doc_short', short_doc),
        ('doc_short_medium', short_medium),
        ('doc_medium', medium_doc),
        ('doc_medium_long', medium_long_doc),
        ('doc_long', long_doc),
    )
}

# Whitespace-delimited word count of the first document in each scenario's batch.
for doc_batch in docs.values():
    word_count = len(doc_batch[0].split())
    print(word_count)

17
75
135
194
347


In [8]:
# Number of input ids the pipeline's tokenizer produces (truncation enabled) for the
# first document in each scenario's batch.
for doc_batch in docs.values():
    encoded = hf_pipeline.tokenizer(doc_batch[0], truncation=True)
    print(len(encoded['input_ids']))

26
109
202
288
497


In [13]:
# KeyBERT variants under comparison. Every CompiledKeyBERT variant uses
# `compiled_pipeline_short` for word embeddings and differs only in its document pipeline.
keybert_samples = {'uncompiled': keybert}
keybert_samples.update(
    (
        label,
        CompiledKeyBERT(
            document_pipeline=document_pipeline,
            compiled_word_pipeline=compiled_pipeline_short,
        ),
    )
    for label, document_pipeline in (
        ('hybrid', hf_pipeline),
        ('compiled_short', compiled_pipeline_short),
        ('compiled_medium', compiled_pipeline_medium),
        ('compiled_long', compiled_pipeline_long),
    )
)

# Display the `tracing__max_length` compilation spec of the short, medium and long pipelines.
tuple(
    compiled.compiled_pipeline.model.compilation_specs['tracing__max_length']
    for compiled in (
        compiled_pipeline_short,
        compiled_pipeline_medium,
        compiled_pipeline_long,
    )
)

(25, 250, 500)

In [14]:
# Result store: one initially empty dict per KeyBERT variant, plus one under 'n_tokens'.
keybert_results = {
    result_key: {}
    for result_key in (
        'uncompiled',
        'hybrid',
        'compiled_short',
        'compiled_medium',
        'compiled_long',
        'n_tokens',
    )
}

In [20]:
import time

# Number of timed repetitions averaged for every (document, pipeline) combination.
n_runs = 50

# Benchmark: for each document scenario, time every KeyBERT variant and store the
# mean duration of one `extract_keywords` call in `keybert_results`.
for doc_scenario in docs.keys():

    print(f'Document: {doc_scenario}')
    # Record the tokenized length of the scenario's document alongside the timings.
    keybert_results['n_tokens'][doc_scenario] = len(hf_pipeline.tokenizer(docs[doc_scenario][0],truncation=True)['input_ids'])

    for keybert_scenario in keybert_samples.keys():

        print(f'Pipeline: {keybert_scenario}')

        # perf_counter() is a monotonic, high-resolution clock meant for measuring
        # intervals; time.time() is a wall clock that can jump when the system clock
        # is adjusted and may have coarser resolution.
        start = time.perf_counter()

        for _ in range(n_runs):
            keywords = keybert_samples[keybert_scenario].extract_keywords(docs[doc_scenario])

        # Mean seconds per call over the n_runs repetitions.
        keybert_results[keybert_scenario][doc_scenario] = (time.perf_counter() - start) / n_runs

        print(f'Pipeline {keybert_scenario} duration: {keybert_results[keybert_scenario][doc_scenario]}')
        print(f'Pipeline {keybert_scenario} keywords: {keywords}')
        print('---------------------------------------------------------------')
        # Pause between pipelines (outside the timed region) — presumably a cooldown; confirm intent.
        time.sleep(2)

    print('=============================================================')

Document: doc_short
Pipeline: uncompiled
Pipeline uncompiled duration: 0.07366365432739258
Pipeline uncompiled keywords: [[('glue', 0.7159), ('book', 0.3394), ('history', 0.3388), ('reading', 0.1668), ('just', 0.1213)], [('glue', 0.7159), ('book', 0.3394), ('history', 0.3388), ('reading', 0.1668), ('just', 0.1213)], [('glue', 0.7159), ('book', 0.3394), ('history', 0.3388), ('reading', 0.1668), ('just', 0.1213)]]
---------------------------------------------------------------
Pipeline: hybrid
Pipeline hybrid duration: 0.054367995262145995
Pipeline hybrid keywords: [[('glue', 0.7157), ('book', 0.3398), ('history', 0.3389), ('reading', 0.1663), ('just', 0.1214)], [('glue', 0.7157), ('book', 0.3398), ('history', 0.3389), ('reading', 0.1663), ('just', 0.1214)], [('glue', 0.7157), ('book', 0.3398), ('history', 0.3389), ('reading', 0.1663), ('just', 0.1214)]]
---------------------------------------------------------------
Pipeline: compiled_short
Pipeline compiled_short duration: 0.0309437561

In [21]:
import pandas as pd

# Tabulate the collected results with document scenarios as rows; only the columns
# listed here are kept ('compiled_short' is not among them).
results_df = pd.DataFrame(keybert_results).loc[
    :, ['uncompiled', 'hybrid', 'compiled_medium', 'compiled_long', 'n_tokens']
]
results_df

Unnamed: 0,uncompiled,hybrid,compiled_medium,compiled_long,n_tokens
doc_short,0.073664,0.054368,0.077978,0.183859,26
doc_short_medium,0.340919,0.199799,0.181506,0.290036,109
doc_medium,0.710269,0.426113,0.325059,0.422534,202
doc_medium_long,0.892405,0.525907,0.392816,0.490599,288
doc_long,1.289443,0.76224,0.534767,0.634011,497


The above shows the following:
- a completely `uncompiled` keybert is not competitive with either `hybrid` or `compiled_long` keybert for non-trivial documents of token length >= ~100
    - since uncompiled pipelines are competitive with compiled pipelines for all documents **except** very short or very long ones (see `benchmarking_pipelines.ipynb`), this is largely driven by the embedding of keywords being much faster (3x - 4x) on the compiled word embedding pipeline
- `hybrid` keybert is competitive with a fully compiled `compiled_long` keybert on all but long documents, and outperforms it at token length ~ 100
    - compilation of the document embedding pipeline is therefore **sensible** if we expect the vast majority of **documents coming through to have roughly 200 tokens or more**
    - compilation of the document embedding pipeline is actually **beneficial** if we expect the vast majority of **documents coming through to have roughly 450 tokens or more**