In [1]:
import neptune
from transformers import pipeline
from keybert import KeyBERT
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- initialize models
# get specified huggingface transformer pipeline
HF_MODEL_REFERENCE = os.environ.get('HF_MODEL_REFERENCE','sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
hf_pipeline = pipeline("feature-extraction", model=HF_MODEL_REFERENCE)

# initialize keybert model with huggingface pipeline backend
keybert_model = KeyBERT(model=hf_pipeline)

Downloading (…)lve/main/config.json: 100%|██████████| 645/645 [00:00<00:00, 73.6kB/s]
Downloading pytorch_model.bin: 100%|██████████| 471M/471M [00:05<00:00, 86.1MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 480/480 [00:00<00:00, 143kB/s]
Downloading tokenizer.json: 100%|██████████| 9.08M/9.08M [00:00<00:00, 64.6MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 69.5kB/s]


In [82]:
# --- create prediction files
sample_inputs = [
    '''Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs. It infers a
         function from labeled training data consisting of a set of training examples.
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal).
         A supervised learning algorithm analyzes the training data and produces an inferred function,
         which can be used for mapping new examples. An optimal scenario will allow for the
         algorithm to correctly determine the class labels for unseen instances. This requires
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).''',
    '''Überwachtes Lernen ist die maschinelle Lernaufgabe, eine Funktion zu lernen, die
         ordnet eine Eingabe einer Ausgabe basierend auf beispielhaften Eingabe-Ausgabe-Paaren zu. Es folgert a
         Funktion aus beschrifteten Trainingsdaten, die aus einer Reihe von Trainingsbeispielen bestehen.
         Beim überwachten Lernen ist jedes Beispiel ein Paar, das aus einem Eingabeobjekt besteht
         (typischerweise ein Vektor) und einem gewünschten Ausgangswert (auch Überwachungssignal genannt).
         Ein überwachter Lernalgorithmus analysiert die Trainingsdaten und erzeugt eine abgeleitete Funktion.
         die zum Mapping neuer Beispiele verwendet werden können. Ein optimales Szenario ermöglicht die
         Algorithmus, um die Klassenbezeichnungen für unsichtbare Instanzen korrekt zu bestimmen. Dafür braucht man
         den Lernalgorithmus zum Verallgemeinern der Trainingsdaten auf ungesehene Situationen in a
         'vernünftiger' Weg (siehe induktive Vorspannung).''',
    '''El aprendizaje supervisado es la tarea de aprendizaje automático de aprender una función que
         asigna una entrada a una salida en función de pares de entrada-salida de ejemplo. Se infiere un
         función a partir de datos de entrenamiento etiquetados que consisten en un conjunto de ejemplos de entrenamiento.
         En el aprendizaje supervisado, cada ejemplo es un par que consta de un objeto de entrada
         (típicamente un vector) y un valor de salida deseado (también llamado señal de supervisión).
         Un algoritmo de aprendizaje supervisado analiza los datos de entrenamiento y produce una función inferida,
         que se puede utilizar para mapear nuevos ejemplos. Un escenario óptimo permitirá que la
         algoritmo para determinar correctamente las etiquetas de clase para instancias no vistas. Esto requiere
         el algoritmo de aprendizaje para generalizar a partir de los datos de entrenamiento a situaciones no vistas en un
         manera 'razonable' (ver sesgo inductivo).'''
]

In [110]:
# tokenizer
tokenized_inputs = hf_pipeline.tokenizer(sample_inputs, truncation=True, padding=True) # will truncate to max token length of batch

for i in range(len(sample_inputs)):
    print(f"Sequence {i} length: {len(tokenized_inputs['input_ids'][i])}")
    print(f"Attention mask {i} length: {len(tokenized_inputs['attention_mask'][i])}")
    print(f"Attention mask: {tokenized_inputs['attention_mask'][i]}")

Sequence 0 length: 222
Attention mask 0 length: 222
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Sequence 1 length: 222
Attention mask 1 length: 222
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [99]:
# model
tokenized_inputs_tensor = hf_pipeline.tokenizer(sample_inputs, padding='max_length',return_tensors='pt')
model_predictions = hf_pipeline.model(**tokenized_inputs_tensor)

In [108]:
model_predictions.last_hidden_state.tolist()[0][0][:10]

[0.1341274082660675,
 -0.3512396216392517,
 -0.29578202962875366,
 -0.1361517608165741,
 0.30853715538978577,
 0.25779828429222107,
 0.5945106148719788,
 -0.08795639127492905,
 -0.37195488810539246,
 -0.19017291069030762]

In [88]:
# huggingface pipeline model
hf_pipeline_predictions = hf_pipeline(sample_inputs,  truncation=True, padding='max_length') # padding & truncation doesnt seem to affect sequence length dim of extracted features, which makes sense as no embedding applies for paddings

In [85]:
type(hf_pipeline_predictions), len(hf_pipeline_predictions)

(list, 3)

In [86]:
len(hf_pipeline_predictions[0]), len(hf_pipeline_predictions[1]), len(hf_pipeline_predictions[2]) 
# -> (n_batch x 1)

(1, 1, 1)

In [87]:
print(len(hf_pipeline_predictions[0][0]), type(hf_pipeline_predictions[0][0])) # 177
print(len(hf_pipeline_predictions[1][0]), type(hf_pipeline_predictions[1][0])) # 222
print(len(hf_pipeline_predictions[2][0]), type(hf_pipeline_predictions[2][0])) # 198
# -> n_batch x 1 x n_tokens x ?

177 <class 'list'>
222 <class 'list'>
198 <class 'list'>


In [42]:
print(len(hf_pipeline_predictions[0][0][0]), type(hf_pipeline_predictions[0][0][0])) # 384
print(len(hf_pipeline_predictions[1][0][0]), type(hf_pipeline_predictions[1][0][0])) # 384
print(len(hf_pipeline_predictions[2][0][0]), type(hf_pipeline_predictions[2][0][0])) # 384
# -> n_batch x 1 x n_tokens x 384

384 <class 'list'>
384 <class 'list'>
384 <class 'list'>


In [109]:
hf_pipeline_predictions[0][0][0][:10]

[0.1341274380683899,
 -0.3512396514415741,
 -0.2957819998264313,
 -0.13615182042121887,
 0.30853718519210815,
 0.25779861211776733,
 0.594510555267334,
 -0.08795640617609024,
 -0.3719547390937805,
 -0.1901727169752121]

In [59]:
hf_pipeline.tokenizer, hf_pipeline.model.parameters

(BertTokenizerFast(name_or_path='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}),
 <bound method Module.parameters of BertModel(
   (embeddings): BertEmbeddings(
     (word_embeddings): Embedding(250037, 384, padding_idx=0)
     (position_embeddings): Embedding(512, 384)
     (token_type_embeddings): Embedding(2, 384)
     (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): BertEncoder(
     (layer): ModuleList(
       (0-11): 12 x BertLayer(
         (attention): BertAttention(
           (self): BertSelfAttention(
             (query): Linear(in_feature

In [37]:
type(hf_pipeline_predictions[0][0][0][0])

float

In [92]:
# keybert model
keybert_predictions = keybert_model.extract_keywords(sample_inputs, keyphrase_ngram_range=(1, 1), stop_words=None)

In [93]:
keybert_predictions

[[('supervised', 0.4765),
  ('learning', 0.4647),
  ('instances', 0.4511),
  ('training', 0.4236),
  ('supervisory', 0.393)],
 [('trainingsbeispielen', 0.534),
  ('algorithmus', 0.5079),
  ('trainingsdaten', 0.4838),
  ('lernalgorithmus', 0.4746),
  ('lernen', 0.4611)],
 [('aprendizaje', 0.4988),
  ('algoritmo', 0.4943),
  ('entrenamiento', 0.4444),
  ('supervisado', 0.4184),
  ('aprender', 0.4124)]]