In [None]:
#hide
!pip install -q transformers nbdev

[K     |████████████████████████████████| 51kB 3.2MB/s 
[K     |████████████████████████████████| 61kB 5.4MB/s 
[K     |████████████████████████████████| 61kB 5.6MB/s 
[?25h

In [None]:
# default_exp keyword_generators
# default_cls_lvl 3

# Keyword Generation
> Generate keywords from given input text

In [None]:
#export
from transformers import TextGenerationPipeline, TFAutoModelForPreTraining, TFBartForConditionalGeneration, BartTokenizer, pipeline

In [None]:
#export
class BartKeywordGenerator():
  """
  Bart based keyword generator using huggingface transformers.

  Loads a BART conditional-generation checkpoint as a TensorFlow model
  (converting from PyTorch weights via ``from_pt=True``), pairs it with its
  tokenizer in a ``"summarization"`` pipeline, and splits each generated
  summary on ``";"`` into a list of keywords.
  """
  def __init__(self, model_name, use_cuda=False):
    """
    Load the model and tokenizer and build the generation pipeline.

    Args:
      model_name: checkpoint name or path passed to ``from_pretrained``.
      use_cuda: if True the pipeline is created with ``device=0``,
        otherwise with ``device=-1``.
    """
    self.model_name = model_name
    self.model = TFBartForConditionalGeneration.from_pretrained(self.model_name, from_pt=True)
    self.tokenizer = BartTokenizer.from_pretrained(self.model_name)
    self.use_cuda = use_cuda
    self.device = 0 if use_cuda else -1
    self.keyword_generator = pipeline("summarization", model=self.model, tokenizer=self.tokenizer, device=self.device)

  @staticmethod
  def _split_keywords(summary_text):
    """Split a ";"-separated string into whitespace-stripped, non-empty keywords."""
    stripped_parts = (part.strip() for part in summary_text.split(";"))
    # Drop fragments that are empty after stripping (e.g. from a trailing ";").
    return [part for part in stripped_parts if part]

  def generate(self, text, max_length=50, **kwargs):
    """
    Generate keywords for ``text``.

    Args:
      text: input forwarded as-is to the pipeline call.
      max_length: forwarded to the pipeline as ``max_length``.
      **kwargs: additional arguments forwarded unchanged to the pipeline call.

    Returns:
      A list with one ``{"keywords": [...]}`` dict per pipeline result. Each
      keyword has surrounding whitespace removed and empty fragments are
      dropped.
    """
    generated_keywords = self.keyword_generator(text, max_length=max_length, **kwargs)
    return [
      {"keywords": self._split_keywords(result['summary_text'])}
      for result in generated_keywords
    ]

  def batch_generate(self, texts, batch_size=8, max_length=50, **kwargs):
    """
    Generate keywords for a sequence of texts, ``batch_size`` items at a time.

    Args:
      texts: sliceable sequence of inputs; each slice is passed to ``generate``.
      batch_size: maximum number of items per ``generate`` call.
      max_length: forwarded to ``generate``.
      **kwargs: additional arguments forwarded unchanged to ``generate``.

    Returns:
      The concatenated ``generate`` results, in input order. An empty
      ``texts`` yields an empty list.

    Raises:
      ValueError: if ``batch_size`` is smaller than 1.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently produces no batches, so reject both explicitly.
    if batch_size < 1:
      raise ValueError("batch_size must be a positive integer")

    keywords = []
    for start in range(0, len(texts), batch_size):
      batch = texts[start:start + batch_size]
      keywords.extend(self.generate(batch, max_length=max_length, **kwargs))
    return keywords

`BartKeywordGenerator` is the base class for keyword generators. It is built on the Hugging Face `transformers` library.

It has two methods:



1.   `generate()`: Given a text input, generates keywords. Extra keyword arguments follow the arguments of the transformers `.generate()` method.
2.   `batch_generate()`: Given a list of text inputs, first splits them into batches and then generates keywords for each batch.





In [None]:
#export
class ExtractiveKeywordGenerator(BartKeywordGenerator):
  """Extractive keyword generator backed by a BART model fine-tuned on the OpenKP dataset."""
  def __init__(self, use_cuda=False):
    # The checkpoint is fixed for this variant; all loading is delegated to the base class.
    super().__init__("ankur310794/bart-base-keyphrase-generation-openkp", use_cuda)

`ExtractiveKeywordGenerator` specializes `BartKeywordGenerator` for extractive keyword generation.

In [None]:
#export
class AbstractiveKeywordGenerator(BartKeywordGenerator):
  """Abstractive keyword generator backed by a BART model fine-tuned on the kpTimes dataset."""
  def __init__(self, use_cuda=False):
    # The checkpoint is fixed for this variant; all loading is delegated to the base class.
    super().__init__("ankur310794/bart-base-keyphrase-generation-kpTimes", use_cuda)


`AbstractiveKeywordGenerator` specializes `BartKeywordGenerator` for abstractive keyword generation.

In [None]:
# Build the extractive (OpenKP) generator with the default use_cuda=False, i.e. pipeline device -1.
extractive_generator = ExtractiveKeywordGenerator()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForConditionalGeneration: ['lm_head.weight']
- This IS expected if you are initializing TFBartForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [None]:
# Build the abstractive (kpTimes) generator with the default use_cuda=False, i.e. pipeline device -1.
abstractive_generator = AbstractiveKeywordGenerator()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForConditionalGeneration: ['lm_head.weight']
- This IS expected if you are initializing TFBartForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [None]:
# NOTE(review): `input_text` is not assigned in any cell shown in this notebook —
# define it in an earlier cell so the notebook survives Restart & Run All.
extractive_generator.generate(input_text)

[{'keywords': ['The death toll in Germany', ' Belgium', ' historic flood']}]

In [None]:
# Batch API: the two inputs fit in a single batch because batch_size defaults to 8.
extractive_generator.batch_generate([input_text, input_text])

[{'keywords': ['The death toll in Germany', ' Belgium', ' historic flood']},
 {'keywords': ['The death toll in Germany', ' Belgium', ' historic flood']}]

In [None]:
# Same input through the abstractive (kpTimes) generator, for comparison with the extractive result.
abstractive_generator.generate(input_text)

[{'keywords': ['Floods', 'Germany', 'Belgium', 'Europe']}]

In [None]:
# min_length, num_beams and early_stopping are not named parameters of generate();
# they travel through **kwargs to the underlying pipeline call.
extractive_generator.generate(input_text, min_length=10, num_beams=5, 
    early_stopping=True, max_length=50)

[{'keywords': ['The death toll in Germany', ' Belgium', ' historic flood']}]

In [None]:
# do_sample and top_k travel through **kwargs to the underlying pipeline call.
# NOTE(review): no random seed is set in this notebook, so sampled output may differ between runs — confirm.
extractive_generator.generate(input_text, do_sample=True, max_length=50, top_k=0)

[{'keywords': ['The death toll in Germany', ' Belgium', ' historic flood']}]