In [1]:
!pip install -U sacremoses
!pip install huggingface_hub


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
from transformers import pipeline
import pandas as pd

class PipelineNotLoadedError(Exception):
    """Signals that a pipeline was addressed by name but is not currently loaded."""

    def __init__(self, message):
        # A message is mandatory; it is forwarded untouched so that
        # str(error) and error.args expose exactly what the caller passed.
        Exception.__init__(self, message)


class NlpEnricher:
    """
    Holds a set of Hugging Face ``transformers`` pipelines and runs every
    loaded pipeline over a given text.

    Typical usage:
        enricher = NlpEnricher()
        enricher.load_pipelines(NlpEnricher.NER, NlpEnricher.EMOTIONS)
        results = enricher.infer("some text")

    Pipelines are kept in a dict keyed by the public pipeline names below,
    so they are run in the order in which they were loaded.
    """

    # Public pipeline names accepted by load_pipelines() / unload_pipeline()
    # and used as keys of the dict returned by infer().
    EMOTIONS = 'emotions'
    NER = 'ner'
    KEYPHRASES = 'keyphrases'
    SUMMARIZATION = 'summarization'

    EMOTIONS_MODEL = 'SamLowe/roberta-base-go_emotions'
    EMOTIONS_TASK = 'text-classification'
    # Model and revision that transformers picked as the summarization default
    # when none was given; pinned so results do not drift with library defaults
    # and the "No model was supplied" warning goes away.
    SUMMARIZATION_MODEL = 'sshleifer/distilbart-cnn-12-6'
    SUMMARIZATION_REVISION = 'a4f8f3e'
    SUMMARIZATION_TASK = 'summarization'
    KEYPHRASES_MODEL = 'yanekyuk/camembert-keyword-extractor'
    KEYPHRASES_TASK = 'token-classification'
    KEYPHRASES_AGGREGATION_STRATEGY = 'simple'
    # Same reasoning as for summarization: the former implicit NER default.
    NER_MODEL = 'dbmdz/bert-large-cased-finetuned-conll03-english'
    NER_REVISION = '4c53496'
    NER_AGGREGATION_STRATEGY = 'simple'
    NER_TASK = 'ner'

    def __init__(self):
        """Create an enricher with no pipelines loaded."""
        self._pipelines = dict()

    def load_pipelines(self, *pipeline_names):
        """
        Load the requested pipelines for NLP enrichment.

        Param *pipeline_names is an arbitrary number of strings naming the
        pipelines to load. Each one must be one of:
        - NlpEnricher.EMOTIONS for classifying texts by expressed emotion
        - NlpEnricher.NER for named entity recognition
        - NlpEnricher.KEYPHRASES for keyphrase extraction
        - NlpEnricher.SUMMARIZATION for text summarization

        Loading a name that is already loaded replaces the stored pipeline
        with a freshly created one.

        Raises ValueError if any name is not one of the values above. All
        names are validated before anything is loaded, so a typo never
        results in a partially loaded set or a wasted model download.
        """
        loaders = {
            NlpEnricher.EMOTIONS: self._load_emotions_pipeline,
            NlpEnricher.NER: self._load_ner_pipeline,
            NlpEnricher.KEYPHRASES: self._load_keyphrases_pipeline,
            NlpEnricher.SUMMARIZATION: self._load_summarization_pipeline,
        }

        # The isinstance check also keeps unhashable arguments (e.g. a list
        # passed instead of unpacked names) from blowing up the dict lookup.
        unknown = [name for name in pipeline_names
                   if not isinstance(name, str) or name not in loaders]
        if unknown:
            raise ValueError(
                f"Unknown pipeline name(s): {unknown!r}; "
                f"expected any of {sorted(loaders)!r}")

        for name in pipeline_names:
            self._pipelines[name] = loaders[name]()

    def unload_pipeline(self, pipeline_name):
        """
        Unload a single, previously loaded pipeline.

        Param pipeline_name is the name of the pipeline to drop, i.e. one of
        NlpEnricher.EMOTIONS, NlpEnricher.NER, NlpEnricher.KEYPHRASES or
        NlpEnricher.SUMMARIZATION.

        Raises PipelineNotLoadedError if no pipeline is loaded under that name.
        """
        if pipeline_name not in self._pipelines:
            raise PipelineNotLoadedError(f"Pipeline '{pipeline_name}' is not loaded")
        del self._pipelines[pipeline_name]

    def infer(self, text):
        """
        Run every loaded pipeline on `text`.

        Prints a progress line per pipeline and returns a dict mapping each
        loaded pipeline name to whatever that pipeline returned for `text`.
        The dict is empty when no pipeline is loaded.
        """
        inference_results = dict()
        # `runner` rather than `pipeline`, so the imported transformers
        # `pipeline` factory is not shadowed inside this method.
        for pipeline_name, runner in self._pipelines.items():
            print(f"running pipeline '{pipeline_name}'")
            inference_results[pipeline_name] = runner(text)
        return inference_results

    def _load_emotions_pipeline(self):
        """Build the text-classification pipeline used for emotions."""
        return pipeline(NlpEnricher.EMOTIONS_TASK,
                        model=NlpEnricher.EMOTIONS_MODEL)

    def _load_ner_pipeline(self):
        """Build the named-entity-recognition pipeline with a pinned model."""
        return pipeline(NlpEnricher.NER_TASK,
                        model=NlpEnricher.NER_MODEL,
                        revision=NlpEnricher.NER_REVISION,
                        aggregation_strategy=NlpEnricher.NER_AGGREGATION_STRATEGY)

    def _load_summarization_pipeline(self):
        """Build the summarization pipeline with a pinned model."""
        return pipeline(NlpEnricher.SUMMARIZATION_TASK,
                        model=NlpEnricher.SUMMARIZATION_MODEL,
                        revision=NlpEnricher.SUMMARIZATION_REVISION)

    def _load_keyphrases_pipeline(self):
        """Build the token-classification pipeline used for keyphrases."""
        return pipeline(NlpEnricher.KEYPHRASES_TASK,
                        model=NlpEnricher.KEYPHRASES_MODEL,
                        aggregation_strategy=NlpEnricher.KEYPHRASES_AGGREGATION_STRATEGY)



In [3]:
# Hugging Face login is left disabled: in the recorded run the models were
# downloaded without a token (authentication is reported as optional for
# public models).
#!huggingface-cli login
# Short single-paragraph sample used as input for all pipelines.
text = "Schwarzenegger began lifting weights at age 15 and won the Mr. Universe title aged 20, and subsequently the Mr. Olympia title seven times. He is tied with Phil Heath for the joint-second number of all-time Mr. Olympia wins, behind Ronnie Coleman and Lee Haney, who are joint-first with eight wins each. Nicknamed the 'Austrian Oak' in his bodybuilding days, he is regarded as one of the greatest bodybuilders of all time."

# `enricher` is reused by later cells, so the pipelines are loaded only once here.
enricher = NlpEnricher()
# infer() runs the pipelines in the order they are listed in this call.
enricher.load_pipelines(NlpEnricher.KEYPHRASES, NlpEnricher.SUMMARIZATION, NlpEnricher.EMOTIONS, NlpEnricher.NER)
# Last expression of the cell: the result dict is displayed as the cell output.
enricher.infer(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/299 [00:00<?, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Device set to use cuda:0


running pipeline 'keyphrases'


Your max_length is set to 142, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


running pipeline 'summarization'
running pipeline 'emotions'
running pipeline 'ner'


{'keyphrases': [{'entity_group': 'KEY',
   'score': 0.990744,
   'word': 'Schwarzenegger',
   'start': 0,
   'end': 14},
  {'entity_group': 'KEY',
   'score': 0.95067453,
   'word': 'Phil Heath',
   'start': 155,
   'end': 165},
  {'entity_group': 'KEY',
   'score': 0.992785,
   'word': 'Ronnie Coleman',
   'start': 231,
   'end': 245},
  {'entity_group': 'KEY',
   'score': 0.99233603,
   'word': 'Lee Haney',
   'start': 250,
   'end': 259},
  {'entity_group': 'KEY',
   'score': 0.65014815,
   'word': 'bodybuil',
   'start': 339,
   'end': 347}],
 'summarization': [{'summary_text': " Schwarzenegger began lifting weights at age 15 and won the Mr. Universe title aged 20 . Nicknamed the 'Austrian Oak' in his bodybuilding days, he is regarded as one of the greatest bodybuilders of all time . He is tied with Phil Heath for the joint-second number of all-time Mr. Olympia wins, behind Ronnie Coleman and Lee Haney ."}],
 'emotions': [{'label': 'admiration', 'score': 0.6491815447807312}],
 'ner

In [7]:
# Longer, multi-paragraph news sample. Relies on `enricher` (with its
# pipelines already loaded) from the earlier cell.
# The stray `enricher.infer(text)` line that had been pasted inside the string
# literal was removed, so the models no longer receive source code as input.
text = """
Trump is gutting an agency that his daughter once champione
Trump’s untrimmed use of executive power to throttle agencies enshrined in the law, dismiss staff and halt spending already approved by Congress is raising alarms that he’s openly defying the Constitution, seizing power that the presidency does not have.
But the role of Musk, with his unelected power, is unprecedented. The richest man in the world is firing or suspending government workers, destroying US soft global power, and accessing data and private information about potentially millions of Americans — all with zero accountability.

Multiple courts have now stepped in to temporarily halt Trump and Musk’s plans. But everything is trending toward one of the most significant showdowns over the scope of presidential power in modern history, which is destined for a Supreme Court whose conservative majority has an expansive view of executive authority.

Plenty of people have predicted that Musk and Trump are headed for a breakup given their volatile personalities and need to be the alpha dog in every room. But Trump is showing no public sign of tiring of the Tesla pioneer, who is acting as the lead agent in the president’s revenge and disruption agenda. The president even shrugged off a Time Magazine cover that showed Musk behind the Oval Office desk.
"""

enricher.infer(text)

running pipeline 'keyphrases'
running pipeline 'summarization'
running pipeline 'emotions'
running pipeline 'ner'


{'keyphrases': [{'entity_group': 'KEY',
   'score': 0.79444784,
   'word': 'Trump',
   'start': 1,
   'end': 6},
  {'entity_group': 'KEY',
   'score': 0.766516,
   'word': 'Trump',
   'start': 61,
   'end': 66},
  {'entity_group': 'KEY',
   'score': 0.95258135,
   'word': 'Musk',
   'start': 332,
   'end': 336},
  {'entity_group': 'KEY',
   'score': 0.8157873,
   'word': 'Trump',
   'start': 660,
   'end': 665},
  {'entity_group': 'KEY',
   'score': 0.7964475,
   'word': 'Musk',
   'start': 670,
   'end': 674},
  {'entity_group': 'KEY',
   'score': 0.94288236,
   'word': 'Musk',
   'start': 962,
   'end': 966},
  {'entity_group': 'KEY',
   'score': 0.64601934,
   'word': 'Trump',
   'start': 971,
   'end': 976},
  {'entity_group': 'KEY',
   'score': 0.83704436,
   'word': 'Trump',
   'start': 1085,
   'end': 1090},
  {'entity_group': 'KEY',
   'score': 0.9089399,
   'word': 'Tesla',
   'start': 1134,
   'end': 1139},
  {'entity_group': 'KEY',
   'score': 0.84569967,
   'word': 'Time Ma