# **NLP Tasks with Transformers using HuggingFace**
- Sentiment analysis
- Text generation
- Name entity recognition (NER)
- Question answering
- Filling masked text
- Summarization
- Translation
- Feature extraction

In [20]:
!pip install torch===1.5.0 torchvision===0.6.0 -f https://download.pytorch.org/whl/torch_stable.html
!pip install transformers
!pip install TensorFlow
!pip install HuggingFace

from transformers import pipeline   ,AutoTokenizer, AutoModelForSequenceClassification

Looking in links: https://download.pytorch.org/whl/torch_stable.html


# **Sentiment analysis** 


In [5]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
#model="nlptown/bert-base-multilingual-uncased-sentiment"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


classifier = pipeline('sentiment-analysis',model=model,tokenizer = tokenizer) # By default, the model downloaded for this pipeline is called “distilbert-base-uncased-finetuned-sst-2-english”
results = classifier(["We are very happy to show you the Transformers library.","We hope you don't hate it.",'انا سعيد اليوم','انا حزين','je suis heureux ',"je suis très triste"])
results


[{'label': 'POSITIVE', 'score': 0.999799370765686},
 {'label': 'NEGATIVE', 'score': 0.5308594107627869},
 {'label': 'NEGATIVE', 'score': 0.620414137840271},
 {'label': 'NEGATIVE', 'score': 0.5622567534446716},
 {'label': 'POSITIVE', 'score': 0.9116100072860718},
 {'label': 'NEGATIVE', 'score': 0.9617950916290283}]

# **Text generation**

In [6]:

generator_text = pipeline('text-generation')

print(generator_text("hello how are",max_length=100, do_sample=False))
print(generator_text('comment jouer avec  ',max_length=100,do_sample=False))
print(generator_text("مرحبا كيف",max_length=100, do_sample=False))


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[{'generated_text': "hello how are you doing?\n\nI'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm doing great. I'm"}]


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[{'generated_text': 'comment jouer avec   une répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répondit de la répond'}]
[{'generated_text': 'مرحبا كيفته بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن بن'}]


 # **Name entity recognition**

O, Outside of a named entity

B-MIS, Beginning of a miscellaneous entity right after another miscellaneous entity

I-MIS, Miscellaneous entity

B-PER, Beginning of a person’s name right after another person’s name

I-PER, Person’s name

B-ORG, Beginning of an organisation right after another organisation

I-ORG, Organisation

B-LOC, Beginning of a location right after another location

I-LOC, Location

In [7]:
ner = pipeline("ner")

sequence_en = "linkedIn est un réseau social professionnel en ligne créé en 2002 à Mountain View en Californie. L'entreprise est valorisée à 20 milliards de dollars en 2015.Le 13 juin 2016, Microsoft annonce le rachat du réseau social pour un montant de 26,2 milliards de dollars américains soit 23,4 milliards d'euros"
ner(sequence_en)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




[{'entity': 'I-LOC', 'score': 0.9985359907150269, 'word': 'Mountain'},
 {'entity': 'I-LOC', 'score': 0.9992014169692993, 'word': 'View'},
 {'entity': 'I-LOC', 'score': 0.998285710811615, 'word': 'Cal'},
 {'entity': 'I-LOC', 'score': 0.948521614074707, 'word': '##if'},
 {'entity': 'I-LOC', 'score': 0.8773481845855713, 'word': '##orn'},
 {'entity': 'I-LOC', 'score': 0.8995317220687866, 'word': '##ie'},
 {'entity': 'I-ORG', 'score': 0.9989840984344482, 'word': 'Microsoft'}]

# **Question answering**

In [8]:

question_answerer = pipeline('question-answering')

context = "Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a question answering dataset is the SQuAD dataset, which is entirely based on that task."

# testing questions
question_1 = "What is extractive question answering ?"
question_2 = "What is a good example of a question answering dataset?"

# Generating answers
print(question_answerer(question=question_1, context=context))
print(question_answerer(question=question_2, context=context))


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…


{'score': 0.5899808892383618, 'start': 33, 'end': 95, 'answer': 'the task of extracting an answer from a text given a question.'}
{'score': 0.4760158088718782, 'start': 146, 'end': 160, 'answer': 'SQuAD dataset,'}


In [9]:
context_ar = "الحج هو الركن الخامس من أركان الإسلام، لقول النبي محمد: 'بني الإسلام على خمس: شهادة أن لا إله إلا الله وأن محمداً رسول الله، وإقام الصلاة، وإيتاء الزكاة، وصوم رمضان، وحج البيت من استطاع إليه سبيلاً'"

answer_question_ar = question_answerer(question = "ما هو الحج ؟", context = context_ar)
answer_question_ar


{'answer': 'هو الركن الخامس',
 'end': 20,
 'score': 0.005503810875175097,
 'start': 5}

In [10]:
context_fr = "Wikipédia en français est l'édition de Wikipédia en langue française. Elle est fondée le 23 mars 2001, deux mois après la création officielle de Wikipédia."

answer_question_fr = question_answerer(question = "Quand est ce que la wikipédia a été fondée ?", context = context_fr)
answer_question_fr

{'answer': 'le 23 mars 2001,',
 'end': 102,
 'score': 0.35677178492711903,
 'start': 86}

# **Translation**

In [11]:

en_to_fr = pipeline("translation_en_to_fr")
result_fr = en_to_fr("Hugging Face is a technology company based in New York and Paris", max_length=40)
result_fr

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




[{'translation_text': 'Hugging Face est une entreprise technologique basée à New York et à Paris.'}]

In [12]:
# translation from english to romanian
translator = pipeline("translation_en_to_ro")
print(translator("Wikipédia est un projet d’encyclopédie collective en ligne, universelle, multilingue et fonctionnant sur le principe du wiki. Ce projet vise à offrir un contenu librement réutilisable, objectif et vérifiable, que chacun peut modifier et améliorer.", max_length=86))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




Your input_length: 85 is bigger than 0.9 * max_length: 86. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


[{'translation_text': 'Wikipédia este un proiect d’encyclopédie collective en ligne, universelle, multilingue şi fonctionnant sur le principe du wiki.'}]


# **Summarization**

In [None]:
summarizer = pipeline("summarization")
ARTICLE="Wikipédia est un projet d’encyclopédie collective en ligne, universelle, multilingue et fonctionnant sur le principe du wiki. Ce projet vise à offrir un contenu librement réutilisable, objectif et vérifiable, que chacun peut modifier et améliorer."
print(summarizer(ARTICLE, max_length=84, min_length=30, do_sample=False))

# **Feature extraction**

In [14]:
feature_extraction = pipeline('feature-extraction', model="distilroberta-base", tokenizer="distilroberta-base")
features = feature_extraction("i am Phd Student.")

print(features[0])
print(len(features[0])) 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…


[[-0.034597571939229965, 0.10583612322807312, 0.01171906478703022, -0.09092150628566742, 0.09974636137485504, -0.06182561814785004, -0.03564837574958801, 0.08272474259138107, 0.07069457322359085, -0.06833191215991974, -0.06463196873664856, 0.05068853870034218, 0.04042191058397293, -0.03432275727391243, 0.050665538758039474, -0.008323732763528824, -0.016973381862044334, 0.03395700827240944, -0.02754192240536213, -0.020276982337236404, -0.05156334862112999, 0.004505281336605549, -0.0370616689324379, 0.11891534179449081, -0.008486563339829445, 0.06446423381567001, 0.1302662044763565, 0.04592787101864815, -0.03173253685235977, 0.01429057028144598, -0.0048749735578894615, -0.06148260086774826, 0.02869199775159359, 0.05718902871012688, -0.010263791307806969, 0.0694044828414917, 0.05215027555823326, 0.05615502968430519, -0.02472902461886406, 0.09541488438844681, -0.0025344309397041798, 0.046452272683382034, 0.05056187883019447, -0.00900201965123415, -0.029209813103079796, -0.0123913343995809

# **Fill mask**

In [15]:
fill_mask = pipeline("fill-mask")
result = fill_mask(f"I want to {fill_mask.tokenizer.mask_token} English.")
result

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




[{'score': 0.535076916217804,
  'sequence': '<s> I want to learn English.</s>',
  'token': 1532},
 {'score': 0.36233922839164734,
  'sequence': '<s> I want to speak English.</s>',
  'token': 1994},
 {'score': 0.029888825491070747,
  'sequence': '<s> I want to teach English.</s>',
  'token': 6396},
 {'score': 0.012146448716521263,
  'sequence': '<s> I want to write English.</s>',
  'token': 3116},
 {'score': 0.008391833864152431,
  'sequence': '<s> I want to study English.</s>',
  'token': 892}]

In [19]:
!pip install watermark
%load_ext watermark
%watermark -v -m -p transforms 


The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.7.11
IPython version      : 5.5.0

transforms: not installed

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

