# Basic NLP tasks using Hugging Face Transformers
This notebook contains examples of NLP tasks like:  
- text summarization
- text classification
- machine translation
- question answering
- named entity recognition
- text generation and prompt engineering

In [None]:
import numpy as np
import pandas as pd
import textwrap

from transformers import pipeline

In [None]:
import logging

# Raise the "transformers" logger threshold to ERROR so that lower-severity
# messages from the library do not clutter the cell outputs.
logging.getLogger("transformers").setLevel(logging.ERROR)

## Text summarization

In [None]:
# Load the BBC articles from a CSV file (path is relative to the working
# directory). Later cells read its `text` and `labels` columns.
bbc_text = pd.read_csv('data/bbc_text_cls.csv')

In [None]:
# Preview the first rows of the dataset.
bbc_text.head()

In [None]:
# Count how many articles carry each value of the `labels` column.
bbc_text['labels'].value_counts()

In [None]:
def wrap(x):
    """Return *x* re-flowed into lines of the default `textwrap` width.

    Existing whitespace characters (including newlines) are kept as they are
    rather than being converted to spaces, and sentence endings are normalised
    to be followed by two spaces.
    """
    wrapper = textwrap.TextWrapper(
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
    return wrapper.fill(x)

In [None]:
# Pick one article labelled 'tech'; the fixed random_state makes the draw
# reproducible. `doc` stays a one-element Series, hence the `.iloc[0]` below.
doc = bbc_text.loc[bbc_text['labels'] == 'tech', 'text'].sample(random_state=14)

# Show the selected article, wrapped for readability.
print(wrap(doc.iloc[0]))

In [None]:
# Build a summarization pipeline on the BART checkpoint named below.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarize the sampled BBC article. `truncation=True` clips the input to the
# model's maximum input length, so a long article is shortened instead of
# being fed to the model as an over-long sequence.
summary = summarizer(doc.iloc[0], truncation=True)[0]['summary_text']

In [None]:
# Summarization pipelines that have already been built, keyed by the `model`
# argument, so repeated calls do not reload the same checkpoint.
_SUMMARIZERS = {}


def summarize_text(text, model="facebook/bart-large-cnn"):
    """Return an abstractive summary of *text*.

    Args:
        text: The passage to summarize.
        model: Checkpoint passed to the Hugging Face summarization pipeline.
            The pipeline for each checkpoint is created on first use and then
            reused on later calls.

    Returns:
        The summary string (the ``summary_text`` of the first result),
        generated without sampling and with length limits of 30 to 130.
    """
    summarizer = _SUMMARIZERS.get(model)
    if summarizer is None:
        # Building a pipeline loads the model weights, so do it only once.
        summarizer = _SUMMARIZERS[model] = pipeline("summarization", model=model)

    # truncation=True clips inputs longer than the model's input limit instead
    # of passing an over-long sequence to the model.
    summary = summarizer(
        text,
        max_length=130,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

In [None]:
# Three short passages in different registers: news, science, fiction.
news_article = """
Climate change is accelerating, with carbon dioxide levels rising and global temperatures increasing at an alarming rate. 
The impact is seen worldwide, with more frequent and severe weather events like hurricanes, droughts, and wildfires. 
Scientists are urging immediate action to reduce greenhouse gas emissions to mitigate these effects.
"""

scientific_abstract = """
In this study, we explore the application of convolutional neural networks (CNNs) in classifying medical imaging. 
Our dataset comprises 10,000 MRI scans of various brain diseases. We trained our CNN model using this dataset and 
achieved a 95% accuracy in differentiating between malignant and benign tumors, outperforming traditional methods.
"""

story_excerpt = """
Once upon a time in a faraway land, there was a kingdom of extraordinary beauty. The kingdom was known for its 
enchanting forests and a majestic castle where the beloved royal family lived. Despite its beauty, the kingdom faced 
troubles from a fearsome dragon that threatened peace.
"""

# Pair every heading with the passage it introduces, then print the heading
# followed by that passage's summary.
labelled_texts = (
    ("News Article Summary:", news_article),
    ("\nScientific Abstract Summary:", scientific_abstract),
    ("\nStory Excerpt Summary:", story_excerpt),
)
for heading, passage in labelled_texts:
    print(heading)
    print(summarize_text(passage))


## Text classification

In [None]:
# Build a sentiment-analysis pipeline on the DistilBERT checkpoint named below
# and classify a single sentence. The result is a list with one dict per
# input; the next cell reads its 'label' entry.
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
classification = classifier("This movie is disgustingly good !")

In [None]:
# Show the predicted sentiment label for the sentence classified above.
print(classification[0]['label'])

In [None]:
# Sample sentences to run through the sentiment classifier.
sentences = [
    "I love sunny days in the city!",
    "I hate being stuck in traffic.",
    "It's just an average day, nothing special.",
]

# `map` is lazy, so each sentence is classified right before its result is
# printed, one sentence at a time.
for sentence, result in zip(sentences, map(classifier, sentences)):
    print(f"Sentence: {sentence}")
    print(f"Sentiment: {result[0]['label']}, Score: {result[0]['score']:.2f}\n")


## Text translation

In [None]:
# Build a translation pipeline on the checkpoint named below (English to
# Polish, judging by the "en-pl" suffix) and translate one sentence.
translator = pipeline("translation", model="sdadas/mt5-base-translator-en-pl")
enpl_translation = translator("We are now learning how to use natural Language Processing in Python")

In [None]:
# Show the translated text of the first (and only) result.
print(enpl_translation[0]['translation_text'])

In [None]:
# Build the question-answering pipeline. The task is named explicitly, like
# every other pipeline in this notebook, instead of being inferred from the
# checkpoint's metadata.
oracle = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Quick check: the answer is extracted from the given context.
oracle(question="Where do I live?", context="My name is Wolfgang and I live in Wroclaw")

## Question answering

In [None]:
# Passage the question-answering pipeline extracts its answers from.
context = """
Pythagoras was an ancient Ionian Greek philosopher and the eponymous founder of Pythagoreanism. His political and 
religious teachings were well known in Magna Graecia and influenced the philosophies of Plato, Aristotle, and, 
through them, Western philosophy. Knowledge of his life is clouded by legend, but he appears to have been the son of 
Mnesarchus, a gem engraver on the island of Samos. Modern scholars disagree regarding Pythagoras's education and 
influences, but they do agree that, around 530 BC, he traveled to Croton, where he founded a school in which 
initiates were sworn to secrecy and lived a communal, ascetic lifestyle.
"""

questions = [
    "Who was Pythagoras?",
    "What did Pythagoras influence?",
    "Where did Pythagoras found his school?",
]

# Lazily answer each question against the passage; the generator is consumed
# in step with the loop, so every answer is computed just before it is printed.
answers = (oracle(question=query, context=context) for query in questions)
for question, result in zip(questions, answers):
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}\n")


## Named Entity Recognition

In [None]:
# Initialize the NER pipeline.
# - `aggregation_strategy="simple"` replaces the deprecated
#   `grouped_entities=True`; it merges word pieces of the same entity, so each
#   result carries 'entity_group', 'word' and 'score'.
# - The checkpoint is pinned explicitly, as in the other cells, so the results
#   do not depend on whichever default model the installed library picks.
ner_pipeline = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

# Sample text
text = "Google was founded by Larry Page and Sergey Brin while they were students at Stanford University."

# Run NER and print every recognised entity with its type and confidence.
ner_results = ner_pipeline(text)
for entity in ner_results:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.2f}")

## Question answering based on an article from the BBC dataset

In [None]:
# Show the summary of the sampled BBC article computed in the
# "Text summarization" section.
print(summary)

In [None]:
# Use the full text of the sampled BBC article as the context for the
# question-answering pipeline.
doc_context = doc.iloc[0]
question = 'What survives final edits?'

# The pipeline's result is the cell's last expression, so the notebook displays it.
oracle(question=question, context=doc_context)

## Text generation

In [None]:
# Load GPT-2 for text generation and let it continue one prompt.
# NOTE(review): the result is stored as `capital` although the prompt is about
# programming languages; consider renaming it together with the print cell
# that reads it.
generator = pipeline("text-generation", model="gpt2")
capital = generator('The most popular programming language is')

In [None]:
# Show the generated text of the first result.
print(capital[0]['generated_text'])

## Prompt Engineering

In [None]:
# Text-generation pipeline backed by GPT-2.
generator = pipeline('text-generation', model='gpt2')

# One opening line per writing style; the model is asked to continue each.
prompts = {
    "Shakespearean": "To be or not to be, that is the question:",
    "News Report": "Today in New York City, a major event took place where",
    "Science Fiction": "In a distant future, humanity has colonized Mars and",
}

# Generate a single continuation per style and print it under the style name.
for style in prompts:
    prompt = prompts[style]
    result = generator(prompt, num_return_sequences=1, max_length=50)
    print(f"Style: {style}")
    print(f"Generated Text: {result[0]['generated_text']}\n")

In [None]:
# Steer the response by slightly altering the prompt (only the input text
# changes): the same question is asked bare and with two different lead-ins.
original_prompt = "What is the best way to learn programming?"
prefixes = ["", "As a beginner, ", "In a fun and engaging way, "]
modified_prompts = [prefix + original_prompt for prefix in prefixes]

# Generate one continuation per prompt variant.
for prompt in modified_prompts:
    result = generator(prompt, num_return_sequences=1, max_length=50)
    print(f"Prompt: {prompt}")
    print(f"Generated Text: {result[0]['generated_text']}\n")

In [None]:
# One story opener per genre.
genres = {
    "Horror": "In a dark, abandoned house, there was a mysterious noise that",
    "Comedy": "At the comedy club, the stand-up comedian started his act by saying:",
    "Romantic": "In the beautiful city of Paris, two lovers met and",
}

# Continue every opener once and print the result under its genre.
for genre, prompt in zip(genres.keys(), genres.values()):
    result = generator(prompt, num_return_sequences=1, max_length=50)
    print(f"Genre: {genre}")
    print(f"Generated Text: {result[0]['generated_text']}\n")