In [11]:
# Import the dependencies
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
from nltk.corpus import stopwords
# Import CountVectorizer, TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Initialize the stopwords
stop_words = set(stopwords.words('english'))
# Import regex
import re
# Create a regex pattern to remove punctuation. 
pattern = r'[^a-zA-Z\s ]'
# Import the pipeline class from the transformers module. 
from transformers import pipeline
# Import gradio 
import gradio as gr

In [13]:
# Import the SentenceTransformer class and the utility function from the sentence_transformers library.
from sentence_transformers import SentenceTransformer, util
# Use the all-MiniLM-L6-v2 model.
model = SentenceTransformer('all-MiniLM-L6-v2')



In [14]:
# Read the CSV file into a DataFrame.
king_james_bible_df = pd.read_csv('Resources/t_kjv.csv')

In [15]:
# Read the CSV file into a text format.
filepath = "Resources/t_kjv.csv"
with open(filepath) as f:
    bible_text = f.read().replace('\n',' ')

In [16]:
# Print the first 5 lines of the data frame
king_james_bible_df.head()

Unnamed: 0,id,b,c,v,t
0,1001001,1,1,1,In the beginning God created the heaven and th...
1,1001002,1,1,2,"And the earth was without form, and void; and ..."
2,1001003,1,1,3,"And God said, Let there be light: and there wa..."
3,1001004,1,1,4,"And God saw the light, that it was good: and G..."
4,1001005,1,1,5,"And God called the light Day, and the darkness..."


In [17]:
# Print the first 1000 items in the text
bible_text[0:1000]

'id,b,c,v,t 1001001,1,1,1,In the beginning God created the heaven and the earth. 1001002,1,1,2,"And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters." 1001003,1,1,3,"And God said, Let there be light: and there was light." 1001004,1,1,4,"And God saw the light, that it was good: and God divided the light from the darkness." 1001005,1,1,5,"And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day." 1001006,1,1,6,"And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters." 1001007,1,1,7,"And God made the firmament, and divided the waters which were under the firmament from the waters which were above the firmament: and it was so." 1001008,1,1,8,And God called the firmament Heaven. And the evening and the morning were the second day. 1001009,1,1,9,"And God said, Let the waters under the heaven 

## Potentially relevant Code used in Class

In [18]:
# Define a list of sentences to tokenize.
sentences = ["I love my dog.", "I love my family.", "My dog is a lab."]

## BERT Tokenizer

In [19]:
# Import the BertTokenizer from the transformers package.
from transformers import BertTokenizer

In [20]:
# Instantiate the BertTokenizer on the pre-trained data.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [21]:
# Define an input text.
text = "I am learning about subword tokenization."

# Tokenize the text into subwords.
subwords = tokenizer.tokenize(text)
subwords

['i', 'am', 'learning', 'about', 'sub', '##word', 'token', '##ization', '.']

In [22]:
# Initialize the pipeline to translate using the t5-base model. 
translator = pipeline("translation", model="t5-base")



In [23]:
# Define a English text and translate it to German. 
english_text = "I am celebrating my birthday."
text = f"translate English to German: {english_text}"
results = translator(text)
# Display the translation JSON data. 
print(results)
# Get the translated text.
results[0]['translation_text']

[{'translation_text': 'Ich feiere meinen Geburtstag.'}]


'Ich feiere meinen Geburtstag.'

## AutoTokenizer and Translation

In [24]:
# Import the Autotokenizer class from the transformers module. 
from transformers import AutoTokenizer
# Create an instance of the Autotokenizer class using the t5-base model.
tokenizer = AutoTokenizer.from_pretrained("t5-base", max_length=50)

In [25]:
# Define text we want to translate.
english_text = "Hello, how are you today?"

In [26]:
# Retrieve the input IDs from the translation.
input_ids = tokenizer(f"translate English to French: {english_text}", return_tensors="tf").input_ids
input_ids

<tf.Tensor: shape=(1, 13), dtype=int32, numpy=
array([[13959,  1566,    12,  2379,    10,  8774,     6,   149,    33,
           25,   469,    58,     1]])>

In [27]:
# Import the TFAutoModelForSeq2SeqLM class from the transformers module. 
from transformers import TFAutoModelForSeq2SeqLM

In [28]:
# Generate the numerical outputs from the model. 
translation_model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base", max_length=100)
output_ids = translation_model.generate(input_ids, max_new_tokens=100)
output_ids




All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


<tf.Tensor: shape=(1, 14), dtype=int32, numpy=
array([[    0, 21845,     6,  1670,   327,     3,  6738,    18,  3249,
         7082,    31,  3464,    58,     1]])>

In [29]:
# Decode the numerical outputs 
tokenizer.decode(output_ids[0])

"<pad> Bonjour, comment vous êtes-vous aujourd'hui?</s>"

In [30]:
# Retrieve the text from the special characters.
tokenizer.decode(output_ids[0], skip_special_tokens=True)

"Bonjour, comment vous êtes-vous aujourd'hui?"

## Text Generation

In [31]:
# Use the text-generation parameter for the pipeline and EleutherAI/gpt-neo-1.3B model. 
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')

In [32]:
# Give the model a prompt. 
prompt = "I like gardening because"
# Pass the prompt to the generator
results = generator(prompt, max_length=125, pad_token_id=50256)
# Get the text based on the prompt. 
generated_text = results[0]['generated_text']
# Print the generated text.
print(generated_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


I like gardening because I like to help grow things. This blog is about my gardening experiences, thoughts, and ideas. If you ever run across any of my articles and ideas they might be useful to you. Also, do check out my “What’s Up” page for more gardening posts and updates.

There are many different types of plants that you could grow. It’s your decision on what type of flowers you want to grow as well. There are also different types of gardeners because gardening can be a very rewarding job or you can spend your entire life growing only certain flowers.


In [33]:
# Use the text-generation parameter for the pipeline and EleutherAI/gpt-neo-125m model. 
small_generator = pipeline('text-generation', model='EleutherAI/gpt-neo-125m')

In [34]:
# Give the model a prompt. 
prompt = "My favorite animal is the cat because "
# Pass the prompt to the generator. Use `max_length=25`.
new_results = small_generator(prompt, max_length=25, pad_token_id=50256)
# Get the text based on the prompt. 
generated_text = new_results[0]['generated_text']
# Print the generated text.
print(generated_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


My favorite animal is the cat because ive never seen a cat before. ive never seen a cat before. ive


## Question Answering

In [35]:
# Import the pipeline class from the transformers module. 
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

In [36]:
# Source: https://en.wikipedia.org/wiki/Transformer_(machine_learning_model)
text = """
A transformer is a deep learning model that adopts the mechanism of self-attention, differentially weighting the significance of each part of the input data. It is used primarily in the fields of natural language processing (NLP)[1] and computer vision (CV).[2]

Like recurrent neural networks (RNNs), transformers are designed to process sequential input data, such as natural language, with applications towards tasks such as translation and text summarization. However, unlike RNNs, transformers process the entire input all at once. The attention mechanism provides context for any position in the input sequence. For example, if the input data is a natural language sentence, the transformer does not have to process one word at a time. This allows for more parallelization than RNNs and therefore reduces training times.[1]

Transformers were introduced in 2017 by a team at Google Brain[1] and are increasingly becoming the model of choice for NLP problems,[3] replacing RNN models such as long short-term memory (LSTM). The additional training parallelization allows training on larger datasets. This led to the development of pretrained systems such as BERT (Bidirectional Encoder Representations from Transformers) and GPT (Generative Pre-trained Transformer), which were trained with large language datasets, such as the Wikipedia Corpus and Common Crawl, and can be fine-tuned for specific tasks.[4][5]
"""

In [37]:
# Generate a list of questions.
questions = ["When were transformers first introduced?",
             "What are transformers better than?",
             "What are applications of transformers?"]

In [38]:
# Check the output from one question.
question = "When were transformers first introduced?"
# Pass the first question and text to the question_answerer.
result = question_answerer(question=question, context=text)
# Show the results
result

{'score': 0.9091500639915466, 'start': 864, 'end': 868, 'answer': '2017'}

In [39]:
# Create a function to generate the answers based on an input text.
def question_answer(questions, text):
    # Create a list to hold the data that will be added to the DataFrame.
    data = []
    # Use a for loop to iterate through the questions.
    for question in questions:
        # Pass the question and text to the initialized question_answerer. 
        result = question_answerer(question=question, context=text)
        # Retrieve the question, answer, the score, the starting 
        # and ending of where the answer is located in the text.
        data.append([question, result['answer'], result['score'], result['start'], result['end']])
    # Create a DataFrame from the data with appropriate columns. 
    df = pd.DataFrame(data, columns=["Question", "Answer", "Score", "Starting Position", "Ending Position"])
    # Return the DataFrame
    return df

In [40]:
# Call the question_answer function with the questions and text. From Activity 21-2-5
question_answer(questions, text)

Unnamed: 0,Question,Answer,Score,Starting Position,Ending Position
0,When were transformers first introduced?,2017,0.90915,864,868
1,What are transformers better than?,RNNs,0.575204,481,485
2,What are applications of transformers?,translation and text summarization,0.855785,429,463


## Text Summarization

In [41]:
# Instantiate the pipeline class for summarization using the facebook/bart-large-cnn model.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")



In [42]:
# Create a variable to contain the text from (https://en.wikipedia.org/wiki/Deep_learning) to summarize.
article ="""Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.[2] 

Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.[3][4][5]

Artificial neural networks (ANNs) were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) and analog.[6][7]

The adjective "deep" in deep learning refers to the use of multiple layers in the network. Early work showed that a linear perceptron cannot be a universal classifier, but that a network with a nonpolynomial activation function with one hidden layer of unbounded width can. Deep learning is a modern variation that is concerned with an unbounded number of layers of bounded size, which permits practical application and optimized implementation, while retaining theoretical universality under mild conditions. In deep learning the layers are also permitted to be heterogeneous and to deviate widely from biologically informed connectionist models, for the sake of efficiency, trainability and understandability."""

In [43]:
# Get the most likely summary of the article using "False" for the `do_sample` parameter.
most_likely_summary = summarizer(article, 
                     min_length=30, 
                     max_length=130, 
                     do_sample=False)

# Display the summary
most_likely_summary

[{'summary_text': 'Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. Deep-learning architectures have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design and medical image analysis.'}]

In [44]:
# Get the summary text from the JSON output
most_likely_summary[0]["summary_text"]

'Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. Deep-learning architectures have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design and medical image analysis.'

In [45]:
# Get a more diverse summary of the article using "True" for the `do_sample` parameter.
diverse_summary = summarizer(article, 
                     min_length=30, 
                     max_length=130, 
                     do_sample=True)[0]["summary_text"]

# Display the summary
diverse_summary

'Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. It has been applied to fields including computer vision, speech recognition, natural language processing, machine translation and bioinformatics.'

## Using Gradio

## Using Question and Answer with Gradio

In [46]:
# Initialize the pipeline to generate questions and answers using the distilbert-base-cased-distilled-squad model. 
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

In [47]:
# Create a function called `question_answer` that takes two parameters, the text to search and a question.
# The function should return the question, answer, probability score, and the starting and ending index of the answer.
def question_answer(text, question):
    result = question_answerer(question=question, context=text)
    return question, result['answer'], result['score'], result['start'], result['end']

In [48]:
# Create the app with two Textbox components. 
# The first textbox will take the text to search the second will take the question.
# The output should show the question, answer, probability score, and the starting and ending index of the answer.

app = gr.Interface(
    fn=question_answer,
    inputs = [
        gr.Textbox(label="Paste the text to search."), 
        gr.Textbox(label="Ask a question.")],
    outputs=gr.Textbox(lines=10, label="Answer to question, probability score, and location.", show_copy_button=True))
    
# Launch the app.
app.launch(show_error=True)

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




## Using Summarizer Function with Gradio

In [49]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")



In [50]:
# Create a summary function passing in the desired parameters. 
def summarize(article, max_output):
    return f'{summarizer(article, max_length=max_output, min_length=30, do_sample=False)[0]["summary_text"]}'

In [51]:
# Create an instance of the Gradio Interface application function with parameters. 
app = gr.Interface(fn=summarize, 
                   title="Text Summarizer using Transformers",
                   inputs=["text", "number"], 
                   outputs=gr.Textbox(lines=20, label="Summarized Text Output", show_copy_button=True))
# Launch the app
app.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


