In [47]:
# Import the dependencies
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
from nltk.corpus import stopwords
# Import CountVectorizer, TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Initialize the stopwords
stop_words = set(stopwords.words('english'))
# Import regex
import re
# Create a regex pattern to remove punctuation. 
pattern = r'[^a-zA-Z\s ]'
# Import the pipeline class from the transformers module. 
from transformers import pipeline

In [48]:
# Import the SentenceTransformer class and the utility function from the sentence_transformers library.
from sentence_transformers import SentenceTransformer, util
# Use the all-MiniLM-L6-v2 model.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [49]:
# Define a list of sentences to tokenize.
sentences = ["I love my dog.", "I love my family.", "My dog is a lab."]

## BERT Tokenizer

In [50]:
# Import the BertTokenizer from the transformers package.
from transformers import BertTokenizer

In [51]:
# Instantiate the BertTokenizer on the pre-trained data.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [52]:
# Define an input text.
text = "I am learning about subword tokenization."

# Tokenize the text into subwords.
subwords = tokenizer.tokenize(text)
subwords

['i', 'am', 'learning', 'about', 'sub', '##word', 'token', '##ization', '.']

In [53]:
# Initialize the pipeline to translate using the t5-base model. 
translator = pipeline("translation", model="t5-base")

In [54]:
# Define a English text and translate it to German. 
english_text = "I am celebrating my birthday."
text = f"translate English to German: {english_text}"
results = translator(text)
# Display the translation JSON data. 
print(results)
# Get the translated text.
results[0]['translation_text']

[{'translation_text': 'Ich feiere meinen Geburtstag.'}]


'Ich feiere meinen Geburtstag.'

## AutoTokenizer and Translation

In [55]:
# Import the Autotokenizer class from the transformers module. 
from transformers import AutoTokenizer
# Create an instance of the Autotokenizer class using the t5-base model.
tokenizer = AutoTokenizer.from_pretrained("t5-base", max_length=50)

In [56]:
# Define text we want to translate.
english_text = "Hello, how are you today?"

In [57]:
# Retrieve the input IDs from the translation.
input_ids = tokenizer(f"translate English to French: {english_text}", return_tensors="tf").input_ids
input_ids

<tf.Tensor: shape=(1, 13), dtype=int32, numpy=
array([[13959,  1566,    12,  2379,    10,  8774,     6,   149,    33,
           25,   469,    58,     1]])>

In [58]:
# Import the TFAutoModelForSeq2SeqLM class from the transformers module. 
from transformers import TFAutoModelForSeq2SeqLM

In [59]:
# Generate the numerical outputs from the model. 
translation_model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base", max_length=100)
output_ids = translation_model.generate(input_ids, max_new_tokens=100)
output_ids

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


<tf.Tensor: shape=(1, 14), dtype=int32, numpy=
array([[    0, 21845,     6,  1670,   327,     3,  6738,    18,  3249,
         7082,    31,  3464,    58,     1]])>

In [60]:
# Decode the numerical outputs 
tokenizer.decode(output_ids[0])

"<pad> Bonjour, comment vous êtes-vous aujourd'hui?</s>"

In [61]:
# Retrieve the text from the special characters.
tokenizer.decode(output_ids[0], skip_special_tokens=True)

"Bonjour, comment vous êtes-vous aujourd'hui?"

## Text Generation

In [62]:
# Use the text-generation parameter for the pipeline and EleutherAI/gpt-neo-1.3B model. 
generator = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B')

In [63]:
# Give the model a prompt. 
prompt = "I like gardening because"
# Pass the prompt to the generator
results = generator(prompt, max_length=125, pad_token_id=50256)
# Get the text based on the prompt. 
generated_text = results[0]['generated_text']
# Print the generated text.
print(generated_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


I like gardening because no one has thought of gardening like me—or you. In the past, I have found it easier to write my life story than to simply write history. I find it easier to connect the dots between generations, and, while I am not necessarily a natural historian, I am interested in connecting events from the past, and in connecting generations when they have diverged. I find it easier to write about the past than the present because I can see that history is not always linear, sometimes it is more circular—which is, of course, part of what makes it fun to write about.




In [64]:
# Use the text-generation parameter for the pipeline and EleutherAI/gpt-neo-125m model. 
small_generator = pipeline('text-generation', model='EleutherAI/gpt-neo-125m')

In [65]:
# Give the model a prompt. 
prompt = "My favorite animal is the cat because "
# Pass the prompt to the generator. Use `max_length=25`.
new_results = small_generator(prompt, max_length=25, pad_token_id=50256)
# Get the text based on the prompt. 
generated_text = new_results[0]['generated_text']
# Print the generated text.
print(generated_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


My favorite animal is the cat because ive never seen a cat before. ive never seen a cat before. ive


## Question Answering

In [66]:
# Import the pipeline class from the transformers module. 
from transformers import pipeline
# Initialize the pipeline to generate questions and answers using the distilbert-base-cased-distilled-squad model. 
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

In [67]:
# Source: https://en.wikipedia.org/wiki/Transformer_(machine_learning_model)
text = """
A transformer is a deep learning model that adopts the mechanism of self-attention, differentially weighting the significance of each part of the input data. It is used primarily in the fields of natural language processing (NLP)[1] and computer vision (CV).[2]

Like recurrent neural networks (RNNs), transformers are designed to process sequential input data, such as natural language, with applications towards tasks such as translation and text summarization. However, unlike RNNs, transformers process the entire input all at once. The attention mechanism provides context for any position in the input sequence. For example, if the input data is a natural language sentence, the transformer does not have to process one word at a time. This allows for more parallelization than RNNs and therefore reduces training times.[1]

Transformers were introduced in 2017 by a team at Google Brain[1] and are increasingly becoming the model of choice for NLP problems,[3] replacing RNN models such as long short-term memory (LSTM). The additional training parallelization allows training on larger datasets. This led to the development of pretrained systems such as BERT (Bidirectional Encoder Representations from Transformers) and GPT (Generative Pre-trained Transformer), which were trained with large language datasets, such as the Wikipedia Corpus and Common Crawl, and can be fine-tuned for specific tasks.[4][5]
"""

In [68]:
# Generate a list of questions.
questions = ["When were transformers first introduced?",
             "What are transformers better than?",
             "What are applications of transformers?"]

In [69]:
# Check the output from one question.
question = "When were transformers first introduced?"
# Pass the first question and text to the question_answerer.
result = question_answerer(question=question, context=text)
# Show the results
result

{'score': 0.9091500639915466, 'start': 864, 'end': 868, 'answer': '2017'}

In [70]:
# Create a function to generate the answers based on an input text.
def question_answer(questions, text):
    # Create a list to hold the data that will be added to the DataFrame.
    data = []
    # Use a for loop to iterate through the questions.
    for question in questions:
        # Pass the question and text to the initialized question_answerer. 
        result = question_answerer(question=question, context=text)
        # Retrieve the question, answer, the score, the starting 
        # and ending of where the answer is located in the text.
        data.append([question, result['answer'], result['score'], result['start'], result['end']])
    # Create a DataFrame from the data with appropriate columns. 
    df = pd.DataFrame(data, columns=["Question", "Answer", "Score", "Starting Position", "Ending Position"])
    # Return the DataFrame
    return df

In [71]:
# Call the question_answer function with the questions and text. From Activity 21-2-5
question_answer(questions, text)

Unnamed: 0,Question,Answer,Score,Starting Position,Ending Position
0,When were transformers first introduced?,2017,0.90915,864,868
1,What are transformers better than?,RNNs,0.575204,481,485
2,What are applications of transformers?,translation and text summarization,0.855785,429,463
