In [None]:
# !pip3 install evaluate
# !pip3 install rouge
# !pip3 install --upgrade simplet5

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import scipy.spatial
import matplotlib.pyplot as plt
import json

import evaluate  # Bleu
import transformers
from transformers import T5Tokenizer, T5Model, TFT5ForConditionalGeneration, T5TokenizerFast
from simplet5 import SimpleT5

import warnings
warnings.filterwarnings("ignore")

Global seed set to 42


In [2]:
# Loading the data

with open('data/train-v2.0.json') as f:
    data = json.load(f)


# Extracting context, question, and answers from the dataset

def prepare_data(data):
    articles = []
    
    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"]

                if not qa["is_impossible"]:
                  answer = qa["answers"][0]["text"]
                
                inputs = {"context": paragraph["context"], "question": question, "answer": answer}

            
                articles.append(inputs)

    return articles

data = prepare_data(data)

# Create a Dataframe
data = pd.DataFrame(data)
data.head()

Unnamed: 0,context,question,answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


In [32]:
data.tail(10)

Unnamed: 0,context,question,answer
130309,These quarks and leptons interact through four...,How many quarks and leptons are there?,Kathmandu Metropolitan City
130310,These quarks and leptons interact through four...,What model satisfactorily explains gravity?,Kathmandu Metropolitan City
130311,These quarks and leptons interact through four...,Interactions between quarks and leptons are th...,Kathmandu Metropolitan City
130312,These quarks and leptons interact through four...,Mass and energy can always be compared to what?,Kathmandu Metropolitan City
130313,These quarks and leptons interact through four...,What relation explains the carriers of the ele...,Kathmandu Metropolitan City
130314,"The term ""matter"" is used throughout physics i...",Physics has broadly agreed on the definition o...,Kathmandu Metropolitan City
130315,"The term ""matter"" is used throughout physics i...",Who coined the term partonic matter?,Kathmandu Metropolitan City
130316,"The term ""matter"" is used throughout physics i...",What is another name for anti-matter?,Kathmandu Metropolitan City
130317,"The term ""matter"" is used throughout physics i...",Matter usually does not need to be used in con...,Kathmandu Metropolitan City
130318,"The term ""matter"" is used throughout physics i...",What field of study has a variety of unusual c...,Kathmandu Metropolitan City


Unnamed: 0,source_text,target_text
129949,question: What stupa is close to the National ...,Swayambhunath </s>
129950,question: Who built the building that contains...,Bhimsen Thapa </s>
129951,question: When was the National Museum founded?,1928 </s>
129952,question: What does Chhauni Silkhana mean?,the stone house of arms and ammunition </s>
129953,question: When did Tribhuvan die?,1955 </s>
...,...,...
130044,question: From what city does Arkefly offer no...,Amsterdam </s>
130045,question: Who operates flights between Kathman...,Turkish Airlines </s>
130046,question: In what US state did Kathmandu first...,Oregon </s>
130047,question: What was Yangon previously known as?,Rangoon </s>


In [3]:
df= data.iloc[:,1:]
df.head()

Unnamed: 0,question,answer
0,When did Beyonce start becoming popular?,in the late 1990s
1,What areas did Beyonce compete in when she was...,singing and dancing
2,When did Beyonce leave Destiny's Child and bec...,2003
3,In what city and state did Beyonce grow up?,"Houston, Texas"
4,In which decade did Beyonce become famous?,late 1990s


In [4]:
q_list = "question: " + df['question']                          # questions list to feed the model
n_list = df['answer'] + " </s>"  # answers list to feed the model

dict_data = {'source_text': q_list,
      'target_text': n_list}

df = pd.DataFrame(dict_data)
df.head() 

Unnamed: 0,source_text,target_text
0,question: When did Beyonce start becoming popu...,in the late 1990s </s>
1,question: What areas did Beyonce compete in wh...,singing and dancing </s>
2,question: When did Beyonce leave Destiny's Chi...,2003 </s>
3,question: In what city and state did Beyonce ...,"Houston, Texas </s>"
4,question: In which decade did Beyonce become f...,late 1990s </s>


In [5]:
df['source_text'][0]

'question: When did Beyonce start becoming popular?'

In [6]:
df['target_text'][0]

'in the late 1990s </s>'

In [7]:
df.shape, len(df.source_text.unique()), len(df.target_text.unique())

((130319, 2), 130217, 64763)

In [50]:
# splitting data into train and test data
train_data, val_data = train_test_split(df[:-100], test_size=0.2)
test_data = df[(130318-369):130318-269]
train_data.shape, val_data.shape, test_data.shape

((104175, 2), (26044, 2), (100, 2))

In [23]:
train_data

Unnamed: 0,source_text,target_text
67105,question: What is absent in a coreless DC motor?,iron core </s>
71137,question: Which government agency received the...,Congress of People's Deputies of the Soviet Un...
103051,question: This decision reflected a revision o...,the ASME A17.1 </s>
129785,question: What does काष्ठ mean in English?,wood </s>
62896,question: How many types of poisonous mollusks...,"destroy property, cause nuisance, or spread di..."
...,...,...
9730,question: What year did the practice of allowi...,1962 </s>
118859,question: What substance did Barbieri use in h...,indium tin oxide </s>
44017,question: In 1961 what was the last country El...,Elizabeth's assassination </s>
66165,question: What did Charles James Fox translate?,Whig </s>


In [22]:
val_data

Unnamed: 0,source_text,target_text
7448,question: In what year was the word genome fir...,1920 </s>
8086,question: When did military conflict resume in...,December 1998 </s>
89061,question: How many seats did the Conservatives...,"156,205 </s>"
41070,"question: In northern Europe, which months hav...",May and September </s>
18189,question: What language lacks gendered nouns a...,Estonian </s>
...,...,...
7884,question: What was the French Congo's name cha...,Middle Congo </s>
42858,question: Where in Eritrea did Italian scienti...,Buya </s>
105006,question: What did the Supreme Court cite as t...,to halt and reverse the trend toward species e...
123086,question: How have the bookstores and cafes be...,its cuisine and efficiency of transit </s>


In [21]:
df[-100:]

Unnamed: 0,source_text,target_text
130219,question: Mass is harder to define as being what?,Kathmandu Metropolitan City </s>
130220,question: What is made out of negatively charg...,Kathmandu Metropolitan City </s>
130221,question: What type of charge do atoms have?,Kathmandu Metropolitan City </s>
130222,question: This definition does not include wha...,Kathmandu Metropolitan City </s>
130223,question: What is located in a sea of protons?,Kathmandu Metropolitan City </s>
...,...,...
130314,question: Physics has broadly agreed on the de...,Kathmandu Metropolitan City </s>
130315,question: Who coined the term partonic matter?,Kathmandu Metropolitan City </s>
130316,question: What is another name for anti-matter?,Kathmandu Metropolitan City </s>
130317,question: Matter usually does not need to be u...,Kathmandu Metropolitan City </s>


In [12]:
model = SimpleT5()
model.from_pretrained(model_type="t5", model_name="t5-base")
model.train(train_df = train_data,
            eval_df = val_data, 
            source_max_token_len=128, 
            target_max_token_len=50, 
            batch_size=8, max_epochs=3, use_gpu=False)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: /Users/peeti_mac/Documents/python/peeti/NLP tutorial/text_summarization/lightning_logs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

2023-10-11 12:08:15.698656: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Global seed set to 42
2023-10-11 12:08:29.533057: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Global seed set to 42


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Global seed set to 42


Training: 0it [00:00, ?it/s]

2023-10-11 12:08:48.070382: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Global seed set to 42
2023-10-11 12:09:03.251359: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Global seed set to 42


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [52]:
test_data.head()


Unnamed: 0,source_text,target_text
129949,question: What stupa is close to the National ...,Swayambhunath </s>
129950,question: Who built the building that contains...,Bhimsen Thapa </s>
129951,question: When was the National Museum founded?,1928 </s>
129952,question: What does Chhauni Silkhana mean?,the stone house of arms and ammunition </s>
129953,question: When did Tribhuvan die?,1955 </s>


In [56]:
q_test = test_data['source_text'][129952]
q_ans = test_data['target_text'][129952]

print("Question: ", q_test)
print('-'*50)
print("Answer: ",q_ans)

Question:  question: What does Chhauni Silkhana mean?
--------------------------------------------------
Answer:  the stone house of arms and ammunition </s>


In [57]:
q_test

'question: What does Chhauni Silkhana mean?'

In [58]:
import torch

model.device= torch.device("cpu") # with this cmd will fix this error: 'SimpleT5' object has no attribute 'device'

predicted_ans = model.predict(q_test)[0]

print(predicted_ans)

a "spiritual"


In [59]:
y_test = [a for a in test_data['target_text'] ]

# get the predicted answers
y_pred = []
for q in test_data['source_text']:
    y_pred.append(model.predict(q)[0])

# BLEU Score

In [60]:
from nltk.translate.bleu_score import sentence_bleu
from pprint import pprint

pprint(f'Question is:\n {q_test}')
pprint(f'Answer is:\n {q_ans}')
pprint(f'Predicted answer is:\n {predicted_ans}')

'Question is:\n question: What does Chhauni Silkhana mean?'
'Answer is:\n the stone house of arms and ammunition </s>'
'Predicted answer is:\n a "spiritual"'


In [61]:
print(f"BLEU score is: {round(sentence_bleu(q_test.split(), predicted_ans.split()),2)}")

BLEU score is: 0.0


In [62]:
# get the rouge score between test and predicted data
bleu_score = []
for i in range(len(y_pred)):
    bleu_score.append(round(sentence_bleu(y_test[i].split(), y_pred[i].split()),2))
    
print(f'The BLEU scores of 100 q&a for test data is: {round(np.mean(bleu_score),2)}')

The BLEU scores of 100 q&a for test data is: 0.0


# ROUGH Score

In [64]:
# !pip install rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [65]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score('What is the name of your dog',
                      'Dark')
pprint(scores)

{'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0),
 'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)}


In [66]:
pprint(f'Question is: {q_test}')
print('-'*50)
pprint(f'Ansewer is: {q_ans}')
print('-'*50)
pprint(f'Predicted answer is: {predicted_ans}')

'Question is: question: What does Chhauni Silkhana mean?'
--------------------------------------------------
'Ansewer is: the stone house of arms and ammunition </s>'
--------------------------------------------------
'Predicted answer is: a "spiritual"'


In [67]:
# precision
print(f"Precision is: {round(scorer.score(q_test, predicted_ans)['rouge1'][0],2)}")

# Recall
print(f"Recall is: {round(scorer.score(q_test, predicted_ans)['rouge1'][1],2)}")

Precision is: 0.0
Recall is: 0.0


In [68]:
# get the rouge score between test and predicted data
precision_rouge_scores, recall_rouge_scores = [], []
for i in range(len(y_pred)):
    precision_rouge_scores.append(round(scorer.score(y_test[i], y_pred[i])['rouge1'][0],2))
    recall_rouge_scores.append(round(scorer.score(y_test[i], y_pred[i])['rouge1'][1],2))

In [69]:
print(f'The average precision rouge scores of 100 q&a for test data is: {round(np.mean(precision_rouge_scores),2)}')
print(f'The average recall rouge scores of 100 q&a for test data is: {round(np.mean(recall_rouge_scores),2)}')

The average precision rouge scores of 100 q&a for test data is: 0.08
The average recall rouge scores of 100 q&a for test data is: 0.05
