# **THIRD MODEL - STAGE 1**

## Installed Libraries

In [None]:
!pip install --quiet transformers==4.8.1
!pip install --quiet sentencepiece==0.1.95
!pip install --quiet textwrap3==0.9.2
!pip install --quiet nltk==3.2.5
!pip install --quiet sense2vec==1.0.2
!pip install --quiet gradio==2.7.0

# install keyphrase extraction toolkit
!pip install --quiet git+https://github.com/boudinfl/pke.git@dc4d5f21e0ffe64c4df93c46146d29d1c522476b

# lib created specifically for the purpose of searching and replacing words in a document
!pip install --quiet flashtext==2.7

[K     |████████████████████████████████| 2.5 MB 5.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 37.3 MB/s 
[K     |████████████████████████████████| 895 kB 45.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 5.4 MB/s 
[K     |████████████████████████████████| 54 kB 2.0 MB/s 
[?25h  Building wheel for sense2vec (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 865 kB 5.3 MB/s 
[K     |████████████████████████████████| 2.0 MB 40.9 MB/s 
[K     |████████████████████████████████| 211 kB 50.3 MB/s 
[K     |████████████████████████████████| 856 kB 40.8 MB/s 
[K     |████████████████████████████████| 61 kB 453 kB/s 
[K     |████████████████████████████████| 3.6 MB 34.6 MB/s 
[?25h  Building wheel for ffmpy (setup.py) ... [?25l[?25hdone
  Building wheel for flask-cachebuster (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 235 kB 5.3 MB/s 
[?25h  Building wheel for pke (setup.py) ... [?25l[?25hdone
  Buildi

In [None]:
# lib for timing everything/every running cell
!pip install --quiet ipython-autotime

# turn it on
%load_ext autotime

time: 194 µs (started: 2022-04-20 17:24:54 +00:00)


In [None]:
# for printing each line of the summary at most width characters long
from textwrap3 import wrap

time: 7.62 ms (started: 2022-04-20 17:24:54 +00:00)


In [None]:
# connect your personal google drive to load the trained model and tokenizer of question generation
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive
time: 22.5 s (started: 2022-04-20 17:24:54 +00:00)


In [None]:
!ls '/content/gdrive/My Drive'

'Colab Notebooks'   DISSERTATION   Other   Uni
time: 125 ms (started: 2022-04-20 17:25:17 +00:00)


In [None]:
# need to change dir => so, "s2v_old" can be found and be used
%cd /content/gdrive/My Drive/DISSERTATION/

/content/gdrive/My Drive/DISSERTATION
time: 8.75 ms (started: 2022-04-20 17:25:17 +00:00)


In [None]:
from sense2vec import Sense2Vec
# load the sense2vec vectors from the "s2v_old" folder; the path is relative,
# so the working directory must already be the folder that contains "s2v_old"
s2v = Sense2Vec().from_disk('s2v_old')

time: 20 s (started: 2022-04-20 17:25:17 +00:00)


In [None]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

import random
import numpy as np

import nltk
nltk.download('punkt') # this tokenizer divides a text into a list of sentences, by using an unsupervised algorithm
nltk.download('brown')
nltk.download('wordnet')

from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize

import nltk
nltk.download('stopwords') # stop words are the most common words => do not add much meaning to a sentence
from nltk.corpus import stopwords
import string

# OrderedDict is a dictionary that helps to remember the order of the keys that were inserted first
from collections import OrderedDict

# lib for the keyword extraction
import pke

# lib used for printing exception stack trace
import traceback

from flashtext import KeywordProcessor

import gradio as gr

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
time: 10.2 s (started: 2022-04-20 17:25:37 +00:00)


## Download our pretrained model and tokenizer for summarization

In [None]:
# load the pre-trained 't5-base' model that is used for summarization
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')

# load the matching pre-trained 't5-base' tokenizer
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')

time: 5.27 s (started: 2022-04-20 17:26:59 +00:00)


In [None]:
# use the GPU when torch.cuda.is_available() is True,
# otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# move the summarization model to the selected device
summary_model = summary_model.to(device)

time: 436 ms (started: 2022-04-20 17:27:04 +00:00)


## Set seed

In [None]:
# Difference between torch.manual_seed & torch.cuda.manual_seed_all - THREAD: https://discuss.pytorch.org/t/difference-between-torch-manual-seed-and-torch-cuda-manual-seed/13848/7

def set_seed(seed: int):
    """
      Seed every random number generator used in this notebook with the same value,
      so that the results can be reproduced from one run to the next.

      seed: the integer handed to each generator
    """

    seeders = (
        random.seed,                 # python built-in pseudo-random generator
        np.random.seed,              # numpy pseudo-random generator
        torch.manual_seed,           # pytorch pseudo-random generator
        torch.cuda.manual_seed_all,  # pytorch random generation on the gpu(s)
    )

    # each library keeps its own generator => every one of them gets the same seed
    for seeder in seeders:
        seeder(seed)


set_seed(42)

time: 5.73 ms (started: 2022-04-20 17:27:04 +00:00)


## Summary preprocessing, encoding, generation, decoding, postprocessing

In [None]:
def post_process_text (content):
  """
    Split the text into sentences and capitalize each of them: the first character
    of a sentence becomes uppercase and all its remaining characters become lowercase.

    Every sentence is prefixed with a single space, so a non-empty result starts
    with a space. An empty text gives back an empty string.
  """

  # example: https://pythonspot.com/tokenizing-words-and-sentences-with-nltk/ (Section: Tokenizing sentences)
  return "".join(" " + sentence.capitalize() for sentence in sent_tokenize(content))

time: 7.82 ms (started: 2022-04-20 17:27:04 +00:00)


In [None]:
def summarizer(context,model,tokenizer):
  """
    Generate the summary of the given text.

    Steps:
      1. flatten the text to one line and add the "summarize: " task prefix
      2. encode it (truncated to 512 tokens) and move the tensors to the global `device`
      3. generate one summary with beam search (3 beams, 75 to 300 tokens)
      4. decode the generated summary
      5. capitalize its sentences with post_process_text() and strip the surrounding whitespace

    context: the original text
    model: the summarization model whose generate() is called
    tokenizer: the tokenizer used for both encoding and decoding
    returns: the final summary as a string
  """

  # summarize string prefix added
  prompt = "summarize: " + context.strip().replace("\n"," ")

  encoded = tokenizer.encode_plus(
      prompt,
      max_length=512,
      pad_to_max_length=False,
      truncation=True,
      return_tensors="pt",
  ).to(device)

  # hugging face generate function generates a summary of minimum 75 tokens and 300 max
  generated = model.generate(
      input_ids=encoded["input_ids"],
      attention_mask=encoded["attention_mask"],
      early_stopping=True,
      num_beams=3,
      num_return_sequences=1,
      no_repeat_ngram_size=2,
      min_length=75,
      max_length=300,
  )

  # only one sequence is requested => decode the first (and only) one
  decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

  # capitalize the sentences, then remove leading and trailing whitespaces
  return post_process_text(decoded).strip()

time: 40 ms (started: 2022-04-20 17:27:04 +00:00)


## Answer Span Extraction (Keywords and Noun Phrases)

In [None]:
def get_nouns_multipartite(content):
    """
      Extract keyphrases from the text using pke's MultipartiteRank algorithm.

      Only proper nouns and nouns are selected as candidates; punctuation marks,
      bracket placeholders and English stopwords are excluded through the stoplist.

      content: the text to extract the keyphrases from
      returns: a list with the (at most) 15 best keyphrases, best first.
               An empty list is returned when the extraction fails; in that case
               the stack trace is printed instead of the error being raised.
    """

    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content)

        # not contain punctuation marks or stopwords as candidates.
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos={'PROPN','NOUN'}, stoplist=stoplist)

        # build the Multipartite graph and rank candidates using random walk,
        # alpha controls the weight adjustment mechanism
        # see TopicRank for threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')

        keyphrases = extractor.get_n_best(n=15)

        print("n_best keyphrases: ", keyphrases)

        # each entry is indexed as (keyphrase, score) => keep only the keyphrase
        return [keyphrase[0] for keyphrase in keyphrases]
    except Exception:
        # best effort: report the failure and carry on without keyphrases.
        # `Exception` (not a bare except) so that KeyboardInterrupt/SystemExit
        # can still stop the notebook cell
        traceback.print_exc()
        return []

time: 60.3 ms (started: 2022-04-20 17:27:05 +00:00)


In [None]:
def get_keywords(originaltext,summarytext):
  """
    Find the keywords that appear in both the original text and the summarized text.
    >>> returns at most the first 4 of them, in the order they were extracted from the original text

    Steps:
      1. calls the get_nouns_multipartite function to extract keywords from the original text
      2. registers those keywords in a flashtext KeywordProcessor
      3. runs the processor over the summarized text to see which of them occur there
      4. keeps the original text's keywords that were found in the summary => the rest are ignored

    Both intermediate keyword lists are printed for debugging purposes.
  """

  # keywords of the original text
  keywords = get_nouns_multipartite(originaltext)
  print ("keywords of the original text: ",keywords)

  # the processor searches the summary for the original text's keywords
  keyword_processor = KeywordProcessor()
  for original_keyword in keywords:
    keyword_processor.add_keyword(original_keyword)

  # de-duplicate whatever was found in the summarized text
  keywords_found = list(set(keyword_processor.extract_keywords(summarytext)))
  print ("keywords of the summarized text: ",keywords_found)

  # keep the original text's ordering, restricted to the keywords found in the summary
  important_keywords = [candidate for candidate in keywords if candidate in keywords_found]

  # return only 4 matched keywords
  return important_keywords[:4]

time: 39.3 ms (started: 2022-04-20 17:27:05 +00:00)


## Load our Fine-Tuned Question Generation T5 model (our 2nd model)

In [None]:
# load our fine-tuned T5 question generation model from the mounted Google Drive folder
question_model = T5ForConditionalGeneration.from_pretrained('/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/model')

# load the T5 tokenizer that was saved next to the question generation model
question_tokenizer = T5Tokenizer.from_pretrained('/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/tokenizer')


# move the question generation model to the device selected earlier in the notebook
question_model = question_model.to(device)

time: 19.4 s (started: 2022-04-20 17:27:05 +00:00)


## Decoding Strategies

### Beam Search

In [None]:
def beam_search(input_ids):
    """
    Beam Search Decoding Strategy

    Runs the global question_model on the encoded input with 5 beams
    and returns the single generated sequence.
    """

    generation_options = dict(
        max_length=72,  # max length of the output
        num_beams=5,  # number of beams explored during the search
        no_repeat_ngram_size=3,  # size of the n-grams that generate() is told not to repeat
        num_return_sequences=1,  # only one generated sequence is returned
        early_stopping=True,  # so that the generation is finished when all beam hypotheses reached the EOS token (</s>)
    )

    # input_ids are the token ids of the encoded "context: ... answer: ..." text
    return question_model.generate(input_ids=input_ids, **generation_options)

time: 3.43 ms (started: 2022-04-20 17:27:25 +00:00)


### Greedy Search

In [None]:
def greedy_search(input_ids):
  """
  Greedy Search Decoding Strategy

  Calls generate() of the global question_model with only a length limit
  (no beams and no sampling are requested here).
  """

  generation_options = dict(max_length=50)

  return question_model.generate(input_ids, **generation_options)

time: 1.8 ms (started: 2022-04-20 17:27:25 +00:00)


### Random Sampling

In [None]:
def sampling_decoding(input_ids):
  """
  Sampling Decoding Strategy

  Generates with the global question_model using plain random sampling.
  """

  generation_options = dict(
      do_sample=True,  # activate sampling
      max_length=72,
      top_k=0,  # deactivate top_k sampling
  )

  return question_model.generate(input_ids, **generation_options)

time: 7.4 ms (started: 2022-04-20 17:27:25 +00:00)


### Random Sampling with Temperature

In [None]:
def sampling_with_temperature_decoding(input_ids):
  """
  Sampling with Temperature Decoding Strategy

  Same as plain sampling, with a temperature of 0.7 passed to generate().
  """

  generation_options = dict(
      do_sample=True,
      max_length=72,
      top_k=0,  # top_k sampling stays deactivated
      temperature=0.7,  # use temperature to decrease the sensitivity to low probability candidates
  )

  return question_model.generate(input_ids, **generation_options)

time: 5.71 ms (started: 2022-04-20 17:27:25 +00:00)


### Top-K Sampling

In [None]:
def topK_sampling(input_ids):
  """
  Top-K Sampling Decoding Strategy

  Generates with the global question_model using sampling with top_k set to 50.
  """

  generation_options = dict(
      do_sample=True,
      max_length=72,
      top_k=50,  # set top_k to 50
  )

  return question_model.generate(input_ids, **generation_options)

time: 6.89 ms (started: 2022-04-20 17:27:25 +00:00)


### Top-p (nucleus) Sampling

In [None]:
def nucleus_sampling(input_ids):
  """
  Top-p (nucleus) Sampling Decoding Strategy

  Generates with the global question_model using sampling with top_p set to 0.92.
  """

  generation_options = dict(
      do_sample=True,
      max_length=72,
      top_p=0.92,  # sample only from 92% most likely words
      top_k=0,  # deactivate top_k sampling
  )

  return question_model.generate(input_ids, **generation_options)

time: 5.1 ms (started: 2022-04-20 17:27:25 +00:00)


## Function for encoding the input, passing it to the selected Decoding Strategy and decoding the newly generated output

In [None]:
def get_question(context,answer,model,tokenizer, decoding_method):
  """
  Generate a question for the given answer using our fine-tuned T5 model.

  Steps:
  1. encodes "context: <context> answer: <answer>" (truncated to 384 tokens)
  2. passes the token ids to the selected Decoding Strategy, which generates the question
  3. decodes the generated question and removes the "question:" prefix

  context: the text the question is about (the summarized text)
  answer: the keyword that the question should have as its answer
  model: kept for interface compatibility; it is not used here because the
         decoding functions generate with the global question_model
  tokenizer: the tokenizer used for both encoding and decoding
  decoding_method: one of "Beam Search", "Greedy Search", "Sampling",
                   "Sampling w Temp", "Top-K", "Top-p"
  returns: the generated question as a string
  raises: ValueError when decoding_method is not one of the labels above
          (for example when nothing has been selected in the GUI)
  """

  # label shown in the GUI => function implementing that decoding strategy
  decoding_strategies = {
      "Beam Search": beam_search,
      "Greedy Search": greedy_search,
      "Sampling": sampling_decoding,
      "Sampling w Temp": sampling_with_temperature_decoding,
      "Top-K": topK_sampling,
      "Top-p": nucleus_sampling,
  }

  # fail with a clear message instead of an IndexError later on
  if decoding_method not in decoding_strategies:
    raise ValueError(
        "Unknown decoding method: {!r}. Expected one of: {}".format(
            decoding_method, ", ".join(decoding_strategies)))

  text = "context: {} answer: {}".format(context,answer)
  encoding = tokenizer.encode_plus(text,max_length=384, pad_to_max_length=False,truncation=True, return_tensors="pt").to(device)

  # get the generated output from the selected decoding method
  outs = decoding_strategies[decoding_method](encoding["input_ids"])

  # decode the generated output => question
  question = tokenizer.decode(outs[0], skip_special_tokens=True)

  # skip the string "question:" => contain only the real question,
  # then remove leading and trailing whitespaces
  return question.replace("question:","").strip()

time: 29.6 ms (started: 2022-04-20 17:27:25 +00:00)


## Word Embedding functions

In [None]:
def get_distractors_wordnet (word):

    """
    This function is called when the WordNet radiobutton is selected by the user

    Finds co-hyponyms of the given answer/word: the hyponyms (sub-categories) of
    the first hypernym (higher-level category) of the word's first noun synset.

    word: the answer to find distractors for (may contain several words)
    returns: a list with at most 3 distractors; an empty list when the word has
             no noun synset, no hypernym, or when the lookup fails
    """

    distractors=[] # initalize a list for adding the distractors of the given word

    try:
      # normalize BEFORE the lookup: lowercase, and multi-word entries are
      # looked up with underscores instead of spaces
      lookup_word = word.lower().strip().replace(" ", "_")

      synsets = wn.synsets(lookup_word,'n') # get noun synsets => thus, 'n'
      if not synsets:
          print ("Wordnet distractors not found")
          return distractors

      # hypernym is the higher-level category => we are looking for hyponyms - sub-categories
      hypernyms = synsets[0].hypernyms()

      # no hypernym => there is no category to take sibling words from => no distractors found
      if not hypernyms:
          return distractors

      for item in hypernyms[0].hyponyms():
          name = item.lemmas()[0].name()

          # skip the hyponym that is the given word itself
          # (compared in the same lowercase/underscore form as the lookup word)
          if name.lower() == lookup_word:
              continue

          # replace the underscores with spaces and capitalize every word
          name = " ".join(w.capitalize() for w in name.replace("_", " ").split())

          # check that the found hyponym is not empty and is not already in the list => append it to the list
          if name and name not in distractors:
              distractors.append(name)
    except Exception: # best effort: a failed lookup means no distractors, not a crash
      print ("Wordnet distractors not found")


    # return at most 3 distractors
    return distractors[:3]

time: 17.6 ms (started: 2022-04-20 17:27:25 +00:00)


In [None]:
def get_distractors_sense2vec (word):

  """
    This function is called when the Sense2Vec radiobutton is selected by the user

    Looks up the best sense of the word in the global s2v vectors and uses the
    3 most similar entries as distractors.

    word: the answer to find distractors for (may contain several words)
    returns: a list with up to 3 title-cased distractors, without duplicates and
             without the answer itself; an empty list when the word has no sense
  """

  # the lookup is done on the lowercased word with underscores instead of spaces
  lookup_word = word.lower().replace(" ", "_")

  sense = s2v.get_best_sense(lookup_word)

  # the word has no sense => no distractors (always a list, so callers can iterate it)
  if not sense:
    return []

  # form of the answer that is comparable to the cleaned candidates below
  answer_text = lookup_word.replace("_", " ")

  candidates = []
  for similar in s2v.most_similar(sense, n=3):
      # similar[0] looks like "some_word|SENSE" => keep the word part, with spaces
      candidate = similar[0].split("|")[0].replace("_", " ").lower()

      if candidate != answer_text:
          candidates.append(candidate.title())

  # remove the duplicates while keeping the order
  return list(OrderedDict.fromkeys(candidates))

time: 11.5 ms (started: 2022-04-20 17:27:25 +00:00)


## Prepare GUI

In [None]:
# multi-line text box in which the user enters the paragraph/content
context = gr.inputs.Textbox(lines=10, placeholder="Enter paragraph/content here...")
# the result is displayed as HTML
output = gr.outputs.HTML(label="Question and Answers")

# radio buttons: the functions of this notebook compare the selected label against these exact strings
Decoding_Strategy = gr.inputs.Radio(["Beam Search", "Greedy Search", "Sampling", "Sampling w Temp", "Top-K", "Top-p"])
Word_Embedding = gr.inputs.Radio(["Wordnet", "Sense2Vec"])

time: 3.84 ms (started: 2022-04-20 17:27:25 +00:00)


In [None]:
def third_model_stage1(context, Decoding_Strategy, Word_Embedding):
  """
  GUI callback: summarizes the given text and generates one question (with its
  answer and distractors) for each keyword shared by the text and its summary.

  context: the text entered by the user
  Decoding_Strategy: label of the decoding strategy, passed on to get_question()
  Word_Embedding: "Wordnet" => distractors from WordNet, anything else => Sense2Vec
  returns: an HTML string with the questions, answers and distractors, followed
           by the summary in which the keywords are shown in bold

  All the text inserted in the HTML is escaped, because it originates from the
  user's input and from the models' output.
  """
  import html
  import re

  # get summary of the user's given text
  summary_text = summarizer(context,summary_model,summary_tokenizer)

  # print the summary for debugging purposes
  for wrp in wrap(summary_text, 150):
    print (wrp)

  # get the 4 keywords that are contained in both original text (user's input) and summarized text
  keywords = get_keywords(context,summary_text)

  html_parts = [] # pieces of the GUI's output, joined at the end

  # for each keyword => generate a question
  for answer in keywords:
    ques = get_question(summary_text,answer,question_model,question_tokenizer, Decoding_Strategy) # generate a question

    # if Wordnet is selected => get distractors from wordnet for the specific keyword/answer
    if Word_Embedding == "Wordnet":
      distractors = get_distractors_wordnet(answer)
    else:
      # Otherwise, sense2vec is selected => get distractors from Sense2Vec for the specific keyword/answer
      distractors = get_distractors_sense2vec(answer)

    html_parts.append("<b style='color:blue;'>" + html.escape(ques) + "</b>" + "<br>")
    html_parts.append("<b style='color:green;'>" + "Ans: " + html.escape(answer.capitalize()) + "</b>" + "<br>")
    for distractor in distractors:
      # skip blank entries so that no empty line is displayed
      if not distractor.strip():
        continue
      html_parts.append("<b style='color:brown;'>" + html.escape(distractor) + "</b>" + "<br>")
    html_parts.append("<br>")

  # make bold the keywords that appear in the summary: both the keyword as it is
  # and its capitalized form. Everything is matched in a single pass, so that a
  # keyword contained in another keyword does not produce nested <b> tags
  variants = set()
  for answer in keywords:
    variants.add(answer)
    variants.add(answer.capitalize())
  variants.discard("")

  if variants:
    # longest first, so the longer keyword wins when two of them start at the same place
    ordered = sorted(variants, key=lambda v: (-len(v), v))
    pattern = re.compile("(" + "|".join(re.escape(v) for v in ordered) + ")")

    # with one capturing group, split() alternates: plain text, keyword, plain text, ...
    pieces = pattern.split(summary_text)
    highlighted = "".join(
        "<b>" + html.escape(piece) + "</b>" if index % 2 else html.escape(piece)
        for index, piece in enumerate(pieces))
  else:
    highlighted = html.escape(summary_text)

  # add the summary to the output
  html_parts.append("<p>" + "Summary: " + highlighted + "</p>")

  return "".join(html_parts)

time: 33.3 ms (started: 2022-04-20 17:27:25 +00:00)


In [None]:
# build the GUI around third_model_stage1; the inputs are listed in the same
# order as the function's parameters (context, Decoding_Strategy, Word_Embedding)
iface = gr.Interface(
  fn = third_model_stage1, 
  inputs = [context, Decoding_Strategy, Word_Embedding],
  outputs = output)

time: 703 ms (started: 2022-04-20 17:27:25 +00:00)


## Test GUI

In [None]:
# debug=True keeps this cell running so that errors and logs are shown in its output;
# interrupt the cell to stop it
iface.launch(debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://36672.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


  next_indices = next_tokens // vocab_size


Cristiano ronaldo dos santos aveiro goih comm is a professional footballer from portugal. He has won five ballon d'or awards and four european golden
shoes - the most by european player! 'he is one of the few players to have made over 1,100 professional career appearances'
n_best keyphrases:  [('cristiano ronaldo', 0.11775058495563419), ('career', 0.06976319968199642), ('league titles', 0.06835673132673951), ('player', 0.06550692753228861), ('santos aveiro goih', 0.058267767302060514), ('championship', 0.03604528347689767), ('pronunciation', 0.035560509255193475), ('appearances', 0.03471596266565655), ('goals', 0.03448301764111035), ('ronaldo', 0.03417992683059817), ('kɾiʃˈtjɐnu ʁɔˈnaɫdu', 0.03236142296157868), ('trophies', 0.028414826270788663), ('uefa', 0.027138994697160322), ('premier league club manchester united', 0.026434702414690616), ('forward', 0.025963639725860452)]
keywords of the original text:  ['cristiano ronaldo', 'career', 'league titles', 'player', 'santos aveiro goih'

  next_indices = next_tokens // vocab_size


Wordnet distractors not found
Wordnet distractors not found
Cristiano ronaldo dos santos aveiro goih comm is a professional footballer from portugal. He has won five ballon d'or awards and four european golden
shoes - the most by european player! 'he is one of the few players to have made over 1,100 professional career appearances'
n_best keyphrases:  [('cristiano ronaldo', 0.11775058495563419), ('career', 0.06976319968199642), ('league titles', 0.06835673132673951), ('player', 0.06550692753228861), ('santos aveiro goih', 0.058267767302060514), ('championship', 0.03604528347689767), ('pronunciation', 0.035560509255193475), ('appearances', 0.03471596266565655), ('goals', 0.03448301764111035), ('ronaldo', 0.03417992683059817), ('kɾiʃˈtjɐnu ʁɔˈnaɫdu', 0.03236142296157868), ('trophies', 0.028414826270788663), ('uefa', 0.027138994697160322), ('premier league club manchester united', 0.026434702414690616), ('forward', 0.025963639725860452)]
keywords of the original text:  ['cristiano ronaldo

  next_indices = next_tokens // vocab_size


KeyboardInterrupt: ignored

time: 1min 8s (started: 2022-04-20 17:27:25 +00:00)


# **THIRD MODEL - STAGE 2**
#### Note: THIRD MODEL needs to run before the FINAL MODEL runs

## Installed Libraries

In [None]:
# library for using the Normalized Levenshtein distance
!pip install --quiet strsim==0.0.3

time: 3.64 s (started: 2022-04-20 17:29:26 +00:00)


In [None]:
from similarity.normalized_levenshtein import NormalizedLevenshtein
normalized_levenshtein = NormalizedLevenshtein()

time: 2.24 ms (started: 2022-04-20 17:29:30 +00:00)


## Levenshtein Distance Filtering

In [None]:
def get_highest_similarity_score(wordlist, wrd):
  """
  Compare the given word to every word of the given list, case-insensitively,
  using the normalized Levenshtein similarity, and return the highest score found.

  wordlist: the words to compare against (must not be empty, max() fails otherwise)
  wrd: the word that is being checked
  """

  return max(
      normalized_levenshtein.similarity(candidate.lower(), wrd.lower())
      for candidate in wordlist
  )

time: 4.62 ms (started: 2022-04-20 17:29:25 +00:00)


## Base Sense Filtering

In [None]:
def filter_same_sense_words(original, wordlist):
  """
  Return the words of the given wordlist that have the same sense as the answer.

  original: the answer in "word|SENSE" form
  wordlist: entries whose first element is a "word|SENSE" string
  returns: the matching words, cleaned for display: the underscores are replaced
           with spaces, every word is title-cased and the surrounding whitespace is removed
  """

  # the sense is the part after the "|"
  base_sense = original.split('|')[1]

  return [
      entry[0].split('|')[0].replace("_", " ").title().strip()
      for entry in wordlist
      if entry[0].split('|')[1] == base_sense
  ]

time: 8.1 ms (started: 2022-04-20 17:29:24 +00:00)


## Sense2Vec

In [None]:
def sense2vec_get_words_(question, word, topn):
    """
    this function returns filtered distractors by doing the following:
    1. gets the sense of the given word (spaces are replaced with underscores for the lookup)
    2. finds the most similar words
    3. calls the filter_same_sense_words() function to filter the similar words to have the same sense with the given word
    4. applies extra filtering using three requirements
        - the word is not part of the question (case-insensitive)
        - the word has not been found before (case-insensitive)
        - calls the get_highest_similarity_score() function: the Levenshtein similarity
          to the answer and to every word kept so far must be below the threshold
    5. returns the filtered words

    question: the generated question
    word: the answer to find distractors for
    topn: how many similar entries are requested from sense2vec
    returns: the list of the filtered distractors (without the answer); an empty
             list when the word has no sense or when the lookup fails
    """

    output = []

    try:
      # multi-word answers are looked up with underscores instead of spaces
      lookup_word = word.replace(" ", "_")

      # if no sense => None
      sense = s2v.get_best_sense(lookup_word, senses= ["NOUN", "PERSON","PRODUCT","LOC","ORG","EVENT","NORP","WORK OF ART","FAC","GPE","NUM","FACILITY"])

      if sense is not None:
        # gets the n most similar entries and keeps the ones with the same sense as the answer
        most_similar = s2v.most_similar(sense, n=topn)
        output = filter_same_sense_words(sense, most_similar)

    except Exception:
      # best effort: a failed lookup gives no distractors, but the reason is reported
      traceback.print_exc()
      output = []


    threshold = 0.6
    final = [word] # initialize final list with the answer

    # the question in comparable forms: lowercased, and as words without the punctuation around them
    question_lower = question.lower()
    question_words = {token.strip(string.punctuation).lower() for token in question.split()}

    # loop through the filtered words with the same sense of the given word
    for candidate in output:
      candidate_lower = candidate.lower()

      # is part of the question => skip (a multi-word candidate is searched as a phrase)
      if " " in candidate_lower:
        in_question = candidate_lower in question_lower
      else:
        in_question = candidate_lower in question_words
      if in_question:
        continue

      # is already in the final list => skip
      if any(candidate_lower == kept.lower() for kept in final):
        continue

      # is too similar (score of 0.6 or more) to the answer or to a word already kept => skip
      if get_highest_similarity_score(final, candidate) < threshold:
        final.append(candidate)


    # the first word is the given answer => skip it
    return final[1:]

time: 16.2 ms (started: 2022-04-20 18:02:17 +00:00)


## Prepare GUI

In [None]:
# GUI components for stage 2; these assignments replace the stage 1 components that use the same names
# multi-line text box in which the user enters the paragraph/content
context = gr.inputs.Textbox(lines=10, placeholder="Enter paragraph/content here...")
# the result is displayed as HTML
output = gr.outputs.HTML(  label="Question and Answers")

# radio buttons for the decoding strategy; get_question() compares the selected label against these exact strings
Decoding_Strategy = gr.inputs.Radio(["Beam Search", "Greedy Search", "Sampling", "Sampling w Temp", "Top-K", "Top-p"])

time: 3.08 ms (started: 2022-04-20 18:02:13 +00:00)


In [None]:
def _highlight_keywords(summary_text, keywords):
  """Return `summary_text` HTML-escaped, with every keyword occurrence wrapped in <b> tags.

  Each keyword is matched exactly as given and in its `str.capitalize()` form.
  All keywords are matched in a single pass over the raw text, so a keyword is
  never wrapped twice, markup inserted for one keyword is never searched for
  another, and a longer phrase takes precedence over a shorter keyword it contains.
  """
  import html
  import re

  # both spellings of every keyword; empty strings are dropped because an empty
  # alternative would match at every position of the text
  forms = {form for keyword in keywords for form in (keyword, keyword.capitalize()) if form}
  if not forms:
    return html.escape(summary_text, quote=False)

  # longest alternatives first, so a phrase wins over a shorter keyword inside it
  ordered = sorted(forms, key=lambda form: (-len(form), form))
  pattern = re.compile("(" + "|".join(re.escape(form) for form in ordered) + ")")

  # with exactly one capturing group, re.split alternates between plain text
  # (even indexes) and matched keywords (odd indexes)
  pieces = pattern.split(summary_text)
  return "".join(
    "<b>" + html.escape(piece, quote=False) + "</b>" if index % 2 else html.escape(piece, quote=False)
    for index, piece in enumerate(pieces))


def third_model_stage2(context, Decoding_Strategy):
  """Build the HTML shown in the GUI: questions, answers, distractors and the summary.

  Steps:
    1. summarizes `context` with summarizer()
    2. extracts the keywords/answers with get_keywords()
    3. for every keyword generates a question with get_question() and fetches
       distractors with sense2vec_get_words_() (at most 3 of them are displayed)
    4. appends the summary with the keywords in bold

  Args:
    context: the original text entered by the user.
    Decoding_Strategy: the decoding strategy selected in the GUI; it is passed
      unchanged to get_question().

  Returns:
    A string of HTML. All generated/user-derived text is HTML-escaped before it
    is placed inside the markup, so it is displayed literally instead of being
    interpreted as tags.
  """
  import html  # local import keeps this cell self-contained

  def as_html(text):
    # the text is only ever placed in element content, so quotes can stay untouched
    return html.escape(text, quote=False)

  # get the summary of the original text given by the user
  summary_text = summarizer(context,summary_model,summary_tokenizer)

  # print the summary to the console for debugging purposes
  for wrp in wrap(summary_text, 150):
    print (wrp)

  # find the matched keywords between the original and the summarized texts
  # (named `keywords` rather than `np`, which is the conventional alias of numpy)
  keywords = get_keywords(context,summary_text)

  # print the extracted noun keywords to the console for debugging purposes
  print ("\n\nNoun phrases",keywords)

  # HTML fragments that are joined into the final GUI output
  parts = []

  # loop through the matched extracted keywords
  for answer in keywords:
    # generate question for each keyword/answer
    ques = get_question(summary_text, answer, question_model, question_tokenizer, Decoding_Strategy)

    # get distractors from Sense2Vec for the specific keyword/answer
    distractors = sense2vec_get_words_(ques, answer.capitalize(), 40)

    parts.append("<b style='color:blue;'>" + as_html(ques) + "</b>")
    parts.append("<br>")
    parts.append("<b style='color:green;'>" + "Ans: " + as_html(answer.capitalize()) + "</b>" + "<br>")
    for distractor in distractors[:3]: # add only 3 distractors
      parts.append("<b style='color:brown;'>" + as_html(distractor) + "</b>" + "<br>")
    parts.append("<br>")

  # summary with the "Summary:" prefix; the keywords/answers are shown in bold
  # for better visualization in the GUI's final output
  summary = "Summary: " + "<br>" + _highlight_keywords(summary_text, keywords)

  # add summarized text to the GUI's final output
  parts.append("<p>" + summary + "</p>")
  parts.append("<br>")

  return "".join(parts)

time: 33.2 ms (started: 2022-04-20 18:02:14 +00:00)


In [None]:
# wire the GUI together: the text box and the decoding-strategy selector are the two
# arguments of third_model_stage2, whose returned HTML string fills the output panel
iface = gr.Interface(fn=third_model_stage2, inputs=[context, Decoding_Strategy], outputs=output)

time: 772 ms (started: 2022-04-20 18:02:14 +00:00)


## Test GUI

In [None]:
# start the GUI; with debug=True the cell keeps running on Colab so that errors and logs
# stay visible in the cell output (set debug=False in launch() to turn that off)
iface.launch(debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://42263.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


  next_indices = next_tokens // vocab_size


The 2007 film is based on the toy line of the same name. It was directed by michael bay and produced by don murphy and tom desanto, and is the first
installment in the live-action transformers film series, released in january of this year, starring samuel l. eric o'donnell jr.
n_best keyphrases:  [('science fiction action film', 0.20181129865537317), ('toy line', 0.10832020939941245), ('name', 0.09431956887333845), ('computer animation', 0.08046707032337405), ('transformers', 0.07632735430252066), ('michael bay', 0.07082490122116103), ('steven spielberg', 0.06870046555098483), ('film', 0.05934794113013609), ('producer', 0.057033555828298554), ('action filming', 0.05109057283485087), ('installment', 0.04941538298171607), ('tom desanto', 0.048780867457206174), ('action transformers film series', 0.03356081144162786)]
keywords of the original text:  ['science fiction action film', 'toy line', 'name', 'computer animation', 'transformers', 'michael bay', 'steven spielberg', 'film', 'produce

  next_indices = next_tokens // vocab_size


The 2007 film is based on the toy line of the same name. It was directed by michael bay and produced by don murphy and tom desanto, and is the first
installment in the live-action transformers film series, released in january of this year, starring samuel l. eric o'donnell jr.
n_best keyphrases:  [('science fiction action film', 0.20181129865537317), ('toy line', 0.10832020939941245), ('name', 0.09431956887333845), ('computer animation', 0.08046707032337405), ('transformers', 0.07632735430252066), ('michael bay', 0.07082490122116103), ('steven spielberg', 0.06870046555098483), ('film', 0.05934794113013609), ('producer', 0.057033555828298554), ('action filming', 0.05109057283485087), ('installment', 0.04941538298171607), ('tom desanto', 0.048780867457206174), ('action transformers film series', 0.03356081144162786)]
keywords of the original text:  ['science fiction action film', 'toy line', 'name', 'computer animation', 'transformers', 'michael bay', 'steven spielberg', 'film', 'produce

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science and applied mathematics by the technical university of ostrava, czech republic. His phd work was on feature
selection and function approximation using adaptive algorithms, based on machine learning and signal processing for pattern analysis of human's
perception of the urban environment. Before this, dr ojha worked as an interdisciplinary research fellow in government of india funded-project on
mixed gases.
n_best keyphrases:  [('researcher', 0.06604301748257026), ('signal processing', 0.052690424471298666), ('pattern analysis', 0.05075647067495878), ('machine learning', 0.05070231261553641), ('computer science', 0.050150620976407684), ('phd', 0.0450487002220786), ('technology', 0.03963706060556339), ('eth zurich', 0.03820320685498657), ('dr ojha', 0.0359785640813622), ('human', 0.03313316867358486), ('switzerland', 0.031131089374770667), ('swiss national science foundation project', 0.03081604461473249), ('perception', 0.02968660539199123), ('a

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668

  next_indices = next_tokens // vocab_size


He was awarded a phd in computer science by the technical university of ostrava, czech republic. His work was on feature selection and function
approximation using adaptive algorithms. The czech university of ostreva awarded him an ivf grant for his research in machine learning and signal
processing for pattern analysis of human's perception of the urban environment.
n_best keyphrases:  [('phd', 0.07860856377369267), ('computer science', 0.05896968241102518), ('pattern analysis', 0.05576039558922247), ('signal processing', 0.054031061909360585), ('human', 0.053903423319960564), ('applied mathematics', 0.05198015316401382), ('machine learning', 0.0517701697980533), ('swiss national science foundation project', 0.05069203548304731), ('perception', 0.0496213568013842), ('technical university', 0.04961343186357499), ('ostrava', 0.0489591789742317), ('eth zurich', 0.048249923772628704), ('czech republic', 0.04719548255331527), ('switzerland', 0.04475399070468492), ('environment', 0.04236668