In [None]:
import torch
from transformers import (
    BertForQuestionAnswering,
    BertTokenizerFast,
)
from scipy.special import softmax
import plotly.express as px
import numpy as np
import pandas as pd



In [None]:
# Hugging Face Hub checkpoint shared by the tokenizer and the QA model.
# NOTE(review): the name suggests BERT-base (cased) fine-tuned on SQuAD 2.0 —
# confirm against the model card.
model_name = 'deepset/bert-base-cased-squad2'
# Both calls fetch their files from the Hub when they are not cached locally.
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def predict_answer(context, question):
  """Extract the most likely answer span for `question` from `context`.

  Args:
    context: Text in which to look for the answer.
    question: Question to ask about the context.

  Returns:
    A tuple `(answer, confident_score)`. `answer` is the decoded text of the
    predicted span, or None when the model selects only the [CLS] token or
    when the predicted end lies before the predicted start (empty span).
    `confident_score` is the mean of the highest start and end probabilities.

  Note:
    The input is not truncated, so a question/context pair longer than the
    model's maximum sequence length fails inside the model.
  """
  inputs = tokenizer(question, context, return_tensors='pt')

  with torch.no_grad():
    outputs = model(**inputs)

  # Convert to NumPy explicitly and normalise over the token axis only, rather
  # than relying on implicit tensor conversion and a whole-array softmax.
  start_score = softmax(outputs.start_logits.detach().cpu().numpy(), axis=-1)[0]
  end_score = softmax(outputs.end_logits.detach().cpu().numpy(), axis=-1)[0]

  start_idx = int(np.argmax(start_score))
  end_idx = int(np.argmax(end_score))

  confident_score = (start_score[start_idx] + end_score[end_idx]) / 2

  # Start and end are picked independently, so the end can precede the start;
  # that would slice an empty span and decode to '' instead of a real answer.
  if end_idx < start_idx:
    return None, confident_score

  answer_ids = inputs.input_ids[0][start_idx: end_idx + 1]
  answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
  answer = tokenizer.convert_tokens_to_string(answer_tokens)

  # A span consisting solely of [CLS] is treated as "no answer found".
  if answer == tokenizer.cls_token:
    return None, confident_score
  return answer, confident_score

In [None]:
context = """Coffee is a beverage brewed from roasted, ground coffee beans. Darkly colored, bitter, and slightly acidic, coffee has a stimulating effect on humans, primarily due to its caffeine content, but decaffeinated coffee is also commercially available. There are also various coffee substitutes.

Coffee production begins when the seeds from coffee cherries (the Coffea plant's fruits) are separated to produce unroasted green coffee beans. The "beans" are roasted and then ground into fine particles. Coffee is brewed from the ground roasted beans, which are typically steeped in hot water before being filtered out. It is usually served hot, although chilled or iced coffee is common. Coffee can be prepared and presented in a variety of ways (e.g., espresso, French press, caffè latte, or already-brewed canned coffee). Sugar, sugar substitutes, milk, and cream are often added to mask the bitter taste or enhance the flavor.

Though coffee has become a global commodity, it has a long history tied closely to food traditions around the Red Sea. The earliest credible reports of coffee drinking pertain to the plant's use among the Sufis of Yemen (southern Arabia) in the middle of the 15th century.[4][5] Up to the end of the 17th century, most of the world's coffee was imported from Yemen. But as the beverage gained in popularity, coffee started to be cultivated in Java in the 17th century, as well as in the Americas from the 18th century onward.[6]

The two most commonly grown coffee bean types are C. arabica and C. robusta.[7] Coffee plants are cultivated in over 70 countries, primarily in the equatorial regions of the Americas, Southeast Asia, the Indian subcontinent, and Africa. Green, unroasted coffee is traded as an agricultural commodity. The global coffee industry is worth $495.50 billion, as of 2023.[8] In 2023, Brazil was the leading grower of coffee beans, producing 31% of the world's total, followed by Vietnam. While coffee sales reach billions of dollars annually worldwide, coffee farmers disproportionately live in poverty. Critics of the coffee industry have pointed to its negative impact on the environment, including clearing of land for coffee growing and water use."""

In [None]:
# Sanity check: the answer is stated in the opening sentence of the context.
predict_answer(context, "What is coffee?")

('a beverage brewed from roasted, ground coffee beans', np.float32(0.8250079))

In [None]:
# The answer to this question sits in the last paragraph of the context.
predict_answer(context, "What are the most common coffee beans?")

('C. arabica and C. robusta', np.float32(0.9428095))

In [None]:
# The context does not explain how to make iced coffee; a None answer means
# predict_answer found no usable span for the question.
predict_answer(context, "How can I make Ice coffee?")

(None, np.float32(0.9893315))

One of its limitations is input length: the model accepts at most 512 tokens (question, context and special tokens combined).

In [None]:
# Token count of the context alone; the question and the special tokens are
# added on top of this when the pair is encoded for the model.
len(tokenizer.tokenize(context))

497

In [None]:
context = """Coffee is a beverage brewed from roasted, ground coffee beans. Darkly colored, bitter, and slightly acidic, coffee has a stimulating effect on humans, primarily due to its caffeine content, but decaffeinated coffee is also commercially available. There are also various coffee substitutes.

Coffee production begins when the seeds from coffee cherries (the Coffea plant's fruits) are separated to produce unroasted green coffee beans. The "beans" are roasted and then ground into fine particles. Coffee is brewed from the ground roasted beans, which are typically steeped in hot water before being filtered out. It is usually served hot, although chilled or iced coffee is common. Coffee can be prepared and presented in a variety of ways (e.g., espresso, French press, caffè latte, or already-brewed canned coffee). Sugar, sugar substitutes, milk, and cream are often added to mask the bitter taste or enhance the flavor.

Though coffee has become a global commodity, it has a long history tied closely to food traditions around the Red Sea. The earliest credible reports of coffee drinking pertain to the plant's use among the Sufis of Yemen (southern Arabia) in the middle of the 15th century.[4][5] Up to the end of the 17th century, most of the world's coffee was imported from Yemen. But as the beverage gained in popularity, coffee started to be cultivated in Java in the 17th century, as well as in the Americas from the 18th century onward.[6]

The two most commonly grown coffee bean types are C. arabica and C. robusta.[7] Coffee plants are cultivated in over 70 countries, primarily in the equatorial regions of the Americas, Southeast Asia, the Indian subcontinent, and Africa. Green, unroasted coffee is traded as an agricultural commodity. The global coffee industry is worth $495.50 billion, as of 2023.[8] In 2023, Brazil was the leading grower of coffee beans, producing 31% of the world's total, followed by Vietnam. While coffee sales reach billions of dollars annually worldwide, coffee farmers disproportionately live in poverty. Critics of the coffee industry have pointed to its negative impact on the environment, including clearing of land for coffee growing and water use.

In the late 16th century, Yemen developed a booming coffee economy. Farmers grew coffee on mountain terraces above the Tihamah plain, and trade routes linked its ports to Jeddah and Cairo. By the 17th century, coffee had surpassed the global spice trade.[53] Up to the end of the 17th century, Yemen was the world's main producer for coffee, and Mocha was the world's largest shipping port for coffee.[6][54]


A coffee can from the first half of the 20th century. From the Museo del Objeto del Objeto collection.
Meanwhile, coffee had been introduced to Brazil in 1727, although its cultivation did not gather momentum until independence in 1822.[55] After this time, massive tracts of rainforest were cleared for coffee plantations, first in the vicinity of Rio de Janeiro and later São Paulo.[56] Brazil went from having essentially no coffee exports in 1800 to being a significant regional producer in 1830, to being the largest producer in the world by 1852. Between 1910 and 1920, Brazil exported around 70% of the world's coffee; Colombia, Guatemala, and Venezuela exported 15%; and Old World production accounted for less than 5% of world exports.[57]

Many countries in Central America took up cultivation in the latter half of the 19th century, and almost all were involved in the large-scale displacement and exploitation of indigenous people. Harsh conditions led to many uprisings, coups, and bloody suppression of peasants.[58] The notable exception was Costa Rica, where a lack of ready labor prevented the formation of large farms. Smaller farms and more egalitarian conditions ameliorated unrest over the 19th and 20th centuries.[59]

Rapid growth in coffee production in South America during the second half of the 19th century was matched by an increase in consumption in developed countries, though nowhere has this growth been as pronounced as in the United States, where a high rate of population growth was compounded by doubling of per capita consumption between 1860 and 1920. Though the United States was not the heaviest coffee-drinking nation at the time (Belgium, the Netherlands and Nordic countries all had comparable or higher levels of per capita consumption), due to its sheer size, it was already the largest consumer of coffee in the world by 1860, and, by 1920, around half of all coffee produced worldwide was consumed in the US.[57]

Coffee has become a vital cash crop for many developing countries. Over 100 million people in developing countries have become dependent on coffee as their primary source of income. It has become the primary export and economic backbone for African countries like Uganda, Burundi, Rwanda, and Ethiopia,[60] as well as many Central American countries."""

In [None]:
# The extended context is well past the model's 512-token limit, which the
# tokenizer warns about when this cell runs.
len(tokenizer.tokenize(context))

Token indices sequence length is longer than the specified maximum sequence length for this model (1059 > 512). Running this sequence through the model will result in indexing errors


1059

In [None]:
# Demonstration: without truncation the over-long input makes the model raise.
# The error is caught and printed so the notebook still runs top to bottom.
try:
  print(predict_answer(context, "How many people are dependent on coffee for their income?"))
except RuntimeError as err:
  print(f"RuntimeError: {err}")

RuntimeError: The size of tensor a (1073) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
def predict_answer(context, question, max_length=512):
  """Extract the most likely answer span for `question` from `context`.

  The question/context pair is limited to `max_length` tokens. Only the
  context is truncated, so the question always reaches the model intact.

  Args:
    context: Text in which to look for the answer.
    question: Question to ask about the context.
    max_length: Maximum number of tokens in the encoded pair (default 512).

  Returns:
    A tuple `(answer, confident_score)`. `answer` is the decoded text of the
    predicted span, or None when the model selects only the [CLS] token or
    when the predicted end lies before the predicted start (empty span).
    `confident_score` is the mean of the highest start and end probabilities.
  """
  inputs = tokenizer(
      question,
      context,
      return_tensors='pt',
      truncation='only_second',  # cut the context (second sequence), never the question
      max_length=max_length,
  )

  with torch.no_grad():
    outputs = model(**inputs)

  # Convert to NumPy explicitly and normalise over the token axis only, rather
  # than relying on implicit tensor conversion and a whole-array softmax.
  start_score = softmax(outputs.start_logits.detach().cpu().numpy(), axis=-1)[0]
  end_score = softmax(outputs.end_logits.detach().cpu().numpy(), axis=-1)[0]

  start_idx = int(np.argmax(start_score))
  end_idx = int(np.argmax(end_score))

  confident_score = (start_score[start_idx] + end_score[end_idx]) / 2

  # Start and end are picked independently, so the end can precede the start;
  # that would slice an empty span and decode to '' instead of a real answer.
  if end_idx < start_idx:
    return None, confident_score

  answer_ids = inputs.input_ids[0][start_idx: end_idx + 1]
  answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
  answer = tokenizer.convert_tokens_to_string(answer_tokens)

  # A span consisting solely of [CLS] is treated as "no answer found".
  if answer == tokenizer.cls_token:
    return None, confident_score
  return answer, confident_score

In [None]:
# With truncation the call no longer crashes, but the answer is in the final
# paragraph of the context, which is cut off, so no answer is found.
predict_answer(context, "How many people are dependent on coffee for their income?")

(None, np.float32(0.99914694))

In [None]:
# Drop the first 4000 characters of the context so that its final paragraph,
# which holds the answer, falls inside the model's input window.
predict_answer(context[4000:], "How many people are dependent on coffee for their income?")

('Over 100 million', np.float32(0.8183233))

We can overcome this limit by dividing the context into overlapping chunks of sentences and querying each chunk separately.

In [None]:
def chunck_sentence(sentences, chunk_size, stride):
  """Split `sentences` into consecutive, overlapping chunks.

  Each chunk holds up to `chunk_size` items and starts `chunk_size - stride`
  items after the previous one, so neighbouring chunks share `stride` items.
  Chunks near the end may be shorter than `chunk_size`.

  Args:
    sentences: Sequence of items to group (any sliceable sequence).
    chunk_size: Maximum number of items per chunk; must be at least 1.
    stride: Number of items shared by consecutive chunks; must satisfy
      `0 <= stride < chunk_size`.

  Returns:
    A list of chunks, each a slice of `sentences`. Empty input gives `[]`.

  Raises:
    ValueError: If `chunk_size` or `stride` is outside the ranges above.
  """
  if chunk_size < 1:
    raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
  # An overlap as large as the chunk would never advance the window (a zero or
  # negative step); a negative overlap would silently skip sentences.
  if not 0 <= stride < chunk_size:
    raise ValueError(
        f"stride must satisfy 0 <= stride < chunk_size, "
        f"got stride={stride}, chunk_size={chunk_size}"
    )

  step = chunk_size - stride
  return [sentences[i: i + chunk_size] for i in range(0, len(sentences), step)]

In [None]:
# Ten numbered placeholder sentences used to illustrate the chunking helper.
sentences = [f"Sentence {number}." for number in range(1, 11)]

In [None]:
# Chunks of three sentences where neighbouring chunks share one sentence.
chunked_sentences = chunck_sentence(sentences, chunk_size=3, stride=1)
chunked_sentences

[['Sentence 1.', 'Sentence 2.', 'Sentence 3.'],
 ['Sentence 3.', 'Sentence 4.', 'Sentence 5.'],
 ['Sentence 5.', 'Sentence 6.', 'Sentence 7.'],
 ['Sentence 7.', 'Sentence 8.', 'Sentence 9.'],
 ['Sentence 9.', 'Sentence 10.']]

In [None]:
# Chunks of four sentences where neighbouring chunks share two sentences.
chunked_sentences = chunck_sentence(sentences, chunk_size=4, stride=2)
for i, chunk in enumerate(chunked_sentences):
  print(f"Chunk {i+1}: {chunk}")

Chunk 1: ['Sentence 1.', 'Sentence 2.', 'Sentence 3.', 'Sentence 4.']
Chunk 2: ['Sentence 3.', 'Sentence 4.', 'Sentence 5.', 'Sentence 6.']
Chunk 3: ['Sentence 5.', 'Sentence 6.', 'Sentence 7.', 'Sentence 8.']
Chunk 4: ['Sentence 7.', 'Sentence 8.', 'Sentence 9.', 'Sentence 10.']
Chunk 5: ['Sentence 9.', 'Sentence 10.']


In [None]:
# Chunks of four sentences where neighbouring chunks share one sentence.
chunked_sentences = chunck_sentence(sentences, chunk_size=4, stride=1)
for i, chunk in enumerate(chunked_sentences):
  print(f"Chunk {i+1}: {chunk}")

Chunk 1: ['Sentence 1.', 'Sentence 2.', 'Sentence 3.', 'Sentence 4.']
Chunk 2: ['Sentence 4.', 'Sentence 5.', 'Sentence 6.', 'Sentence 7.']
Chunk 3: ['Sentence 7.', 'Sentence 8.', 'Sentence 9.', 'Sentence 10.']
Chunk 4: ['Sentence 10.']


Working with the context

In [None]:
# NOTE(review): splitting on "\n" yields one entry per line of the context,
# i.e. whole paragraphs plus an empty string for every blank separator line,
# rather than individual sentences.
sentences = context.split("\n")
len(sentences)

19

In [None]:
# Group the context lines three at a time, sharing one line between neighbours.
chunked_sentences = chunck_sentence(sentences, chunk_size=3, stride=1)

In [None]:
# Questions to ask of every chunk of the context.
questions = [
    "What is coffee?",
    "What are the most common coffee beans?",
    "How many people are dependent on coffee for their income?",
]

# Maps each question to its best (answer, score) pair; filled in by a later cell.
answers = dict()

In [None]:
# Ask every question of every chunk and keep, per question, the answer with
# the highest confidence score seen so far.
for chunk in chunked_sentences:
  # Use a separate name so the full `context` is not overwritten, and join
  # with newlines so text from adjacent lines is never fused together.
  chunk_context = '\n'.join(chunk)
  if not chunk_context.strip():
    continue  # chunk holds only blank lines; nothing to query
  for question in questions:
    answer, score = predict_answer(chunk_context, question)
    if not answer:
      continue
    if question not in answers or score > answers[question][1]:
      answers[question] = (answer, score)

In [None]:
# Highest-scoring (answer, score) pair found for each question across all chunks.
answers

{'What is coffee?': ('a beverage brewed from roasted, ground coffee beans',
  np.float32(0.87443066)),
 'What are the most common coffee beans?': ('C. arabica and C. robusta',
  np.float32(0.95387244)),
 'How many people are dependent on coffee for their income?': ('Over 100 million',
  np.float32(0.8764832))}