In [None]:
!pip install nltk

# 1) Table Q&A.

Nous allons utiliser TAPAS, le Table Q&A de Google entraîné sur le [WikiTableQuestions](https://paperswithcode.com/dataset/wikitablequestions) (WTQ) dataset. Ici, pas de call d'API.
TAPAS permet de répondre à une question dont la réponse se trouve dans une table.

In [None]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd

# Load the pretrained TAPAS checkpoint and its matching tokenizer.
tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-wtq')
model = TapasForQuestionAnswering.from_pretrained('google/tapas-base-finetuned-wtq')

In [None]:
# Example table held in a pandas DataFrame.
data = {
    'Actors': ["Brad Pitt", "Leonardo DiCaprio", "Tom Cruise"],
    'Age': [57, 46, 58],
    'Movies': [88, 53, 43]
}
# Every cell is cast to str: the table must be handed to the tokenizer in string form.
table = pd.DataFrame(data).astype(str)

# The question asked about the table.
# Alternative question to try: "How many movies has Leonardo DiCaprio acted in?"
queries = ["Who played 88 movies ?"]

In [None]:
# Tokenize the table together with the queries and return PyTorch tensors.
inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt")

In [None]:
# Run the model's forward pass on the tokenized table/query inputs.
outputs = model(**inputs)

In [None]:
# Convert the output logits (detached and moved to CPU) into predicted answer coordinates.
predicted_answer_coordinates = tokenizer.convert_logits_to_predictions(
    inputs, outputs.logits.detach().cpu()
)
# Display the raw prediction structure.
predicted_answer_coordinates

In [None]:
# Map each predicted two-element coordinate back to the matching cell of `table`
# and print the collected cell values next to the question they answer.
for question, coordinates in zip(queries, predicted_answer_coordinates[0]):
    answers = [
        table.iat[coordinate]
        for coordinate in coordinates
        if len(coordinate) == 2
    ]
    print(f"Question: {question}\nPredicted answer: {answers}\n")

A priori, on aurait pu s'en sortir avec des requêtes SQL ou pandas. Quel est alors l'intérêt de Table Q&A ?

 * Pas besoin de pré-coder toutes les possibilités de requêtes utilisateur -- c'est très pratique !

 * Facile d'automatiser les process lorsqu'il y a beaucoup de données différentes et que nous n'avons qu'une seule question à poser.


# 2) Text Summarization

Il suffit d'utiliser l'API d'OpenAI et le paramètre `max_tokens` pour contrôler la taille du résumé.

In [None]:
from openai import OpenAI

from credentials.keys import OPENAI_API_KEY

# Create the OpenAI client with the key imported from the local credentials module.
client = OpenAI(api_key=OPENAI_API_KEY)

# Input text to summarize
input_text = """
In a groundbreaking discovery, scientists have found evidence of water on Mars.
The discovery was made using a new technique involving spectroscopy.
"""

# Ask the chat-completions endpoint for a one-sentence summary of `input_text`.
# `max_tokens=100` is the knob the surrounding text uses to bound the summary length.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": f"Summarize the following text in a single sentence: {input_text}"}
    ],
    max_tokens=100
)

# NB: with older GPT-3.5 Turbo model
# response = client.completions.create(
#   model="gpt-3.5-turbo-instruct",
#   prompt=f"Summarize the following text: {input_text}",
#   max_tokens=100
# )

In [None]:
print(response.choices[0].message.content)

Évidemment, cette approche est limitée à des textes dont la longueur ne dépasse pas celle de la fenêtre de contexte du modèle (ici, 128k tokens). Pour résumer un texte plus long, il faudrait le découper en chunks, résumer ces chunks, agréger les résumés obtenus et ensuite les résumer à leur tour (ou alors utiliser un modèle dont la fenêtre de contexte est plus longue...)

Il y a de nombreuses façons de faire cela de façon un peu moins naïve, par exemple en déterminant les parties les plus importantes du texte et en les résumant en priorité. On peut aussi utiliser des modèles un peu plus spécialisés pour ce genre de tâches.

**Ici, nous allons résumer l'histoire de Cendrillon en implémentant cette stratégie de chunking.**

**Exercice**: Charger le livre dans Python.

In [None]:
file_path = "./cinderella.txt"  # Adjust as per your file path

# Read the whole book into memory as one UTF-8 decoded string.
with open(file_path, "r", encoding="utf-8") as file:
    text_to_summarize = file.read()

**Exercice**: définir une fonction `get_chat_completion` pour envoyer un prompt à l'API d'OpenAI et récupérer le résultat sous forme de string.

In [None]:
import os

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def get_chat_completion(messages, model='gpt-4o-mini'):
    """
    Send chat messages to the OpenAI chat-completions endpoint and return the reply text.

    Args:
        messages: Chat messages, forwarded unchanged as the request's `messages`.
        model: Name of the model to query.

    Returns:
        The `content` of the message held by the first returned choice.
    """
    # Every request is issued with temperature=0.
    request = dict(model=model, messages=messages, temperature=0)
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

Ensuite, on définit les fonctions annexes pour tokéniser le texte, le découper en chunks dont le nombre de tokens ne dépasse pas une certaine valeur (pour gérer la limite sur la longueur de la fenêtre de contexte).

In [None]:
from typing import Optional
import tiktoken


def tokenize(text: str, model: str = 'gpt-4o-mini') -> list[int]:
    '''
    Tokenize a text with the tiktoken encoding associated with an OpenAI model.

    Args:
        text: The text to encode.
        model: Model name used to look up the encoding. Defaults to 'gpt-4o-mini'.

    Returns:
        The token ids produced by the encoding (integers, not strings).
    '''
    encoding = tiktoken.encoding_for_model(model)
    return encoding.encode(text)


def chunk_on_delimiter(
        input_string: str,
        max_tokens: int,
        delimiter: str
        ) -> list[str]:
    '''
    Split a text on `delimiter` and regroup the pieces into token-bounded chunks.

    The pieces are handed to `combine_chunks` with ellipsis-on-overflow enabled.
    A warning is printed when it reports dropped pieces, and `delimiter` is
    appended to the end of every chunk that is returned.
    '''
    pieces = input_string.split(delimiter)
    merged, _indices, n_dropped = combine_chunks(
        pieces, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True
    )
    if n_dropped > 0:
        print(f"warning: {n_dropped} chunks were dropped due to overflow")
    return [merged_piece + delimiter for merged_piece in merged]


def combine_chunks(
        chunks: list[str],
        max_tokens: int,
        chunk_delimiter: str = "\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow: bool = False,
        ) -> tuple[list[str], list[list[int]], int]:
    '''
    Combine text chunks into larger blocks without exceeding a specified token count.

    Args:
        chunks: The pieces of text to combine, in order.
        max_tokens: Maximum token count (as measured by `tokenize`) of a block.
        chunk_delimiter: String inserted between the pieces joined into one block.
        header: Optional text placed at the start of every block.
        add_ellipsis_for_overflow: When True, a "..." piece is added to the current
            block in place of a dropped chunk, provided it still fits.

    Returns:
        A 3-tuple `(blocks, block_indices, dropped_chunk_count)`:
        the combined text blocks, for each block the indices of the original
        chunks it contains, and the number of chunks dropped because they
        exceed `max_tokens` on their own (with the header, if any).
    '''
    dropped_chunk_count = 0
    output: list[str] = []  # the final combined blocks
    output_indices: list[list[int]] = []  # original chunk indices of each block
    candidate: list[str] = [] if header is None else [header]
    candidate_indices: list[int] = []

    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]

        # A chunk that cannot fit in a block by itself is dropped.
        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print("Warning: Chunk overflow")
            # Count every dropped chunk, whether or not an ellipsis marks its place.
            dropped_chunk_count += 1
            if (
                    add_ellipsis_for_overflow
                    and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
            continue

        # Token count of the current block if this chunk were appended to it.
        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))
        if extended_candidate_token_count > max_tokens:
            # The block is full: emit it and start a new one with this chunk.
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header
            candidate_indices = [chunk_i]
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)

    # Emit the last block unless it holds nothing besides the optional header.
    if len(candidate) > (0 if header is None else 1):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)

    return output, output_indices, dropped_chunk_count

**Exercice** : écrire une fonction `summarize_text` qui résume un long document.

In [None]:
from tqdm import tqdm


def summarize_text(
    text: str,
    detail: float = 0,
    model: str = 'gpt-4o-mini',
    minimum_chunk_size: Optional[int] = 2000,
    chunk_delimiter: str = ".",
    verbose: bool = False,
) -> str:
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.

    Args:
        text (str): The text to be summarized.
        detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the summary.
          0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
        model (str, optional): The model to use for generating summaries. Defaults to 'gpt-4o-mini'.
        minimum_chunk_size (Optional[int], optional): Token budget of the finest chunking; the chunk size
          actually used is never smaller than this value. Defaults to 2000.
        chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
        verbose (bool, optional): When True, print the number of chunks and their token lengths.
          Defaults to False.

    Returns:
    - str: The summaries of the successive chunks, joined by blank lines.

    Raises:
        AssertionError: If `detail` is outside the [0, 1] range.

    The function first determines the number of chunks by interpolating between a minimum and a maximum chunk
    count based on the `detail` parameter. It then splits the text into chunks and summarizes each chunk.
    """

    # check detail is set correctly
    assert 0 <= detail <= 1

    # Interpolate the number of chunks to get the specified level of detail:
    # detail=0 gives a single chunk, detail=1 gives the finest chunking.
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    # Never go below one chunk, otherwise the division below would fail.
    num_chunks = max(1, int(min_chunks + detail * (max_chunks - min_chunks)))

    # Adjust chunk_size based on the interpolated number of chunks
    document_length = len(tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")

    # System message shared by all requests (sentences are separated by a space).
    system_message_content = (
        "You will be given different passages from a book one by one. "
        "Provide a summary of the following text. When summarizing, directly dive into the narrative "
        "or descriptions from the text without using introductory phrases like 'In this passage'. "
        "Directly address the main events, characters, and themes, encapsulating the essence and "
        "significant details from the text in a flowing narrative. The goal is to present a unified "
        "view of the content, continuing the story seamlessly as if the passage naturally progresses into the summary."
    )

    # Summarize the chunks one by one, in document order.
    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": chunk},
        ]
        accumulated_summaries.append(get_chat_completion(messages, model=model))

    # Compile the final summary from the partial summaries
    return '\n\n'.join(accumulated_summaries)

In [None]:
# Summarize the loaded book; detail must lie in [0, 1] and 0.25 is towards the high-level end.
summary = summarize_text(text_to_summarize, detail=0.25, verbose=True)

On enregistre à présent le résumé sous forme de PDF.

In [None]:
!pip install langchain openai tiktoken fpdf2

In [None]:
from fpdf import FPDF

class PDF(FPDF):
    """FPDF subclass that overrides the `header` and `footer` hooks of the document."""

    def header(self):
        """Draw a framed, centered 'Summary' title followed by a line break."""
        # Helvetica bold 15 ('Arial' is only a deprecated alias of this core font in fpdf2)
        self.set_font('Helvetica', 'B', 15)
        # Move to the right
        self.cell(80)
        # Framed title; keyword arguments avoid the deprecated positional `ln` parameter
        self.cell(30, 10, 'Summary', border=1, align='C')
        # Line break
        self.ln(20)

    def footer(self):
        """Draw the centered page number at the bottom of the page."""
        # Go to 1.5 cm from bottom
        self.set_y(-15)
        # Helvetica italic 8
        self.set_font('Helvetica', 'I', 8)
        # Page number
        self.cell(0, 10, 'Page %s' % self.page_no(), align='C')

pdf = PDF()
pdf.add_page()
# Helvetica is the core font that the deprecated 'Arial' alias resolves to in fpdf2.
pdf.set_font("Helvetica", size=12)
# Despite its name, `summary_utf8` holds the summary restricted to Latin-1:
# every character that cannot be encoded in Latin-1 is replaced by '?'.
summary_utf8 = summary.encode('latin-1', 'replace').decode('latin-1')
pdf.multi_cell(0, 10, summary_utf8)
pdf_output_path = "./summary.pdf"
pdf.output(pdf_output_path)

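On observe en pratique que cette approche ne fonctionne pas toujours très bien. On peut aller plus loin en utilisant des modèles dits encoder-decoder, plus à même de résoudre ce genre de tâches (comme **T5**, **BART** ou **PEGASUS** ; **BERT** et **RoBERTa** sont, eux, des modèles encoder-only, à la base de plusieurs variantes utilisées pour la summarization).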
- https://www.width.ai/post/4-long-text-summarization-methods
- https://medium.com/@pvsravanth/unlocking-the-power-of-text-summarization-with-large-language-models-llms-522372e7f9e0

En explorant HuggingFace, on peut trouver plusieurs modèles qui sont censés être performants pour cette tâche :
- **BERT_summary**: Une variante de BERT fine-tuned pour la summarization en particulier. https://huggingface.co/Shobhank-iiitdwd/BERT_summary/tree/main
- **pegasus-multi_news** : https://huggingface.co/google/pegasus-multi_news PEGASUS = Pre-training with Extracted Gap-sentences for Abstractive Summarization Sequence-to-sequence. Développé par Google.
- **longformer-base-4096** : https://huggingface.co/allenai/longformer-base-4096
- **bigbird-roberta-base**: https://huggingface.co/google/bigbird-roberta-base by Google


On peut commencer par essayer de voir si l'on peut facilement télécharger le modèle "Shobhank-iiitdwd/BERT_summary".

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Try to load the "Shobhank-iiitdwd/BERT_summary" checkpoint through the T5 tokenizer and model classes.
# NOTE(review): the checkpoint name mentions BERT while the T5 classes are used — confirm they match.
tokenizer = T5Tokenizer.from_pretrained("Shobhank-iiitdwd/BERT_summary")
model = T5ForConditionalGeneration.from_pretrained("Shobhank-iiitdwd/BERT_summary")

On tente maintenant le modèle "google/pegasus-multi_news".

In [None]:
# Try to load the PEGASUS checkpoint through the T5 tokenizer and model classes.
# The notebook text right after this cell explains that the dedicated Pegasus classes are needed instead.
tokenizer = T5Tokenizer.from_pretrained("google/pegasus-multi_news")
model = T5ForConditionalGeneration.from_pretrained("google/pegasus-multi_news")

En fait, pour que cela fonctionne, il faut charger le modèle directement via les classes plus spécialisées **PegasusTokenizer** et **PegasusForConditionalGeneration**.

In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the pre-trained PEGASUS tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-multi_news")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-multi_news")

In [None]:
import torch

def summarize_text(
        text: str,
        max_length: int = 300,
        min_length: int = 30,
        num_beams: int = 4):
    """
    Summarize a text with the module-level `model` and `tokenizer`.

    Inputs:
    -------
        text: The input text to be summarized.
        max_length: Forwarded to `model.generate` as `max_length`.
        min_length: Forwarded to `model.generate` as `min_length`.
        num_beams: Number of beams for beam search. More beams is better,
            but the computational cost grows quickly.

    Returns:
    --------
        The decoded first generated sequence, special tokens skipped.
    """
    # Prefer the GPU when one is available, and move the model there.
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    print(f'Using device {device}.')
    model.to(device)

    # Encode the prefixed text; truncation is enabled on the tokenizer call.
    prompt = "Summarize: " + text
    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True)

    print(f'Input shape: {inputs.shape}')

    # Beam-search generation settings.
    generation_options = dict(
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=num_beams,
        early_stopping=False,
    )
    generated = model.generate(inputs.to(device), **generation_options)

    # Only the first generated sequence is decoded and returned.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
summary = summarize_text(text_to_summarize)
print(summary)

Quelques remarques de ce TD
- Il y a de nombreux modèles disponibles pour une tâche spécifique.
- Commencer par tester si on arrive à y avoir accès facilement (pas d'erreur de package ou autre qui prendrait du temps à gérer).
- Il faut tout de même faire un nombre d'expériences assez significatif.
- Étant donné le livre que l'on a extrait, il vaudrait mieux bien nettoyer le texte pour éviter des ennuis inutiles.

# 3) Named Entity Recognition NER

C'est la tâche qui consiste à identifier et classifier les entités nommées (noms propres) dans des catégories prédéfinies telles que les noms de personnes, d'organisations, de lieux, de dates, de quantités, etc.

Là encore, on peut utiliser du prompting pour résoudre cette tâche. Notez, là encore, que des modèles spécialisés peuvent être plus performants et moins coûteux pour ce genre de tâches.

In [None]:
!pip install openai tiktoken

In [None]:
from openai import OpenAI

from credentials.keys import OPENAI_API_KEY

client = OpenAI(api_key=OPENAI_API_KEY)

# Example text to analyze for named entities
input_text = "Apple Inc. is headquartered in Cupertino, California."

# Construct a prompt to identify named entities.
# Each sentence ends with a period and a space so that they do not run into one another.
prompt = (
    "Identify the named entities in the following text. "
    "You should return the entities in a list format, such as '['entity1', 'entity2', 'entity3']'. "
    f"Here is the text: '{input_text}'."
)

# Request completion from OpenAI API
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": prompt}  # Chat-based message format
    ],
    max_tokens=50  # Control response length
)

In [None]:
# Process the response to extract named entities
extracted_entities = response.choices[0].message.content

print(extracted_entities)

Quels sont les modèles spécialisés pour faire de l'entity recognition ?

En cherchant NER sur HuggingFace, on obtient par exemple :
- bert-base-NER https://huggingface.co/dslim/bert-base-NER


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Build a "ner" pipeline from the model and tokenizer loaded in the previous cell.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin but I often travel to France, although my friend Matthias lives in Italy."

# Each result is a dict; the 'word' and 'entity' fields are printed below.
ner_results = nlp(example)
print([d['word'] for d in ner_results])
print([d['entity'] for d in ner_results])

['B-LOC', 'B-PER', 'B-LOC', 'B-LOC', 'B-PER']


**Exercice** : Anonymiser le texte grâce à la méthode de NER.

*Indices* : Les indices de début/fin de chaque entité dans la chaîne de caractères de départ sont donnés par `ner_results[idx]['start']` et `ner_results[idx]['end']`.

In [None]:
# Anonymize the text by replacing every detected entity with its type label.
anonymized_text = example

# Sort NER results in reverse order of their positions
# This prevents messing up the indices while replacing
ner_results = sorted(ner_results, key=lambda x: x['start'], reverse=True)

# Merge the token-level predictions that belong to one entity, so that a name split
# into several word-pieces or words is replaced by a single label instead of one
# label per piece. Spans are collected left to right as [label, start, end].
merged_spans = []
for entity in reversed(ner_results):
    # 'B-PER' -> prefix 'B', label 'PER' (e.g. PER, LOC, ORG, MISC)
    prefix, _, entity_label = entity['entity'].rpartition('-')
    start, end = entity['start'], entity['end']
    if merged_spans:
        previous = merged_spans[-1]
        gap = start - previous[2]
        # Word-pieces of one word touch each other; a continuation ('I-') tag one
        # character after the previous span is the next word of the same entity.
        continues_previous = gap <= 0 or (prefix == 'I' and gap == 1)
        if previous[0] == entity_label and continues_previous:
            previous[2] = max(previous[2], end)
            continue
    merged_spans.append([entity_label, start, end])

# Replace entities with placeholders, from the end of the text to its start
for entity_label, start, end in reversed(merged_spans):
    anonymized_text = anonymized_text[:start] + entity_label + anonymized_text[end:]

print("Anonymized Text:", anonymized_text)

Anonymized Text: My name is PER and I live in LOC but I often travel to LOC, although my friend PER lives in LOC.


## Optionnel : fine-tuning de NER sur un dataset.

*NB : ce code ne fonctionne qu'en environnement tensorflow.*

Nous pouvons explorer le dataset **conll2003**, dans lequel chaque mot est associé à son étiquette de part-of-speech (POS) et à son étiquette NER. Pour l'instant, nous n'allons pas encore évaluer les performances du modèle sur ce jeu de données.

In [None]:
# On charge le dataset
from transformers import BertTokenizerFast, BertForTokenClassification, pipeline
import numpy as np


def load_sentences(filepath: str) -> list[list[tuple[str, str]]]:
    '''
    Load a CoNLL-2003 formatted file as a list of tagged sentences.

    Each token line holds whitespace-separated columns; the first column is the
    word and the fourth one is the tag that is kept. Blank lines and
    `-DOCSTART-` lines separate sentences.

    Args:
        filepath: Path of the file to read (decoded as UTF-8).

    Returns:
        One list per sentence, each made of `(word, tag)` tuples.

    Raises:
        IndexError: If a token line has fewer than four columns.
    '''
    final, sentence = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('-DOCSTART-') or not line.strip():
                # Sentence boundary: store the sentence collected so far, if any.
                if sentence:
                    final.append(sentence)
                    sentence = []
            else:
                fields = line.split()
                sentence.append((fields[0], fields[3]))
    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        final.append(sentence)
    return final

In [None]:
base_path = './conll003-englishversion/'

train_samples = load_sentences(base_path + 'train.txt')
test_samples = load_sentences(base_path + 'test.txt')
valid_samples = load_sentences(base_path + 'valid.txt')

samples = train_samples + test_samples

# Build the label schema from every split, validation included, so that each tag
# met while preprocessing any split has an index. '_' comes first: index 0 is the
# value left in the label array at padded positions.
schema = ['_'] + sorted({tag for sentence in samples + valid_samples
                             for _, tag in sentence})

In [None]:
schema

['_',
 'B-LOC',
 'B-MISC',
 'B-ORG',
 'B-PER',
 'I-LOC',
 'I-MISC',
 'I-ORG',
 'I-PER',
 'O']

In [None]:
from transformers import AutoConfig, TFAutoModelForTokenClassification

MODEL_NAME = 'bert-base-cased'

# Configure the checkpoint with one output label per entry of `schema`,
# load the TensorFlow token-classification model, and print its summary.
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=len(schema))
model = TFAutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)
model.summary()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_sample(sample):
    '''
    Turn one tagged sentence into a list of `(sub-token id, tag)` pairs.

    Every word is split into sub-tokens by the module-level `tokenizer`, and each
    sub-token inherits the tag of its word. The sequence is wrapped with the
    tokenizer's own start ([CLS]) and end ([SEP]) token ids, both tagged 'O',
    instead of hard-coded ids that do not match the loaded vocabulary.

    Args:
        sample: A sentence as an iterable of `(word, tag)` pairs.

    Returns:
        The list of `(sub-token id, tag)` pairs, special tokens included.
    '''
    seq = [
        (subtoken, tag)
        for token, tag in sample
        for subtoken in tokenizer(token, add_special_tokens=False)['input_ids']
    ]
    return [(tokenizer.cls_token_id, 'O')] + seq + [(tokenizer.sep_token_id, 'O')]

def preprocess(samples):
    '''
    Convert tagged sentences into two zero-padded int32 arrays.

    Returns `(X, y)`, both with one row per sample and as many columns as the
    longest tokenized sentence: `X` holds the sub-token ids and `y` the index
    of each tag in `schema`. Positions past the end of a sentence stay at 0.
    '''
    label_to_id = dict(zip(schema, range(len(schema))))
    encoded = list(tqdm(map(tokenize_sample, samples)))
    width = max(len(sentence) for sentence in encoded)
    shape = (len(samples), width)
    X = np.zeros(shape, dtype=np.int32)
    y = np.zeros(shape, dtype=np.int32)
    for row, sentence in enumerate(encoded):
        ids = [subtoken_id for subtoken_id, _ in sentence]
        labels = [label_to_id[tag] for _, tag in sentence]
        # Fill the start of the row; the rest keeps its zero padding.
        X[row, :len(ids)] = ids
        y[row, :len(labels)] = labels
    return X, y

X_train, y_train = preprocess(train_samples)
X_test, y_test = preprocess(test_samples)
X_valid, y_valid = preprocess(valid_samples)

14041it [00:05, 2702.84it/s]
3452it [00:01, 2957.40it/s]
3249it [00:01, 2517.23it/s]


In [None]:
import tensorflow as tf


num_epochs = 5
batch_size = 8

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that recent Keras versions no longer accept.
optimizer = tf.keras.optimizers.Adam(learning_rate=1E-6)
# The loss is computed from logits (from_logits=True), with integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Metrics are passed as a list, the form documented for `compile`.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
# NOTE(review): the test split is used for validation during training, and the
# valid split is kept for the final evaluation in the next cell — confirm this is intended.
history = model.fit(
    tf.constant(X_train), tf.constant(y_train),
    validation_data=(X_test, y_test),
    epochs=num_epochs,
    batch_size=batch_size)

In [None]:
# Evaluate on the valid split. Note that `loss` is rebound here from the loss
# object created in the training cell to the numeric loss value.
[loss, accuracy] = model.evaluate(X_valid, y_valid)
print("Loss:%1.3f, Accuracy:%1.3f" % (loss, accuracy))