## <center> École Polytechnique de Montréal <br> Département Génie Informatique et Génie Logiciel <br>  INF8460 – Traitement automatique de la langue naturelle <br> </center>
## <center> TP4 - Les LLMs et la génération augmentée de récupération (RAG) pour les questions-réponses<br> Automne 2024 </center>


In [1]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Optional
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
import nltk
import faiss

In [2]:
root_path = './'
data_path = root_path + 'data/'
questions_test = pd.read_csv(data_path + 'questions_test.csv')
texts = pd.read_csv(data_path + 'texts.csv')

In [3]:
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Modèle de génération

from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = 'microsoft/Phi-3-mini-128k-instruct'
gen_tokenizer = AutoTokenizer.from_pretrained(model_name)
gen_model = AutoModelForCausalLM.from_pretrained( 
    "microsoft/Phi-3-mini-128k-instruct",  
    device_map=DEVICE,  
    torch_dtype=torch.float16,  
    trust_remote_code=True,  
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
BATCH_SIZE = 32

In [6]:
# Modèle de plongement

from transformers import AutoTokenizer, AutoModel
model_name = 'BAAI/bge-small-en-v1.5'
emb_tokenizer = AutoTokenizer.from_pretrained(model_name)
emb_model = AutoModel.from_pretrained(model_name)
emb_model.to(DEVICE)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
def encode_sequences(sequences: list, tokenizer, model, device = DEVICE, batch_size = BATCH_SIZE):
    """
    Encode les textes en utilisant le modèle passé en paramètre pour générer les plongements des textes

    Paramètres:
    sequences    : Liste de séquence à transformer en plongements
    tokenizer   : Segmenteur du modèle de plongements
    model       : Modèle de plongements
    device      : Machine sur laquelle les opérations doivent être effectuées
    batch_size  : Taille des lots lors de la génération des traitements
    """
    # TODO

    model.eval()

    embeddings = []
    
    for i in range(0, len(sequences), batch_size):
        batch = sequences[i:i + batch_size]
    
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        
        with torch.no_grad():
            output = model(**inputs).last_hidden_state # .last_hidden_state : Dimension (torch.Size([batch_size, sequence_length, hidden_size])) -> chaque token de la séquence à un plongement de taille 384 !
            embedding = output.mean(dim=1)  # Moyenne sur la dimension des tokens (dimension sequence_length) pour avoir la représentation de la séquence
            embeddings.append(embedding)
    
    return torch.cat(embeddings, dim=0)

    # END TODO

In [8]:
passages = texts['text'].tolist()
questions = questions_test['question'].tolist()
passage_embed = encode_sequences(passages, emb_tokenizer, emb_model)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [9]:
# Index Faiss
passage_embeddings = passage_embed.cpu().numpy()

faiss.normalize_L2(passage_embeddings) 
d = passage_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(passage_embeddings)

In [10]:
def retrieve_passages_faiss(questions: list, vector_index: faiss.IndexFlatL2, k: int, embedding_model_tokenizer, embedding_model):
    """
    Retourne les k passages les plus pertinents pour chaque question passée en paramètre

    Paramètres:
    questions       : Les questions pour lesquelles on cherche les passages les plus pertinents
    vector_index    : L'objet d'indexation FAISS
    k               : le nombre de passages à retourner
    tokenizer       : Segmenteur du modèle de plongements
    model           : Modèle de plongements

    Retourne:
    Les indices des k passages les plus pertinents pour la question
    """
    question_embeddings = encode_sequences(questions, embedding_model_tokenizer, embedding_model)
    question_embeddings = question_embeddings.cpu().numpy()
    faiss.normalize_L2(question_embeddings)
    
    _, top_k_indices = vector_index.search(question_embeddings, k)
    
    return top_k_indices

In [11]:
# Création des prompts avec recherche d'informations via Faiss
k_values = [3]
prompts_k = {}

for k in k_values:
    prompts_k[k] = []
    faiss_passages = retrieve_passages_faiss(questions, index, k, emb_tokenizer, emb_model)  
    help_instruction = "Give a concise answer to the following question in a few words : "

    for i, passages_id in enumerate(faiss_passages):
        passages = []

        for passage_id in passages_id:
            passage_text = texts[texts['id'] == passage_id]['text'].iloc[0]
            passages.append(passage_text)

        context_string = " ".join(passages) + " " + help_instruction + questions[i]
        prompts_k[k].append(context_string)
        
# END TODO

In [12]:
def process_batch_prompts(batch_prompts, tokenizer, model, device):
    inputs = [
        tokenizer.apply_chat_template(
            [{'role': 'system', 'content': 'You are a helpful assistant.'},
             {'role': 'user', 'content': prompt}], 
            tokenize=False,
            add_generation_prompt=True
        )
        for prompt in batch_prompts
    ]
    
    inputs = tokenizer(inputs, return_tensors="pt", padding=True).to(device)
    
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=32
    )
    
    responses = [
        tokenizer.decode(output[inputs.input_ids.size(1):], skip_special_tokens=True)
        for output in outputs
    ]
    
    return responses

In [13]:
# Envoi des prompts au modèle pour générer les réponses

import time
k_values = [3]
generated_responses = {}

for k in k_values:
    batch_size = 8
    batch_responses = []

    for i in range(0, len(prompts_k[k]), batch_size):
        batch_prompts = prompts_k[k][i:i + batch_size]

        start_time = time.time()
        
        batch_responses.extend(process_batch_prompts(batch_prompts, gen_tokenizer, gen_model, DEVICE))
        
        end_time = time.time()
        
        print(f"[k={k} | Batch {i}] Time for generation = {end_time - start_time:.2f} seconds")

    generated_responses[k] = batch_responses

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


[k=3 | Batch 0] Time for generation = 19.50 seconds
[k=3 | Batch 8] Time for generation = 24.55 seconds
[k=3 | Batch 16] Time for generation = 22.93 seconds
[k=3 | Batch 24] Time for generation = 21.93 seconds
[k=3 | Batch 32] Time for generation = 28.47 seconds
[k=3 | Batch 40] Time for generation = 33.07 seconds
[k=3 | Batch 48] Time for generation = 62.50 seconds
[k=3 | Batch 56] Time for generation = 29.17 seconds
[k=3 | Batch 64] Time for generation = 39.92 seconds
[k=3 | Batch 72] Time for generation = 28.42 seconds
[k=3 | Batch 80] Time for generation = 20.18 seconds
[k=3 | Batch 88] Time for generation = 21.41 seconds
[k=3 | Batch 96] Time for generation = 36.13 seconds
[k=3 | Batch 104] Time for generation = 19.85 seconds
[k=3 | Batch 112] Time for generation = 26.26 seconds
[k=3 | Batch 120] Time for generation = 24.10 seconds
[k=3 | Batch 128] Time for generation = 28.07 seconds
[k=3 | Batch 136] Time for generation = 22.14 seconds
[k=3 | Batch 144] Time for generation = 32.

In [14]:
submission_df = pd.DataFrame({
    "id": questions_test["id"],
    "answer": generated_responses[3]
})

submission_df.to_csv("tp4_submission.csv", index=False)