#  INF8225 Projet QA : HyDE vs RAG-Fusion

## Import des librairies

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Optional
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
import nltk
import faiss
import torch.nn.functional as F
from torch.utils.data import Dataset

import os
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
import itertools
import time
import warnings

## Déclarations des constantes

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Default number of sequences per batch (default `batch_size` of encode_sequences).
BATCH_SIZE = 16

## Chargement des données

In [3]:
# All CSV inputs are read from the `data/` folder next to the notebook.
root_path = './'
data_path = root_path + 'data/'

# The three question files (test, train, val) followed by the passage corpus.
questions_test, questions_train, questions_val = (
    pd.read_csv(data_path + file_name)
    for file_name in ('questions_test.csv', 'questions_train.csv', 'questions_val.csv')
)
texts = pd.read_csv(data_path + 'texts.csv')

# Columns used downstream, as plain Python lists.
passages = texts['text'].tolist()
questions_t, questions_tr, questions_v = (
    frame['question'].tolist()
    for frame in (questions_test, questions_train, questions_val)
)

## Encoding

### Chargement de l'encodeur

In [4]:
from transformers import AutoTokenizer, AutoModel
# Hugging Face checkpoint used as the embedding (retrieval) encoder.
model_name = 'BAAI/bge-base-en-v1.5'
emb_tokenizer = AutoTokenizer.from_pretrained(model_name)
emb_model = AutoModel.from_pretrained(model_name)
# Move the encoder to DEVICE; as the last expression of the cell, this also
# displays the module structure in the notebook output.
emb_model.to(DEVICE)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
from peft import LoraConfig, get_peft_model

# LoRA adapters are attached to the modules named "query", "key" and "value"
# (the self-attention projections shown in the encoder's printed structure).
# NOTE(review): these settings presumably have to match the ones used when the
# fine-tuned checkpoint was produced - confirm before changing any of them.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "key", "value"],
    lora_dropout=0.05,
    bias="none",
)

# Wrap the encoder; from here on `emb_model` is the PEFT-wrapped model.
emb_model = get_peft_model(emb_model, lora_config)

In [None]:
# Restore the fine-tuned encoder weights.
# - `map_location=DEVICE` lets a checkpoint saved on a GPU be loaded on a
#   CPU-only machine instead of failing while deserializing CUDA tensors.
# - `weights_only=True` restricts unpickling to tensors and plain containers,
#   so a tampered checkpoint file cannot execute arbitrary code.
# The call stays the last expression of the cell so the key-matching report
# returned by `load_state_dict` is still displayed.
emb_model.load_state_dict(
    torch.load(
        "models_encoder/fine_tuned_model_weights_epoch_8.pth",
        map_location=DEVICE,
        weights_only=True,
    )
)

### Stockage dans FAISS

In [9]:
def encode_sequences(sequences: list, tokenizer, model, device=DEVICE, batch_size=BATCH_SIZE, training=False) -> torch.Tensor:
    """Encode text sequences into one embedding vector per sequence.

    Each batch is tokenized (with padding and truncation), passed through the
    encoder, and the token vectors of ``last_hidden_state`` are averaged using
    the attention mask, so padding tokens do not contribute to the mean. This
    makes the embedding of a sequence independent of the other sequences that
    happen to share its batch.

    NOTE(review): if the loaded checkpoint was fine-tuned with an unmasked mean
    over all positions, re-validate retrieval quality after this change.

    Args:
        sequences: Texts to encode.
        tokenizer: Tokenizer matching ``model``.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the tokenized batches are moved to.
        batch_size: Number of sequences encoded per forward pass.
        training: When True the model is put in train mode and gradients are
            tracked; otherwise eval mode with gradients disabled.

    Returns:
        A tensor with one row per input sequence, in input order.

    Raises:
        RuntimeError: From ``torch.cat`` when ``sequences`` is empty.
    """
    # Equivalent to model.train() / model.eval() depending on the flag.
    model.train(training)

    embeddings = []
    with torch.set_grad_enabled(training):
        for start in range(0, len(sequences), batch_size):
            batch = list(sequences[start:start + batch_size])

            # Tokenize the texts before feeding them to the model.
            inputs = tokenizer(
                batch, padding=True, truncation=True, return_tensors="pt"
            ).to(device)

            # Forward pass through the encoder.
            hidden_states = model(**inputs).last_hidden_state

            # Masked mean pooling: zero out padded positions, then divide by
            # the number of real tokens in each sequence.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (hidden_states * mask).sum(dim=1) / token_counts

            embeddings.append(pooled)

    # Concatenate the per-batch embeddings into a single output tensor.
    return torch.cat(embeddings, dim=0)

In [8]:
# Embed every passage, then copy the embeddings to host memory as a NumPy array.
passage_embed = encode_sequences(passages, emb_tokenizer, emb_model)
passage_embeddings = passage_embed.cpu().numpy()
faiss.normalize_L2(passage_embeddings) #  Normalize the vectors for the cosine-similarity approach
d = passage_embeddings.shape[1]
index = faiss.IndexFlatIP(d)  # Faiss index
# Vectors are added in the order of `passages`, i.e. the row order of `texts`.
index.add(passage_embeddings)

In [10]:
def retrieve_top_k_passages_faiss(questions: list, vector_index: faiss.Index, embedding_model_tokenizer, embedding_model, k: int) -> np.ndarray:
    """Return, for each question, the labels of its k nearest passages.

    Questions are embedded with ``encode_sequences``, L2-normalized in place
    and searched against ``vector_index``. Combined with an inner-product
    index built over normalized vectors, the ranking is by cosine similarity.

    Args:
        questions: Question texts to search for.
        vector_index: FAISS index to query (any index type; the notebook
            builds an ``IndexFlatIP``).
        embedding_model_tokenizer: Tokenizer of the embedding model.
        embedding_model: Model used to embed the questions.
        k: Number of neighbours to retrieve per question.

    Returns:
        The label array returned by ``vector_index.search``, one row per
        question and ``k`` columns.
    """
    question_embeddings = encode_sequences(questions, embedding_model_tokenizer, embedding_model)
    # FAISS works on contiguous float32 arrays; enforce that explicitly so a
    # half-precision encoder does not break the search.
    question_embeddings = np.ascontiguousarray(
        question_embeddings.cpu().numpy(), dtype=np.float32
    )
    faiss.normalize_L2(question_embeddings)
    _, top_k_indices = vector_index.search(question_embeddings, k)

    return top_k_indices

## Generator

In [None]:
from huggingface_hub import login
# Do not hard-code the access token in the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset or empty, `token=None` is
# passed so that `login` asks for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

### Chargement du Générateur

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

gen_tokenizer = AutoTokenizer.from_pretrained(model_name)
gen_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # automatic placement on the GPU when available
    torch_dtype=torch.float16,   # fp16 weights to reduce memory usage
    # Enables models that ship custom code.
    # NOTE(review): this executes code from the model repository; consider
    # dropping it if this checkpoint loads without it.
    trust_remote_code=True,
)
# The tokenizer is given the EOS token as padding token so prompts can be batched.
gen_tokenizer.pad_token = gen_tokenizer.eos_token
# Decoder-only models continue from the last position of each row, so batched
# prompts must be padded on the left; set it explicitly instead of relying on
# the tokenizer's default.
gen_tokenizer.padding_side = "left"

### Prompts

In [22]:
def generate_prompts(questions, index, emb_tokenizer, emb_model, k):
    """Build one generation prompt per question from its top-k retrieved passages.

    Each prompt consists of a fixed instruction line, the retrieved passages
    (one per line, in retrieval order) and finally the question.

    Passages are looked up by row position in the module-level ``texts``
    frame: FAISS labels are the positions at which vectors were added to the
    index, and the index is built from ``texts['text']`` in row order.
    Matching the labels against the ``id`` column is only correct when ids
    happen to equal row positions.

    Args:
        questions: Question texts.
        index: FAISS index built over the passages of ``texts``.
        emb_tokenizer: Tokenizer of the embedding model.
        emb_model: Embedding model used to encode the questions.
        k: Number of passages to retrieve per question.

    Returns:
        A list of prompt strings, one per question, in input order.
    """
    help_instruction = "Give a concise answer to the question."
    retrieved = retrieve_top_k_passages_faiss(questions, index, emb_tokenizer, emb_model, k)
    passage_texts = texts['text']

    prompts = []
    for question, passage_positions in zip(questions, retrieved):
        # FAISS fills missing neighbours with -1 when fewer than k results
        # exist; skip those instead of silently picking the last passage.
        context_passages = [
            passage_texts.iloc[int(position)]
            for position in passage_positions
            if position >= 0
        ]

        # Instruction, then the passages, then the question.
        prompts.append(
            f"{help_instruction}\n"
            + "\n".join(f"{passage}" for passage in context_passages)
            + "\n"
            + f"{question}\n"
        )
    return prompts

In [23]:
# Retrieval depths (number of passages per prompt) to build prompts for.
k_values = [1, 2, 3, 4]
# Maps each k to the list of prompts built for the validation questions.
prompts_k = {}

for k in k_values:
    prompts_k[k] = generate_prompts(questions_v, index, emb_tokenizer, emb_model, k)

# Sanity check: show the first prompt built with k = 2.
print(prompts_k[2][0])

Give a concise answer to the question.
 Polymorphism in the CYSLTR2 gene has been associated with the inheritance of asthma in separate populations. The M201V variant of CYSLTR2 exhibits decreased responsiveness to LTD4, suggesting that this hypo-responsiveness underlies its asthma transmission-protecting effect.
 The human CYSLTR2 gene maps to chromosome 13 and consists of four exons. The protein encoded by CYSLTR2 is composed of 347 amino acids and shares only 31% amino acid identity with the CysLTR1 protein. CYSLTR2 is a G protein-coupled receptor that activates the Gq alpha subunit and/or Ga subunit of its coupled G protein when bound to its CysLT ligands.
What is the effect of the M201V variant of CYSLTR2 on responsiveness to LTD4?



### Generation

In [32]:
def process_batch_prompts(batch_prompts, tokenizer, model, device, max_new_tokens=64, *, verbose=True):
    """Generate one sampled answer per prompt for a batch of prompts.

    Every prompt is wrapped in the ``<s>[INST] ... [/INST]`` instruction
    template, tokenized as one padded batch and passed to ``model.generate``
    with nucleus sampling (``top_p=0.9``, ``temperature=0.7``).

    Args:
        batch_prompts: Iterable of prompt strings.
        tokenizer: Tokenizer of the generator model.
        model: Causal language model exposing ``generate``.
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Maximum number of tokens generated per prompt.
        verbose: When True (default), print the formatted prompts and the
            decoded responses.

    Returns:
        A list with one stripped response string per prompt, in input order.
        An empty batch returns an empty list.
    """
    formatted_prompts = [
        f"<s>[INST] {prompt.strip()} [/INST]" for prompt in batch_prompts
    ]

    if verbose:
        print(formatted_prompts)

    # Nothing to tokenize or generate for an empty batch.
    if not formatted_prompts:
        return []

    # The template already spells out the BOS token (<s>); disabling automatic
    # special tokens prevents the tokenizer from prepending a second one.
    # NOTE(review): batched generation assumes the tokenizer pads on the left.
    inputs = tokenizer(
        formatted_prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        add_special_tokens=False,
    ).to(device)

    # Generation
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        # Explicit padding id for rows that finish before the others.
        pad_token_id=tokenizer.pad_token_id,
    )

    # Each output row is the (padded) prompt followed by the new tokens, so
    # slicing off the prompt length keeps only the generated answer. This does
    # not depend on the text "[/INST]" being absent from passages or answers.
    prompt_length = inputs["input_ids"].shape[1]
    responses = [
        tokenizer.decode(output[prompt_length:], skip_special_tokens=True).strip()
        for output in outputs
    ]

    if verbose:
        print(responses)

    return responses

def chunked_iterable(iterable, size):
    """Yield successive tuples of at most ``size`` items taken from ``iterable``.

    The last tuple may be shorter than ``size``; nothing is yielded once the
    input is exhausted (or at all when ``size`` is 0).
    """
    source = iter(iterable)
    # Two-argument iter(): keep calling the lambda until it returns the
    # sentinel, i.e. the empty tuple produced when `source` has run dry.
    yield from iter(lambda: tuple(itertools.islice(source, size)), ())

def generate_responses(gen_model, gen_tokenizer, prompts, batch_size=32, k=None):
    """Generate a response for every prompt, batch by batch, logging timings.

    Args:
        gen_model: Generator model passed to ``process_batch_prompts``.
        gen_tokenizer: Tokenizer of the generator model.
        prompts: Iterable of prompt strings.
        batch_size: Number of prompts generated per batch.
        k: Optional retrieval depth, used only to tag the log messages. It is
            an explicit parameter so the function does not depend on a
            module-level ``k`` existing when it is called.

    Returns:
        A flat list of responses in the same order as ``prompts``.
    """
    tag_prefix = f"k={k} | " if k is not None else ""
    batch_responses = []

    for batch_index, batch_prompts in enumerate(chunked_iterable(prompts, batch_size)):
        tag = f"[{tag_prefix}Batch {batch_index}]"
        print(f"{tag} Started generation...")
        # perf_counter is monotonic, which makes it the right clock for durations.
        start_time = time.perf_counter()

        batch_responses.extend(process_batch_prompts(batch_prompts, gen_tokenizer, gen_model, DEVICE))

        elapsed_time = time.perf_counter() - start_time
        print(f"{tag} Time for generation = {elapsed_time:.2f} seconds")
    return batch_responses

In [None]:
# Generate answers only for k = 3. `k` stays bound at module level after the
# loop and is reused by the later cells.
k_values = [3]
# Maps each k to the list of generated answers, in the order of the prompts.
generated_responses = {}
for k in k_values:
    generated_responses[k] = generate_responses(gen_model, gen_tokenizer, prompts_k[k])
    print(f"Done with k = {k}!")

### Evaluation

In [None]:
def evaluate_bleu(df_true: pd.DataFrame, df_pred: pd.DataFrame, bleu_type: int) -> float:
    """Return the mean sentence-level BLEU of predicted answers against references.

    The two frames are inner-joined on ``id``; each pair of answers is split
    on whitespace and scored with ``sentence_bleu``. NLTK warnings are
    suppressed while scoring.

    Args:
        df_true: Frame with columns ``id`` and ``answer`` (reference answers).
        df_pred: Frame with columns ``id`` and ``answer`` (predicted answers).
        bleu_type: ``1`` selects unigram weights (BLEU-1); any other value
            selects equal unigram/bigram weights (BLEU-2).

    Returns:
        The average BLEU score over the matched ids, or ``0.0`` when the two
        frames share no id (instead of dividing by zero).
    """
    df_merged = pd.merge(
        df_true[['id', 'answer']],
        df_pred[['id', 'answer']],
        on='id',
        suffixes=('_true', '_pred'),
    )

    # Nothing to score: avoid the division by zero below.
    if df_merged.empty:
        return 0.0

    weights = (1.0, 0.0, 0.0, 0.0) if bleu_type == 1 else (0.5, 0.5, 0.0, 0.0)

    def _tokens(answer):
        # Missing answers (NaN/None, e.g. empty CSV cells) become an empty
        # token list; non-string values such as numbers are stringified first.
        return [] if pd.isna(answer) else str(answer).split()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bleu_scores = [
            sentence_bleu([_tokens(reference)], _tokens(candidate), weights=weights)
            for reference, candidate in zip(df_merged['answer_true'], df_merged['answer_pred'])
        ]

    return sum(bleu_scores) / len(bleu_scores)

In [None]:
# Reference answers and generated answers for the validation set, keyed by question id.
df_true_faiss = questions_val[['id', 'answer']].copy()
df_pred_faiss = questions_val[['id']].copy()
# `k` is the module-level loop variable left over from the generation cell.
# The assignment is positional, so the generated answers are assumed to be in
# the same order as the rows of questions_val.
df_pred_faiss['answer'] = generated_responses[k]

bleu_1 = evaluate_bleu(df_true_faiss, df_pred_faiss, 1)
bleu_2 = evaluate_bleu(df_true_faiss, df_pred_faiss, 2)

print(f"Le score BLEU-1 pour k = {k} est {bleu_1:.3f}.")
print(f"Le score BLEU-2 pour k = {k} est {bleu_2:.3f}. \n")

In [None]:
# Export the validation questions with their reference and generated answers.
# `k` is the module-level loop variable left over from the generation cell.
df_export = questions_val[['id', 'question', 'answer']].copy()
df_export['generated_answer'] = generated_responses[k]
# Plain string literal: the file name has no placeholders, so no f-string is needed.
df_export.to_csv("generated_responses.csv", index=False, encoding='utf-8')