In [7]:
# -*- coding: utf-8 -*-
"""SYNTHESE.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1DeRgXDm2Vwafea-0ytm-bJ6JUw3b5gkh

# Implémentation d'un Chatbot documentaire (RAG)

# Prérequis : Installation des librairies nécessaires
"""

# Pour importer les fichiers PDF
import os
import requests

# Extraction des fichiers PDF
!pip install PyPDF2
import PyPDF2

# Traitement du texte
!pip install nltk
!pip install re
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
!pip install -qU langchain-text-splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Modèle TF
from sklearn.feature_extraction.text import CountVectorizer
!pip install sentence-transformers

# Modèle Dense Embedding
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
from tqdm import tqdm

# Modèle Reranker
!pip install FlagEmbedding
from FlagEmbedding import FlagReranker

# Tracé des graphes de résultat
import matplotlib.pyplot as plt

# Objet text retriever
!pip install langchain
!pip install gradio
import langchain as lc
!pip install langchain --q
!pip install langchain-community --q
!pip install langchain-chroma --q
!pip install FlagEmbedding -q
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from FlagEmbedding import FlagReranker

# Modèle LLM
!pip install transformers
!pip install huggingface_hub
!pip install llama-cpp-python

# Hallucinations
from langchain.llms import LlamaCpp
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from llama_cpp import Llama
from math import *
import numpy as np
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer
import torch
from tqdm import tqdm

# Téléchargement du modèle pour hallucinations
!huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False

# Analyse de toxicité
!pip install torch
!pip install detoxify
!pip install datasets
!pip install scikit-learn
!pip install evaluate
!pip install pandas

import pandas as pd
import torch
import evaluate
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from detoxify import Detoxify
from sklearn.metrics import precision_recall_curve, auc





"""# GESTION DE LA BASE DE DONNÉES

## Etape 1 : récupération des fichiers PDFs:
"""

# Destination folder for the downloaded IPCC reports.
# NOTE(review): this path sits under the Google Drive mount point, while the
# cell that mounts Drive appears later in the notebook — confirm Drive is
# mounted before this code runs.
chemin_dossier = "/content/drive/My Drive/RAG_IPCC"

# Create the folder on first run and report which case applied.
if not os.path.exists(chemin_dossier):
    os.makedirs(chemin_dossier)
    print("Le dossier 'RAG_IPCC' a été créé avec succès.")
else:
    print("Le dossier 'RAG_IPCC' existe déjà.")

# Short name -> URL of each report to download.
urls = {
    "6th_report": "https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf",
    "ocean": "https://www.ipcc.ch/site/assets/uploads/sites/3/2022/03/02_SROCC_TS_FINAL.pdf",
    "land": "https://www.ipcc.ch/site/assets/uploads/sites/4/2022/11/SRCCL_Technical-Summary.pdf",
    "warming": "https://www.ipcc.ch/site/assets/uploads/sites/2/2022/06/SPM_version_report_LR.pdf"
}

# Download every report into the folder as "<short name>.pdf".
for name, url in urls.items():
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of saving an error page as a ".pdf".
    response.raise_for_status()
    with open(os.path.join(chemin_dossier, f"{name}.pdf"), 'wb') as file:
        file.write(response.content)
    print(f"{name} a été téléchargé.")

"""## Etape 2 : Extraction du texte des fichiers PDF"""

# Folder holding the PDF files to read.
chemin_dossier = "/content/drive/My Drive/RAG_IPCC"

# Names of all the PDF files found in that folder.
fichiers_pdf = [entry for entry in os.listdir(chemin_dossier) if entry.endswith('.pdf')]

# One record per page: source file name, zero-based page index, extracted text.
extracted_text = []

for pdf in fichiers_pdf:
    print(f"*** PROCESSING FILE : {pdf} ***")

    # Full path of the current PDF.
    chemin_pdf = os.path.join(chemin_dossier, pdf)

    # The reader works on the open binary handle, so pages are read inside the `with`.
    with open(chemin_pdf, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for page_num, page in enumerate(pdf_reader.pages):
            extracted_text.append(
                {"document": pdf, "page": page_num, "content": page.extract_text()}
            )

# Print every extracted page record.
for text in extracted_text:
    print(text)

"""## Etape 3 : Traitement du texte en chunks propres"""

#### FONCTIONS ####

# Segmentation du texte de base

def splitting_by_numer_of_words(text, chunk_size):
  """
  Split a text into chunks of at most `chunk_size` words.

  The text is first cut on newline characters; each line is then cut into
  consecutive groups of `chunk_size` whitespace-separated words, so a chunk
  never spans two lines. Lines that contain no word produce no chunk.

  Args:
    text (str): The text to split.
    chunk_size (int): Maximum number of words per chunk.

  Returns:
    list: The chunks, each one a string of words joined by single spaces.
  """
  return [
      ' '.join(tokens[start:start + chunk_size])
      for tokens in (line.split() for line in text.split('\n'))
      for start in range(0, len(tokens), chunk_size)
  ]

# Fonction de splitting par phrase

def splitting_by_sentences(text):
  """
  Split a text into sentence-like chunks.

  The text is cut on newline characters, then every line is cut on '.'.
  The pieces are returned as-is: the dots are dropped, surrounding spaces
  are kept, and empty strings appear wherever a line is empty or ends
  with a dot.

  NOTE(review): a second `splitting_by_sentences` is defined further down
  in this file and replaces this one at runtime.

  Args:
    text (str): The text to split.

  Returns:
    list: The pieces of text, in reading order.
  """
  sentences = []
  for paragraph in text.split("\n"):
    # extend() grows the list in place; concatenating with `+` would copy
    # the whole accumulated list for every line of the text.
    sentences.extend(paragraph.split("."))
  return sentences

## Quick check: split a sample sentence into chunks of at most 5 words
print(splitting_by_numer_of_words("Bonjour, aujourd'hui c'est. le 26 Mars 2019 ça marche?",5))

# Fonction de splitting par phrase
def splitting_by_sentences(text):
  """Split `text` on every '.' and return the resulting list of pieces.

  NOTE(review): this redefines the `splitting_by_sentences` declared earlier
  in the file; unlike that version, newlines are not used as separators here.
  """
  return text.split('.')


## Quick check: split a sample sentence on its dots
print(splitting_by_sentences("Bonjour, aujourd'hui c'est. le 26 Mars 2019 ça marche?"))

# Cleaning of each chunk's content.
# Characters that clean_text() replaces with a space by default.
special_chars = [" ", '-', '&', '(', ')', '_', ';', '†', '+', '–', "'", '!', '[', ']', '’', '́', '̀', '\u2009', '\u200b', '\u202f', '©', '£', '§', '°', '@', '€', '$', '\xa0', '~','\n','�']

def remove_char(text, char):
    """Return `text` with every occurrence of `char` replaced by a single space."""
    replaced = text.replace(char, ' ')
    return replaced

def remove_chars(text, chars):
    """Apply remove_char() to `text` once for each entry of `chars`, in order, and return the result."""
    cleaned = text
    for unwanted in chars:
        cleaned = remove_char(cleaned, unwanted)
    return cleaned

def remove_multiple_white_spaces(text):
    """Collapse every run of consecutive space characters into a single space.

    Only the plain space character is matched; tabs and newlines are left untouched.
    """
    return re.sub(" +", " ", text)

def clean_text(text, special_chars=special_chars):
    """Replace each entry of `special_chars` found in `text` with a space, then collapse repeated spaces.

    Args:
        text (str): The text to clean.
        special_chars (list): Characters to replace; defaults to the module-level list.

    Returns:
        str: The cleaned text.
    """
    return remove_multiple_white_spaces(remove_chars(text, special_chars))

# Filtrage des mots vides

def contains_mainly_digits(text, threshold=0.5):
    """
    Tell whether digits dominate the alphanumeric characters of a text.

    Only letters and digits are counted; every other character is ignored.

    Args:
        text (str): The text to analyse.
        threshold (float, optional): Share of digits among letters and digits
            above which the text is reported as digit-heavy. Defaults to 0.5.

    Returns:
        bool: False for an empty text; True when the text holds no letter or
            digit at all; otherwise True when the share of digits is strictly
            greater than `threshold`.
    """
    # Empty input is never reported as digit-heavy.
    if not text:
        return False

    digit_total = sum(1 for symbol in text if symbol.isdigit())
    letter_total = sum(1 for symbol in text if symbol.isalpha())
    alnum_total = digit_total + letter_total

    # A text with neither letters nor digits is treated as digit-heavy.
    if alnum_total <= 0:
        return True

    return digit_total / alnum_total > threshold

def remove_mostly_digits_chunks(chunks, threshold=0.5):
  """
  Drop the chunks whose content is mostly digits.

  Args:
    chunks (list): Chunk records; each one must expose its text under the 'content' key.
    threshold (float, optional): Passed to contains_mainly_digits(). Defaults to 0.5.

  Returns:
    list: The records for which contains_mainly_digits() is False, in their original order.
  """
  # Forward `threshold`: without it the parameter has no effect and the
  # callee always falls back to its own default.
  return [chunk for chunk in chunks
          if not contains_mainly_digits(chunk['content'], threshold)]

#### EXECUTION ####


# Recursive character splitter configured with chunk_size=500 and
# chunk_overlap=20; lengths are measured with len() and separators are
# treated as literal strings.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=20,
    chunk_size=500,
)

# Split every extracted page, clean each piece and keep track of the
# document and page it came from.
chunks = []
for page_content in extracted_text:
  # Alternative splitters kept for reference:
  # chunks_list = splitting_by_numer_of_words(page_content['content'])
  # chunks_list = splitting_by_sentences(page_content['content'])
  chunks_list = text_splitter.split_text(page_content['content'])

  chunks.extend(
      {"document": page_content['document'],
       "page": page_content['page'],
       "content": clean_text(piece)}
      for piece in chunks_list
  )

# Discard the chunks that are mostly digits, then print what is left.
chunks = remove_mostly_digits_chunks(chunks)
print(chunks)

"""# COMPARAISON DES MODELES DE RECHERCHE (Information Retrieval)"""



"""## Etape 1 : Implémentation du modèle BOW (TF-IDF)

"""



"""## Etape 2 : Implémentation du modèle Dense Embeding"""





"""# IMPLEMENTATION DU MODELE DE RECHERCHE RETENU"""

class TextRetriever:
    """Dense retriever backed by a Chroma vector store, with an optional reranking step.

    Typical use: build the instance, call store_embeddings() (or
    load_embeddings() for an existing store), then query it with
    get_best_chunks(), rerank_chunks() or get_context().
    """

    def __init__(self, embedding_model_name="mixedbread-ai/mxbai-embed-large-v1", reranking_model_name="BAAI/bge-reranker-large"):
        """
        Load the embedding and reranking models.

        Args:
            embedding_model_name (str): Name of the embedding model.
            reranking_model_name (str): Name of the reranking model.
        """
        self.embedding_model = SentenceTransformerEmbeddings(model_name=embedding_model_name)
        self.reranker_model = FlagReranker(reranking_model_name, use_fp16=True)
        # No vector store until store_embeddings() or load_embeddings() is called.
        self.vector_database = None

    def _require_database(self):
        """Return the vector store, or raise RuntimeError when none has been set up yet."""
        if self.vector_database is None:
            raise RuntimeError(
                "Vector database is not initialised: call store_embeddings() "
                "or load_embeddings() first."
            )
        return self.vector_database

    @staticmethod
    def _chunk_text(chunk):
        """Return the text of a chunk given either as an object with `page_content` or as a plain string."""
        return getattr(chunk, "page_content", chunk)

    def store_embeddings(self, chunks, path="./chroma_db"):
        """
        Embed the text chunks and store them in a vector database.

        Args:
            chunks (list of str): Text chunks to store.
            path (str): Directory where the database is persisted.
        """
        self.vector_database = Chroma.from_texts(chunks, embedding=self.embedding_model, persist_directory=path)

    def load_embeddings(self, path):
        """
        Open a vector database previously persisted on disk.

        Args:
            path (str): Directory of the database.
        """
        # The Chroma constructor expects `embedding_function`; `embedding` is
        # only the keyword used by Chroma.from_texts().
        self.vector_database = Chroma(persist_directory=path, embedding_function=self.embedding_model)

    def get_best_chunks(self, query, top_k=3):
        """
        Search the vector database for the chunks closest to a query.

        Args:
            query (str): Search query.
            top_k (int): Number of chunks to return.

        Returns:
            list: The `top_k` results of the similarity search.

        Raises:
            RuntimeError: If no vector database has been stored or loaded.
        """
        return self._require_database().similarity_search(query, k=top_k)

    def rerank_chunks(self, query, chunks=None):
        """
        Order candidate chunks by the reranker score, most relevant first.

        Args:
            query (str): Search query.
            chunks (list, optional): Candidates to rerank, either plain strings
                or objects exposing `page_content`. When None, the 10 best
                chunks of the similarity search are used as candidates.

        Returns:
            list: The candidates, sorted by decreasing reranker score.
                Candidates with equal scores keep their input order.
        """
        if chunks is None:
            chunks = self.get_best_chunks(query, top_k=10)
        candidates = list(chunks)
        if not candidates:
            return []

        # Score all (query, text) pairs in a single call.
        pairs = [[query, self._chunk_text(candidate)] for candidate in candidates]
        scores = self.reranker_model.compute_score(pairs)
        # The reranker may hand back a bare number when given a single pair.
        try:
            scores = list(scores)
        except TypeError:
            scores = [scores]

        # Sort positions rather than (score, chunk) tuples so that equal scores
        # never fall back to comparing the chunks themselves.
        order = sorted(range(len(candidates)), key=lambda position: scores[position], reverse=True)
        return [candidates[position] for position in order]

    def get_context(self, query):
        """
        Return the text of the single closest chunk for a query.

        Args:
            query (str): Search query.

        Returns:
            str: Content of the closest chunk, or an empty string when the
                search returns nothing.
        """
        best_chunks = self.get_best_chunks(query, top_k=1)
        if not best_chunks:
            return ""
        return best_chunks[0].page_content

# Build the retriever with its default embedding and reranking models.
retriever = TextRetriever()

# Only the cleaned text of each chunk is embedded and stored.
all_chunks = [record['content'] for record in chunks]
retriever.store_embeddings(all_chunks)


"""# MODELE LLM

## Etape 1 : Generation d'une réponse
"""

def load_llm(model_path):
    """Load a quantised model file through LangChain's LlamaCpp wrapper.

    Args:
        model_path (str): Path of the model file to load.

    Returns:
        LlamaCpp: The configured LLM object.
    """
    llm_settings = dict(
        model_path=model_path,
        stop=["Question"],      # stop sequence handed to the wrapper
        max_tokens=300,
        temperature=0,
        n_ctx=8000,
        n_batch=1024,
        # NOTE(review): -1 presumably offloads every layer to the GPU —
        # confirm against the llama-cpp-python documentation.
        n_gpu_layers=-1,
        logits_all=True,
    )
    return LlamaCpp(**llm_settings)

llm = load_llm("mistral-7b-instruct-v0.2.Q4_K_M.gguf")

## FONCTIONS

# Basic context function.
def get_context_from_query(query):
  """Return the 4 best chunks found for `query` by the module-level `retriever`.

  NOTE(review): the same function is defined again in a later cell of this notebook.
  """
  return retriever.get_best_chunks(query, 4)


# Fonction de generation de texte
def conv_chain(llm):
    """Build an LLMChain that answers one instruction using a retrieved context.

    Args:
        llm: The language model object handed to LLMChain.

    Returns:
        LLMChain: A chain whose prompt expects the `instruction` and `context` variables.
    """
    # The instruction block is closed with "[/INST]". A backslash there would be
    # an invalid Python escape sequence and would not form the closing tag of
    # the Mistral-Instruct prompt format.
    template = """Below is an instruction that describes a task. Write a response that appropriately completes the request using the context provided.

    Human: [INST] {instruction} [/INST]

    Context: {context}

    AI:\n
    """

    prompt = PromptTemplate(
        input_variables=["instruction", 'context'], template=template
    )

    llm_chain = LLMChain(
        llm=llm,
        prompt=prompt,
        verbose=True,
    )

    return llm_chain




# Nécessité d'ajouter un historique pour que la conversation ait un sens

"""## Etape 2 : Sauvegarde d'un historique limité de conversation"""

class ConversationHistoryLoader:
  """Keep the most recent question/answer exchanges and render them as prompt text."""

  def __init__(self, k):
    """
    Args:
      k (int): Maximum number of exchanges kept in the history.
    """
    self.k = k
    # Exchanges are stored newest first: index 0 is the most recent one.
    self.conversation_history = []

  def create_conversation_history_prompt(self):
    """
    Render the stored exchanges as a single string, oldest exchange first.

    Returns:
      str: One "Human: ..." line followed by one "AI: ..." line per exchange,
        each line preceded by a newline; an empty string when there is no
        history (empty list or None).
    """
    if self.conversation_history is None:
      return ''

    # The list is newest-first, so it is walked backwards to get chronological order.
    return ''.join(
        '\nHuman: ' + exchange['Human'] + '\nAI: ' + exchange['AI']
        for exchange in reversed(self.conversation_history)
    )

  def update_conversation_history(self, query, response):
    """
    Record the latest exchange and drop the oldest one once more than `k` are stored.

    Args:
      query (str): The user's last query.
      response (str): The LLM's answer to that query.
    """
    self.conversation_history.insert(0, {'Human': query, 'AI': response})

    if len(self.conversation_history) > self.k:
      # The oldest exchange sits at the end of the list.
      self.conversation_history.pop()


def conv_chain_with_history(llm):
    """Build an LLMChain whose prompt combines context, past conversation and a new instruction.

    Args:
        llm: The language model object handed to LLMChain.

    Returns:
        LLMChain: A chain whose prompt expects the `instruction`, `chat_history`
            and `context` variables.
    """
    # The instruction block is closed with "[/INST]". A backslash there would be
    # an invalid Python escape sequence and would not form the closing tag of
    # the Mistral-Instruct prompt format.
    template = """Below is an instruction that describes a task. Write a response that appropriately completes the request using the context provided and the previous conversation.

    Context: {context}

    {chat_history}

    Human: [INST] {instruction} [/INST]


    AI:\n
    """

    prompt = PromptTemplate(input_variables=["instruction",
                                             'chat_history',
                                             'context'],
                            template=template)

    llm_chain = LLMChain(
        llm=llm,
        prompt=prompt,
        verbose=True,
    )

    return llm_chain


def conv_chain_with_conversation_buffer(llm):
    """Build an LLMChain that keeps its own conversation memory.

    The chain is given a ConversationBufferWindowMemory configured with
    `k=3`, filling the `chat_history` prompt variable and reading the user
    input from `instruction`.

    Args:
        llm: The language model object handed to LLMChain.

    Returns:
        LLMChain: A chain whose prompt expects the `instruction` and `context`
            variables; `chat_history` is supplied by the memory.
    """
    # Imported here because the notebook's import cell does not bring this
    # name into scope.
    from langchain.memory import ConversationBufferWindowMemory

    # The instruction block is closed with "[/INST]". A backslash there would be
    # an invalid Python escape sequence and would not form the closing tag of
    # the Mistral-Instruct prompt format.
    template = """Below is an instruction that describes a task. Write a response that appropriately completes the request using the context provided.

    {chat_history}

    Human: [INST] {instruction} [/INST]

    Context: {context}

    AI:\n
    """

    prompt = PromptTemplate(
        input_variables=["instruction", 'chat_history', 'context'], template=template
    )

    memory = ConversationBufferWindowMemory(memory_key="chat_history", input_key="instruction", k=3)

    llm_chain = LLMChain(
        llm=llm,
        prompt=prompt,
        verbose=True,
        memory=memory
    )

    return llm_chain

# Chain whose prompt takes the context, the conversation history and the new instruction.
chain_with_history = conv_chain_with_history(llm)




Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for re[0m[31m
[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.7/314.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.9/124.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy 3.7.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.
weasel 0.3.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.[0m[31m
[0mCollecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc

Collecting langchain
  Downloading langchain-0.2.3-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: langchain
Successfully installed langchain-0.2.3
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K 

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/113k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 l

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): the download and extraction code earlier in the notebook reads and
# writes under "/content/drive/My Drive" — this cell presumably has to run first; confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def get_context_from_query(query):
  """Return the 4 best chunks found for `query` by the module-level `retriever`.

  NOTE(review): this repeats a definition that already appears earlier in the notebook.
  """
  return retriever.get_best_chunks(query, 4)


In [8]:


import gradio as gr

# Conversation memory used by get_response(); it must exist before the first
# query. It keeps the last 3 exchanges and is shared by every user of the
# interface.
ch = ConversationHistoryLoader(k=3)

def get_response(query):
    """Answer a user query with the retrieved context and the conversation so far.

    Args:
        query (str): The user's question.

    Returns:
        str: The text produced by `chain_with_history`.
    """
    retrieved = get_context_from_query(query)
    # Give the prompt the text of the chunks rather than the printed form of
    # the retrieved objects; plain strings are passed through unchanged.
    context = "\n\n".join(getattr(doc, "page_content", str(doc)) for doc in retrieved)
    res = chain_with_history.predict(instruction=query,
                                     context=context,
                                     chat_history=ch.create_conversation_history_prompt())
    # Record the exchange so the next query sees it in its history.
    ch.update_conversation_history(query, res)
    return res

iface = gr.Interface(fn=get_response, inputs="text", outputs="text", title="Query Interface", description="Enter your query:")
# NOTE(review): share=True publishes the interface on a public URL — confirm this is intended.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0a89278d101db6dbb1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


