In [None]:
#installation des packages nécessaires

!pip install xformer --quiet
!pip install chromadb --quiet
!pip install langchain --quiet
!pip install accelerate --quiet
!pip install transformers --quiet
!pip install bitsandbytes --quiet
!pip install unstructured --quiet
!pip install sentence-transformers --quiet

In [None]:
from langchain.document_loaders import TextLoader  # loader for plain-text files
from langchain.text_splitter import CharacterTextSplitter  # text splitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using Hugging Face models
from langchain.vectorstores import FAISS  # Facebook FAISS vector store
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredPDFLoader  # load PDF files
from langchain.indexes import VectorstoreIndexCreator  # vectorized DB index (with chromadb)
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader  # load URLs into the document loader
from langchain.prompts import PromptTemplate  # prompt template (used below to request answers in French)
from langchain.llms import GPT4All

In [None]:
from textwrap import fill
from IPython.display import Markdown, display

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    )

from langchain import PromptTemplate
from langchain import HuggingFacePipeline

from langchain.vectorstores import Chroma
from langchain.schema import AIMessage, HumanMessage
from langchain.memory import ConversationBufferMemory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredMarkdownLoader, UnstructuredURLLoader
from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA, ConversationalRetrievalChain

from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

import warnings
warnings.filterwarnings('ignore')

In [None]:
import os

In [None]:
!pip install PyPDF2
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# **Data preprocessing**

**Prepare data before vectorizing and feeding it to the Language Model**

In [None]:
# Location of the report on the Kaggle input volume.
PDF_PATH = "/kaggle/input/rapport-economique-data/rapport_economique.pdf"

# Open the PDF; the following cells read its pages through `data_reader`.
data_reader = PdfReader(PDF_PATH)

In [None]:
# Report how many pages the PDF contains.
page_count = len(data_reader.pages)
print(page_count)

# Extract and show the text of the first page as a quick sanity check.
page = data_reader.pages[0]
text = page.extract_text()
print(text)

In [None]:
# Gather the text of every page into a single string.
# Pages are joined with a newline so that the last line of one page is not
# fused with the first line of the next one (the splitter further down splits
# on "\n"). `or ""` is a defensive guard for a page that yields no text.
content = "\n".join((page.extract_text() or "") for page in data_reader.pages)

In [None]:
# Show only the beginning of the extracted text: displaying the whole report
# bloats the notebook output without adding information.
content[:1000]

In [None]:
# Split the extracted content into many small documents so that retrieval can
# hand the language model only the passages relevant to a question.
splitter_settings = {
    "separator": "\n",        # split on line breaks
    "chunk_size": 200,        # maximum chunk length, measured with `length_function`
    "chunk_overlap": 20,      # length shared between consecutive chunks
    "length_function": len,   # lengths are counted in characters
}
text_splitter = CharacterTextSplitter(**splitter_settings)

In [None]:
# Split the full report text into the chunks that will be embedded one by one.
texts = text_splitter.split_text(content)

# Fail early with a clear message: a PDF without extractable text (e.g. a scan)
# produces no chunks, and there would be nothing to index afterwards.
assert texts, "No text chunks were produced - check that the PDF contains extractable text."

# **Indexing**

**Representing the created docs in a vector space using the "thenlper/gte-large" embedding model.**

**Note: you can use any embedding model; new and improved embedding models are regularly published by the data science community and enthusiasts.**

In [None]:
import torch  # local import: only needed here to detect whether a GPU is available

# Embedding model that maps each text chunk to a vector.
# Use the GPU when one is present and fall back to the CPU otherwise, instead
# of failing at load time on a session without CUDA.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": EMBEDDING_DEVICE},
    # Ask the encoder for normalized embeddings.
    encode_kwargs={"normalize_embeddings": True},
)

**Load the language model**

In [None]:
import torch
from time import time
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain import PromptTemplate

In [None]:
# Local Kaggle path of the Mistral 7B Instruct checkpoint.
model_id = '/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1'
time_1 = time()
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE: despite its name, `model_name` holds the loaded model object (it is
# passed as `model=` when the generation pipeline is built), not a string.
model_name = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # NOTE(review): this allows running custom code shipped with the checkpoint — confirm it is needed.
    trust_remote_code=True,
)
# The timed section covers the tokenizer and the model; the pipeline is built separately.
print(f"Tokenizer & model: {round(time() - time_1)} sec.")

In [None]:
# Build the text-generation pipeline around the already-loaded model and tokenizer.
time_1 = time()
query_pipeline = pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    # Sampling settings: low temperature keeps the answers close to deterministic.
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # Budget for *generated* tokens only. `max_length` also counts the prompt,
    # so a retrieval prompt (instructions + context + question) could use up
    # the whole budget and leave no room for the answer.
    max_new_tokens=200,
    # `torch_dtype` / `device_map` are intentionally not passed here: per the
    # transformers docs they are forwarded as `from_pretrained` model kwargs,
    # so they do not apply to a model object that is already loaded (and the
    # former float16 value contradicted the bfloat16 used when loading it).
)
time_2 = time()
print(f"Prepare pipeline: {round(time_2-time_1, 3)} sec.")

In [None]:
# Wrap the local transformers pipeline as a LangChain LLM.
# NOTE(review): `llm_mistral` is not referenced again in the visible cells; the
# QA chain further down is built with `llm_new_mistral` — confirm which is intended.
llm_mistral = HuggingFacePipeline(pipeline=query_pipeline)

In [None]:
#install the vectorial data base.
!pip install faiss-gpu

In [None]:
# Represent the text chunks in the vector space and store them in the vector database (FAISS).
docsearch = FAISS.from_texts(texts, embeddings)

In [None]:
import os
import requests
from getpass import getpass

# Never hardcode the API token in the notebook: reuse the one already present
# in the environment, otherwise ask for it interactively (input is not echoed).
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [None]:
from langchain import HuggingFaceHub

# Hosted Mixtral model, called through the Hugging Face Hub.
# `max_new_tokens` is the documented length parameter for text-generation
# models; the former `max_length` key is not one of that task's parameters, so
# the answer length was presumably left at the service default.
llm_new_mistral = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    model_kwargs={"temperature": 0.1, "max_new_tokens": 512},
)

In [None]:
from langchain.prompts import PromptTemplate

# Prompt for the question-answering chain: `{context}` receives the retrieved
# chunks and `{question}` the user's question.
# The final line used to read "Answer: French", which places the literal word
# "French" at the start of the answer the model has to continue; the language
# requirement is now expressed as part of the label instead.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Answer in french.

{context}

Question: {question}
Answer (in French):
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [None]:
from langchain.chains import RetrievalQA

# The custom prompt is handed to the underlying "stuff" chain.
chain_type_kwargs = {"prompt": PROMPT}

# Retriever view over the FAISS index built earlier.
retriever = docsearch.as_retriever()

# Retrieval-augmented QA chain; the user's question is supplied under the "question" key.
chain = RetrievalQA.from_chain_type(
    llm=llm_new_mistral,
    chain_type="stuff",
    retriever=retriever,
    input_key="question",
    chain_type_kwargs=chain_type_kwargs,
)

In [None]:
# Ask the chain a question (in French) about the report and print its answer.
question = "quel est le taux de croissance prévu dans les économies emergentes en 2022?"
answer = chain.run(question)
print(answer)