In [10]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import os
import sys

# Document Loading

In [11]:
# === 1. Build one loader per PDF found in the "dataset/" folder ===
pdf_dir = "dataset"
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the loaders (and of everything derived from them) stable between runs.
# The extension test is case-insensitive so files named "*.PDF" are kept too.
loaders = [
    PyPDFLoader(os.path.join(pdf_dir, filename))
    for filename in sorted(os.listdir(pdf_dir))
    if filename.lower().endswith(".pdf")
]

In [12]:
# Run every loader and flatten the results into a single list of documents
docs = [document for pdf_loader in loaders for document in pdf_loader.load()]

In [13]:
# === 2. Split the loaded documents into chunks ===
# Splitter settings are grouped here so they are easy to spot and tune.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
split_docs = text_splitter.split_documents(docs)

In [14]:
# === 3. Create the embeddings and the FAISS index ===
embedding_model = "sentence-transformers/all-MiniLM-l6-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs=dict(device="cpu"),                 # embedding model is placed on the CPU
    encode_kwargs=dict(normalize_embeddings=False),  # vectors are stored un-normalized
)
# Embed every chunk and index the resulting vectors.
db = FAISS.from_documents(documents=split_docs, embedding=embeddings)

In [None]:
# === 4. Load the generative LLM (Seq2Seq type) ===
llm_model_name = "google/flan-t5-base"  # can be swapped for another generative model
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

# Generation options handed to the pipeline.
generation_settings = {
    "max_length": 512,
    "do_sample": True,          # enables sampling
    "temperature": 0.7,         # taken into account now that sampling is on
    "repetition_penalty": 1.1,  # bonus
}

text_generation_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# Wrap the transformers pipeline so the LangChain chain can call it.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

Device set to use cuda:0


In [16]:
# === 5. Set up the RAG pipeline with RetrievalQA ===
# The retriever is configured with k=4.
retriever = db.as_retriever(search_kwargs=dict(k=4))

# chain_type="stuff" puts all retrieved documents into the prompt.
# return_source_documents / verbose are debugging aids; a quieter variant of
# this chain simply omits both options.
qa_chain_options = {
    "llm": llm,
    "chain_type": "stuff",
    "retriever": retriever,
    "return_source_documents": True,  # useful for debugging
    "verbose": True,                  # enables the chain's verbose logging
}
qa_chain = RetrievalQA.from_chain_type(**qa_chain_options)



In [17]:
# === 6. Question put to the RAG + LLM pipeline ===
question = "What is the name of the game?"

def err_remove(er):
    """Extract the text enclosed between dashed separator lines of an error.

    Args:
        er: An exception or any object; it is converted with ``str()``.

    Returns:
        The stripped text between the first and the last ``------------``
        separator. When the message does not contain two non-overlapping
        separators, the whole stripped message is returned instead, so the
        error text is never mangled or silently dropped.
    """
    lin = "------------"
    message = str(er)
    first_marker = message.find(lin)
    last_marker = message.rfind(lin)
    start_index = first_marker + len(lin)
    # No separator at all, or only one (possibly overlapping) separator:
    # slicing would yield garbage or an empty string, so fall back to the
    # complete message.
    if first_marker == -1 or last_marker < start_index:
        return message.strip()
    return message[start_index:last_marker].strip()

# Run the chain; on failure, show the part of the error message that
# err_remove() extracts instead of a full traceback.
try:
    result = qa_chain.invoke({"query": question})
except Exception as error:
    # Only ordinary errors are handled here: a bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit and make the cell impossible
    # to interrupt cleanly.
    answer = err_remove(error)
    print(answer)
else:
    print("\nRéponse générée :\n", result["result"])

Token indices sequence length is longer than the specified maximum sequence length for this model (659 > 512). Running this sequence through the model will result in indexing errors




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Réponse générée :
 WORLD DOMINATION RISK


In [18]:
# Show what the retriever returns for the question (debugging aid).
# A dedicated name is used so the corpus held in `docs` is not overwritten:
# re-running the splitting cell afterwards would otherwise split only the
# retrieved documents instead of the loaded PDFs.
# `invoke` is the Runnable entry point that replaces the deprecated
# `get_relevant_documents`.
retrieved_docs = retriever.invoke(question)
for position, doc in enumerate(retrieved_docs, start=1):
    print(f"\n--- Document {position} ---\n{doc.page_content}")



--- Document 1 ---
tournament, write to us at the address below.
We will be happy to answer questions about this game. Write: Consumer
Relations Department, Parker Brothers, P.O. Box 1012, Beverly, MA 01915.
“F!HPARKERBROTHERS
00044-I 
Rl
16

--- Document 2 ---
WORLD DOMINATION RISK®
OBJECT OF THE GAME
To conquer the world by occupying every territory on the board, thus
eliminating all your opponents.
SETUP
Unlike most games, RISK demands careful planning before you actually
start to play. This Initial Army Placement sets the stage for the battles you ’ll
fight later on.
INITIAL ARMY PLACEMENT  consists of these steps:
1.
2.
3.
4.
Select a color and, depending on the number of players, count out the
“ armies” you ’ll need to start the game.
If 2 are playing, see instructions on page 11.
If 3 are playing, each player counts out 35 Infantry.
If 4 are playing, each player counts out 30 Infantry.
If 5 are playing, each player counts out 25 Infantry.
If 6 are playing, each player counts ou