In [None]:
import os
import sys
import numpy as np
import faiss
from openai import OpenAI
from PyPDF2 import PdfReader
from uuid import uuid4

: 

: 

: 

: 

# Document Loading

In [None]:
# === 1. Load and parse the PDF files ===
def load_pdfs(directory):
    """Read every PDF file directly inside ``directory`` into a text record.

    Args:
        directory: Path of the folder to scan. Only its direct entries are
            considered (no recursion); entries whose name does not end in
            ``.pdf`` (case-insensitive) are ignored.

    Returns:
        A list with one dict per PDF, in sorted filename order:
        ``{"id": <random UUID4 string>, "text": <extracted text>}``.
        ``text`` is the newline-joined text of all pages that yielded any
        text; a PDF with no extractable text still produces a record with
        an empty ``text``.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document (and therefore chunk/index) order reproducible across runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(directory, filename))
        # Extract each page exactly once: text extraction is the expensive
        # step, so the result is reused for both the filter and the join.
        page_texts = (page.extract_text() for page in reader.pages)
        text = "\n".join(page_text for page_text in page_texts if page_text)
        documents.append({"id": str(uuid4()), "text": text})
    return documents

In [None]:
# === 2. Split the documents (simple) ===
def split_text(text, chunk_size=1000, overlap=150):
    """Cut ``text`` into fixed-size, overlapping character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window shares its last ``overlap`` characters with the next one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order. Windows that contain only
        whitespace are dropped. An empty ``text`` yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a zero step crashes ``range`` with an unrelated
            message and a negative step silently produces no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
        # Once a window reaches the end of the text, every later window would
        # be a pure suffix of it, i.e. duplicate content; stop here.
        if start + chunk_size >= len(text):
            break
    return chunks

In [None]:
# === 3. Generate the embeddings through the OpenAI API ===
def get_embeddings(texts, client, embedding_model_name, batch_size=100):
    """Embed ``texts`` in batches and return the vectors as one matrix.

    Args:
        texts: Iterable of strings to embed.
        client: Object exposing ``client.embeddings.create(model=..., input=...)``
            whose response has a ``data`` list of items with an ``embedding``
            attribute.
        embedding_model_name: Model name forwarded to the embeddings call.
        batch_size: Maximum number of texts sent per request (default 100,
            the value previously hard-coded here).

    Returns:
        A 2-D ``float32`` NumPy array with one row per input text, in the
        order the embeddings were returned.
        NOTE(review): rows are matched to texts purely by position, so this
        assumes the API returns embeddings in input order — confirm.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # Fail early with an explicit message instead of the opaque error
        # raised when stacking an empty list of arrays.
        raise ValueError("get_embeddings received an empty list of texts")

    rows = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model=embedding_model_name,
            input=texts[start:start + batch_size],
        )
        rows.extend(item.embedding for item in response.data)
    return np.asarray(rows, dtype=np.float32)

In [None]:
# === 4. Index with FAISS ===
def build_faiss_index(chunks, client, embedding_model_name):
    """Embed the chunk texts and load them into a flat L2 FAISS index.

    Args:
        chunks: Sequence of dicts, each providing its content under ``"text"``.
        client: Client object handed through to ``get_embeddings``.
        embedding_model_name: Embedding model name handed through to
            ``get_embeddings``.

    Returns:
        A tuple ``(index, vectors, texts)``: the populated
        ``faiss.IndexFlatL2``, the embedding matrix that was added to it, and
        the list of chunk texts in the same order as the matrix rows.
    """
    texts = [item["text"] for item in chunks]
    vectors = get_embeddings(texts, client, embedding_model_name)

    # The index dimensionality is taken from the width of the embedding matrix.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, vectors, texts


Device set to use cuda:0


In [None]:
# === 5. Retrieve the relevant documents ===
def retrieve(query, index, texts, client, embedding_model_name, top_k=4):
    """Return the texts whose vectors are closest to the embedded ``query``.

    Args:
        query: Question string to embed and search for.
        index: Search index exposing ``search(query_vectors, k)`` and
            returning ``(distances, indices)``.
        texts: Sequence of texts; position ``i`` corresponds to vector ``i``
            in ``index``.
        client: Object exposing ``client.embeddings.create(model=..., input=...)``.
        embedding_model_name: Model name forwarded to the embeddings call.
        top_k: Maximum number of neighbours to request.

    Returns:
        A list of at most ``top_k`` texts, in the order the index returned
        them. Fewer are returned when the index holds fewer than ``top_k``
        vectors.
    """
    query_embedding = client.embeddings.create(
        model=embedding_model_name,
        input=[query]
    ).data[0].embedding
    # The index is searched with a batch of one query: shape (1, dim).
    query_vector = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_vector, top_k)
    # FAISS fills missing neighbours with the label -1 when it cannot find
    # top_k results. Indexing texts with -1 would silently return the LAST
    # text, so those placeholder labels are dropped.
    return [texts[int(i)] for i in indices[0] if i >= 0]

In [None]:
# === 6. Call the LLM ===
def ask_llm(question, context, client, llm_model_name):
    """Ask the chat model to answer ``question`` using ``context``.

    Args:
        question: The user question, inserted at the end of the prompt.
        context: Text inserted into the prompt under ``Context:``.
        client: Object exposing ``client.chat.completions.create(...)``.
        llm_model_name: Model name forwarded to the chat completion call.

    Returns:
        The content of the first returned choice, with leading and trailing
        whitespace stripped.
    """
    user_message = f"""Answer the following question based on the provided context.\n\nContext:\n{context}\n\nQuestion: {question}"""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model=llm_model_name,
        messages=conversation,
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

Token indices sequence length is longer than the specified maximum sequence length for this model (659 > 512). Running this sequence through the model will result in indexing errors




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Réponse générée :
 World domination RISK


# Main

In [None]:
# === 0. Initialize the OpenAI client ===
# NOTE(review): OpenAI() is built without arguments, so credentials are
# presumably taken from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

# Model names passed to every embedding / chat call made further down.
embedding_model_name = "text-embedding-3-small"   # Your embedding model
llm_model_name = "gpt-4.1"                        # Your LLM model

In [None]:
# Folder scanned for PDF files; a relative path, so it is resolved against
# the notebook's current working directory.
pdf_dir = "dataset"
documents = load_pdfs(pdf_dir)

# Split into chunks
chunks = []
for doc in documents:
    # Every chunk keeps the id of the document it was cut from.
    for chunk in split_text(doc["text"]):
        chunks.append({"id": doc["id"], "text": chunk})


In [None]:
# FAISS index
# Embeds every chunk text and builds the index; `texts` is kept alongside the
# index so that search results (row numbers) can be mapped back to chunk text.
index, vectors, texts = build_faiss_index(chunks, client, embedding_model_name)

In [None]:
# Question
question = "What is the name of the game?"
# Look up the chunks the index returns for this question (up to top_k=4).
top_chunks = retrieve(question, index, texts, client, embedding_model_name, top_k=4)

In [None]:
# Generate the answer
# The retrieved chunks are joined with a blank line between them and passed
# to the LLM as the context for the question.
context = "\n\n".join(top_chunks)
answer = ask_llm(question, context, client, llm_model_name)




--- Document 1 ---
tournament, write to us at the address below.
We will be happy to answer questions about this game. Write: Consumer
Relations Department, Parker Brothers, P.O. Box 1012, Beverly, MA 01915.
“F!HPARKERBROTHERS
00044-I 
Rl
16

--- Document 2 ---
WORLD DOMINATION RISK®
OBJECT OF THE GAME
To conquer the world by occupying every territory on the board, thus
eliminating all your opponents.
SETUP
Unlike most games, RISK demands careful planning before you actually
start to play. This Initial Army Placement sets the stage for the battles you ’ll
fight later on.
INITIAL ARMY PLACEMENT  consists of these steps:
1.
2.
3.
4.
Select a color and, depending on the number of players, count out the
“ armies” you ’ll need to start the game.
If 2 are playing, see instructions on page 11.
If 3 are playing, each player counts out 35 Infantry.
If 4 are playing, each player counts out 30 Infantry.
If 5 are playing, each player counts out 25 Infantry.
If 6 are playing, each player counts ou

In [None]:
# Print the question followed by the generated answer.
print("\n📌 Question :", question)
print("\n🧠 Réponse générée :\n", answer)

# Show the passages used
print("\n--- Passages utilisés ---")
for i, passage in enumerate(top_chunks):
    # Passages are numbered from 1; only the first 300 characters are shown
    # and "..." is appended regardless of whether the passage was truncated.
    print(f"\n[{i+1}] {passage[:300]}...")