In [1]:
import ollama
import chromadb
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import os
import pickle
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class DocumentProcessor:
    """Embed the articles of a TSV file and store them in a Chroma collection.

    NOTE(review): another ``DocumentProcessor`` class is defined further down
    in this same cell and replaces this one once the cell has run — confirm
    which of the two definitions should be kept.
    """

    def __init__(self):
        self.client = chromadb.Client()
        # get_or_create_collection keeps the cell re-runnable; create_collection
        # raises when a collection named "encyclopedia" already exists.
        self.collection = self.client.get_or_create_collection(name="encyclopedia")
        # Nothing in this class assigns a query engine; query() reports that explicitly.
        self.query_engine = None

    def process_tsv(self, file_path, chunk_size=100):
        """Read the TSV in chunks, embed each non-blank article and add it to the collection.

        Args:
            file_path: Path of a tab-separated file providing the columns
                ``id_enccre``, ``volume``, ``numero``, ``head``, ``author``,
                ``domaine_enccre`` and ``content``.
            chunk_size: Number of rows read per pandas chunk.

        Returns:
            A ``(df, collection)`` tuple: the concatenation of every row whose
            content is not blank, and the collection the articles were added to.

        Raises:
            Exception: Errors raised while iterating over the chunks are logged
                and re-raised. An error on a single article is logged and that
                article is skipped.
        """
        logging.info("Début du chargement et traitement du fichier TSV...")
        all_data = []
        # Only used for progress logging: physical line count minus the header line.
        with open(file_path, 'r', encoding='utf-8') as tsv_file:
            total_rows = max(sum(1 for _ in tsv_file) - 1, 0)
        processed_rows = 0

        try:
            for chunk in pd.read_csv(file_path, sep='\t', chunksize=chunk_size):
                # Count raw rows (before filtering) so progress is comparable with total_rows.
                raw_rows = len(chunk)
                logging.info(f"Traitement du chunk {processed_rows}-{processed_rows + raw_rows}")
                # Blank out missing content before the string conversion instead of
                # relying on NaN being rendered as the text 'nan'.
                chunk['content'] = chunk['content'].fillna('').astype(str)
                chunk = chunk[chunk['content'].str.strip() != '']
                all_data.append(chunk)

                for _, row in chunk.iterrows():
                    try:
                        embedding = ollama.embeddings(model="mxbai-embed-large", prompt=row['content'])['embedding']
                        self.collection.add(
                            ids=[str(row['id_enccre'])],
                            embeddings=[embedding],
                            documents=[row['content']],
                            metadatas=[{
                                'volume': row['volume'],
                                'numero': row['numero'],
                                'head': row['head'],
                                'author': row['author'],
                                'domaine_enccre': row['domaine_enccre']
                            }]
                        )
                    except Exception as e:
                        # Best effort: one failing article must not abort the whole import.
                        logging.error(f"Erreur lors du traitement de l'article {row['id_enccre']}: {e}")
                        continue

                processed_rows += raw_rows
                logging.info(f"Progression: {processed_rows}/{total_rows} lignes traitées")

            df = pd.concat(all_data, ignore_index=True)
            logging.info("Traitement du fichier TSV terminé")
            return df, self.collection
        except Exception as e:
            logging.error(f"Une erreur s'est produite lors du traitement du fichier TSV: {e}")
            raise

    def query(self, query: str):
        """Forward ``query`` to ``self.query_engine``.

        Raises:
            ValueError: If no query engine has been attached to this instance.
        """
        # getattr: instances created without __init__ (e.g. unpickled) may lack the attribute.
        engine = getattr(self, 'query_engine', None)
        if engine is None:
            raise ValueError("Le moteur de requête n'a pas été initialisé. Exécutez d'abord process_tsv().")
        return engine.query(query)

class KnowledgeGraph:
    """Similarity graph over articles: one node per article id, edges between close embeddings."""

    def __init__(self):
        self.graph = nx.Graph()
        # Two articles are linked when their cosine similarity is strictly above this value.
        self.edges_threshold = 0.8

    def build_graph(self, df):
        """Add one node per row of ``df`` and link nodes whose embeddings are similar.

        Args:
            df: DataFrame providing at least the columns ``id_enccre`` and
                ``content``; every column of a row is stored as a node attribute.
        """
        print("Construction du graphe de connaissances...")
        # Keyed by node id so that a repeated id keeps exactly one embedding (the
        # latest, matching the attributes add_node leaves on the node). This keeps
        # embedding rows and node ids aligned.
        embedding_by_node = {}
        for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Création des embeddings"):
            node_id = str(row['id_enccre'])
            self.graph.add_node(node_id, **row.to_dict())
            embedding_by_node[node_id] = ollama.embeddings(
                model="mxbai-embed-large", prompt=row['content']
            )['embedding']

        if not embedding_by_node:
            # Nothing to compare; an empty array would not be a valid 2-D input.
            return

        self._add_edges(np.array(list(embedding_by_node.values())), list(embedding_by_node))

    def _add_edges(self, embeddings, node_ids=None):
        """Connect every pair of nodes whose cosine similarity exceeds the threshold.

        Args:
            embeddings: 2-D array-like with one embedding per node.
            node_ids: Node id for each row of ``embeddings``. Defaults to the
                graph's nodes in insertion order.

        Raises:
            ValueError: If the number of node ids differs from the number of embeddings.
        """
        print("Ajout des arêtes au graphe...")
        # Materialise the id list once instead of once per candidate edge.
        if node_ids is None:
            node_ids = list(self.graph.nodes)
        else:
            node_ids = list(node_ids)

        num_nodes = len(node_ids)
        if num_nodes != len(embeddings):
            raise ValueError(
                f"Expected one embedding per node, got {len(embeddings)} embeddings for {num_nodes} nodes"
            )

        similarity_matrix = np.asarray(cosine_similarity(embeddings))

        for i in tqdm(range(num_nodes), desc="Création des arêtes"):
            # Only look right of the diagonal: each unordered pair is visited once.
            upper_row = similarity_matrix[i, i + 1:]
            for offset in np.flatnonzero(upper_row > self.edges_threshold):
                j = i + 1 + int(offset)
                self.graph.add_edge(node_ids[i], node_ids[j], weight=float(similarity_matrix[i, j]))

    def get_related_nodes(self, node_id, depth=1):
        """Return the ids of the nodes reachable from ``node_id`` within ``depth`` hops.

        The start node itself is never part of the result. An id that is not in
        the graph yields an empty set.
        """
        if node_id not in self.graph:
            return set()

        visited = {node_id}
        frontier = {node_id}
        related_nodes = set()

        for _ in range(depth):
            next_frontier = set()
            for node in frontier:
                next_frontier.update(self.graph.neighbors(node))
            # Drop nodes already reached so the walk never loops back to the start.
            next_frontier -= visited
            if not next_frontier:
                break
            visited |= next_frontier
            related_nodes |= next_frontier
            frontier = next_frontier

        return related_nodes

class QueryEngine:
    """Answer a question from the closest stored articles plus their graph neighbours."""

    def __init__(self, collection, knowledge_graph, embedding_model="mxbai-embed-large",
                 generation_model="llama2", n_results=5):
        """
        Args:
            collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
            knowledge_graph: Object exposing ``graph`` and ``get_related_nodes(node_id)``.
            embedding_model: Ollama model used to embed the question.
            generation_model: Ollama model used to write the answer.
            n_results: Number of articles retrieved from the collection.
        """
        self.collection = collection
        self.knowledge_graph = knowledge_graph
        self.embedding_model = embedding_model
        self.generation_model = generation_model
        self.n_results = n_results

    def query(self, query: str):
        """Retrieve context for ``query`` and generate an answer.

        Returns:
            A ``(response_text, ids, documents)`` tuple where ``ids`` and
            ``documents`` are the first result list returned by the collection.
        """
        print("Traitement de la requête...")
        embedding = ollama.embeddings(model=self.embedding_model, prompt=query)['embedding']
        results = self.collection.query(query_embeddings=[embedding], n_results=self.n_results)
        doc_ids = results['ids'][0]
        documents = results['documents'][0]

        graph = self.knowledge_graph.graph
        # Each article goes into the context once, even when several hits share a
        # neighbour or when one hit is itself a neighbour of another hit.
        seen_ids = set()
        context_parts = []
        for doc_id, doc_content in zip(doc_ids, documents):
            # A retrieved id may be unknown to the graph; then it simply has no neighbours.
            if doc_id in graph:
                # sorted(): set order is arbitrary, and the prompt should be reproducible.
                for related_id in sorted(self.knowledge_graph.get_related_nodes(doc_id)):
                    if related_id in seen_ids:
                        continue
                    seen_ids.add(related_id)
                    context_parts.append(f"\n{graph.nodes[related_id]['content']}")
            if doc_id not in seen_ids:
                seen_ids.add(doc_id)
                context_parts.append(f"\n{doc_content}")
        context = "".join(context_parts)

        response = ollama.generate(
            model=self.generation_model,
            prompt=f"Using this context: {context}\nRespond to this query: {query}"
        )

        return response['response'], doc_ids, documents

class GraphRAG:
    """Facade that wires document ingestion, the knowledge graph and the query engine."""

    def __init__(self, data_cache_path='processed_data.pkl', graph_cache_path='knowledge_graph.pkl'):
        """
        Args:
            data_cache_path: Pickle file holding the ``(df, collection)`` pair.
            graph_cache_path: Pickle file holding the knowledge graph.
        """
        self.document_processor = DocumentProcessor()
        self.knowledge_graph = KnowledgeGraph()
        self.query_engine = None
        self.data_cache_path = data_cache_path
        self.graph_cache_path = graph_cache_path

    @staticmethod
    def _load_cache(path):
        """Return the object pickled at ``path``."""
        # SECURITY: pickle.load can execute arbitrary code from the file. Only
        # load cache files that this notebook wrote itself.
        with open(path, 'rb') as cache_file:
            return pickle.load(cache_file)

    @staticmethod
    def _save_cache(obj, path):
        """Pickle ``obj`` to ``path``; return True on success, False otherwise.

        The data is written to a temporary sibling file and moved into place only
        once the dump succeeded, so a failed dump never leaves a truncated cache
        behind for the next run to load.
        """
        tmp_path = f"{path}.tmp"
        try:
            with open(tmp_path, 'wb') as cache_file:
                pickle.dump(obj, cache_file)
            os.replace(tmp_path, path)
            return True
        except Exception as e:
            # The cache is an optimisation: keep the freshly computed results in
            # memory and report the problem instead of losing the whole run.
            logging.warning(f"Could not write cache file {path}: {e}")
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            return False

    def process_tsv(self, file_path):
        """Load or build the article data and the knowledge graph, then create the query engine.

        Args:
            file_path: TSV file handed to the document processor when no data cache exists.
        """
        if os.path.exists(self.data_cache_path):
            print("Chargement des données pré-traitées...")
            # NOTE(review): DocumentProcessor builds its collection from the default
            # chromadb.Client(); confirm that a collection restored from a pickle
            # still gives access to the stored embeddings.
            df, collection = self._load_cache(self.data_cache_path)
        else:
            print("Traitement du fichier TSV...")
            df, collection = self.document_processor.process_tsv(file_path)
            self._save_cache((df, collection), self.data_cache_path)

        if os.path.exists(self.graph_cache_path):
            print("Chargement du graphe de connaissances...")
            self.knowledge_graph = self._load_cache(self.graph_cache_path)
        else:
            print("Construction du graphe de connaissances...")
            self.knowledge_graph.build_graph(df)
            self._save_cache(self.knowledge_graph, self.graph_cache_path)

        self.query_engine = QueryEngine(collection, self.knowledge_graph)

    def query(self, query: str):
        """Answer ``query`` with the query engine built by ``process_tsv``.

        Raises:
            ValueError: If ``process_tsv`` has not been run yet.
        """
        if self.query_engine is None:
            raise ValueError("Le moteur de requête n'a pas été initialisé. Exécutez d'abord process_tsv().")
        return self.query_engine.query(query)

class DocumentProcessor:
    """Embed the articles of a TSV file and store them in a Chroma collection.

    NOTE(review): this definition replaces the ``DocumentProcessor`` class
    declared earlier in the same cell — confirm that the earlier one can go.
    """

    def __init__(self):
        self.client = chromadb.Client()
        # get_or_create_collection keeps the cell re-runnable; create_collection
        # raises when a collection named "encyclopedia" already exists.
        self.collection = self.client.get_or_create_collection(name="encyclopedia")

    def process_tsv(self, file_path, chunk_size=1000):
        """Read the TSV in chunks, embed each non-blank article and add it to the collection.

        Args:
            file_path: Path of a tab-separated file providing the columns
                ``id_enccre``, ``volume``, ``numero``, ``head``, ``author``,
                ``domaine_enccre`` and ``content``.
            chunk_size: Number of rows read per pandas chunk.

        Returns:
            A ``(df, collection)`` tuple: the concatenation of every row whose
            content is not blank, and the collection the articles were added to.
            Rows whose embedding or insertion failed are still part of ``df``.
        """
        print("Chargement et traitement du fichier TSV...")
        kept_chunks = []
        for chunk in pd.read_csv(file_path, sep='\t', chunksize=chunk_size):
            # Blank out missing content before the string conversion instead of
            # relying on NaN being rendered as the text 'nan'.
            chunk['content'] = chunk['content'].fillna('').astype(str)
            chunk = chunk[chunk['content'].str.strip() != '']
            kept_chunks.append(chunk)

            for _, row in tqdm(chunk.iterrows(), total=chunk.shape[0], desc="Traitement des articles"):
                try:
                    embedding = ollama.embeddings(model="mxbai-embed-large", prompt=row['content'])['embedding']
                    self.collection.add(
                        ids=[str(row['id_enccre'])],
                        embeddings=[embedding],
                        documents=[row['content']],
                        metadatas=[{
                            'volume': row['volume'],
                            'numero': row['numero'],
                            'head': row['head'],
                            'author': row['author'],
                            'domaine_enccre': row['domaine_enccre']
                        }]
                    )
                except Exception as e:
                    # Best effort: one failing article must not abort the whole import.
                    print(f"Erreur lors du traitement de l'article {row['id_enccre']}: {e}")
                    continue

        df = pd.concat(kept_chunks, ignore_index=True)
        return df, self.collection


In [2]:
# Initialize the GraphRAG system
graph_rag = GraphRAG()

2024-08-30 00:30:47,293 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [4]:
# Load and process the Encyclopédie TSV file
#graph_rag.process_tsv("data/EDdA_dataframe_withContent.tsv")

Traitement du fichier TSV...
Chargement et traitement du fichier TSV...


Traitement des articles:   0%|          | 0/999 [00:00<?, ?it/s]2024-08-30 00:29:38,748 - INFO - HTTP Request: POST http://192.168.1.49:11434/api/embeddings "HTTP/1.1 200 OK"


: 

In [4]:
# Profile the full TSV processing run; the profiler statistics are written to the file 'process_tsv_stats'.
import cProfile
cProfile.run('graph_rag.process_tsv("data/EDdA_dataframe_withContent.tsv")', 'process_tsv_stats')

Traitement du fichier TSV...
Chargement et traitement du fichier TSV...


Traitement des articles:   0%|          | 0/999 [00:00<?, ?it/s]2024-08-30 00:30:53,436 - INFO - HTTP Request: POST http://192.168.1.49:11434/api/embeddings "HTTP/1.1 200 OK"


: 

In [None]:
# Build the query
query = "Comment Diderot définit le vivant dans l'Encyclopédie ?"

# Run the query
response, related_ids, related_docs = graph_rag.query(query)

# Show the answer
print("\nRéponse à la requête :")
print(response)

# Show the retrieved articles that were used for the answer
print("\nArticles connexes utilisés pour la réponse :")
# `doc_id` rather than `id`: a top-level loop variable stays in the notebook
# namespace and would rebind the builtin id() for every later cell.
for doc_id, doc in zip(related_ids, related_docs):
    print(f"ID: {doc_id}")
    print(f"Contenu: {doc[:200]}...")  # first 200 characters only
    print("-" * 50)