In [1]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../src')
sys.path.insert(1, '../prompts')
sys.path.insert(1, '../input')

import hashlib
import json
import os
from pathlib import Path

import chromadb
import ollama
import pandas as pd
from arango import ArangoClient
from arango import DocumentInsertError

## Process PDF Files (optional)

In [50]:
# PdfProcessor is a project-local module; presumably it is resolved through the
# sys.path entries added in the first cell — verify if this import fails.
from PdfProcessor import PdfProcessor
# For text output: the processor is configured to write into ../input/txt and
# is asked for the "text" output format.
processor = PdfProcessor(output_dir="../input/txt")
processor.process_pdfs(output_format="text")

  from .autonotebook import tqdm as notebook_tqdm


# Load results

In [2]:
# Read every pipeline output table in one go; the order of the paths below
# matches the order of the names on the left-hand side.
(
    entities,
    relationships,
    community_reports,
    communities,
    text_units,
    documents,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../benchmark/output/create_final_entities.parquet',
        '../benchmark/output/create_final_relationships.parquet',
        '../benchmark/output/create_final_community_reports.parquet',
        '../benchmark/output/create_final_communities.parquet',
        '../benchmark/output/create_final_text_units.parquet',
        '../benchmark/output/create_final_documents.parquet',
    )
)
# nodes = pd.read_parquet('../ragtest/output/create_final_nodes.parquet')

In [3]:
# Attach each community's report to its structural record, joined on the
# `community` column. Columns present in both frames keep their plain name for
# `communities` and receive a `_reports` suffix for `community_reports`.
# NOTE(review): with how='outer', a row found in only one of the two frames
# carries NaN in the other frame's columns; later cells call float()/int()/list()
# on some of these columns — confirm every community has a report.
communities_merged = communities.merge(
    community_reports,
    how='outer',
    on='community',
    suffixes=('', '_reports'),
)

communities_merged.head(2)

Unnamed: 0,id,human_readable_id,community,parent,level,title,entity_ids,relationship_ids,text_unit_ids,period,...,level_reports,title_reports,summary,full_content,rank,rank_explanation,findings,full_content_json,period_reports,size_reports
0,63c7cb5c-d7d6-44e3-ad30-5bfc94fa5ed7,0,0,-1,0,Community 0,"[db22651f-2b32-4135-b485-c25a7f724304, 12b4db9...","[1ff51675-7d63-411e-956e-13c691786300, 23fa4e5...",[6089c9eecfbecf037f199213619a90f7cbe8351d933f7...,2025-01-30,...,0,Hugging Face and NeurIPS Community,"The community is centered around Hugging Face,...",# Hugging Face and NeurIPS Community\n\nThe co...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'Hugging Face is a key player...,"{\n ""title"": ""Hugging Face and NeurIPS Comm...",2025-01-30,22
1,3c55fc2c-7565-4c65-a802-81e5c69487c0,1,1,-1,0,Community 1,"[5557962d-42e4-46db-a3d0-d77ce972dac3, dc2ad48...","[1ccb53c6-2e14-49a8-ab0b-66f4d8964177, 1f67be3...",[3a7cfdfe2513989b5be63087d2f81ce884640a0f191aa...,2025-01-30,...,0,"AI, CO₂ Emissions, and Nuclear Energy",This community encompasses the interplay betwe...,"# AI, CO₂ Emissions, and Nuclear Energy\n\nThi...",7.5,The impact severity rating is high due to the ...,[{'explanation': 'Artificial Intelligence (AI)...,"{\n ""title"": ""AI, CO₂ Emissions, and Nuclea...",2025-01-30,34


## ArangoDB

In [4]:
# Connection settings. Each value can be overridden through an environment
# variable so that real credentials never have to be written into the notebook;
# the fallbacks reproduce the local development setup used so far.
ARANGO_HOST = os.environ.get("ARANGO_HOST", "http://localhost:8529")
ARANGO_USER = os.environ.get("ARANGO_USER", "root")
ARANGO_PASSWORD = os.environ.get("ARANGO_PASSWORD", "root")
ARANGO_DB_NAME = "benchmark"

# Initialize the ArangoDB client
client = ArangoClient(hosts=ARANGO_HOST)

# Connect to the "_system" database; it is used below to check for and create
# the target database.
sys_db = client.db("_system", username=ARANGO_USER, password=ARANGO_PASSWORD)

# Create the target database if it doesn't exist
if not sys_db.has_database(ARANGO_DB_NAME):
    sys_db.create_database(ARANGO_DB_NAME)

# Connect to the target database
db = client.db(ARANGO_DB_NAME, username=ARANGO_USER, password=ARANGO_PASSWORD)

# Create document collections if they don't exist
for collection_name in ["chunks", "entities", "documents", "communities"]:
    if not db.has_collection(collection_name):
        db.create_collection(collection_name)

# Create edge collections if they don't exist
for edge_collection in ["relationships", "chunk_contains", "community_contains", "document_contains"]:
    if not db.has_collection(edge_collection):
        db.create_collection(edge_collection, edge=True)

In [5]:
# Helper function to create document with custom _key
def create_update_document(collection, doc, id_field='id'):
    """Insert ``doc`` into ``collection``, keyed by one of its own fields.

    The value stored under ``id_field`` is copied into ``doc['_key']`` (the
    passed dictionary itself is modified), then the document is inserted with
    ``overwrite_mode='update'``.

    Args:
        collection: Object exposing ``insert(document, overwrite_mode=...)``.
        doc: Document to store; must contain ``id_field``.
        id_field: Name of the field whose value becomes the ``_key``.

    Returns:
        Whatever ``collection.insert`` returns.

    Raises:
        KeyError: If ``id_field`` is missing from ``doc``.
    """
    key_value = doc[id_field]
    doc['_key'] = key_value
    insert_result = collection.insert(doc, overwrite_mode='update')
    return insert_result

### Insert Documents

In [6]:
# Store one ArangoDB document per source document, keyed by its `id`
documents_collection = db.collection('documents')
document_fields = ('id', 'human_readable_id', 'title', 'text')
for _, row in documents.iterrows():
    # Copy only the fields of interest out of the row
    doc_doc = {field: row[field] for field in document_fields}
    create_update_document(documents_collection, doc_doc)

### Insert Chunks

In [7]:
# Store one ArangoDB document per text unit (chunk), keyed by its `id`
chunks_collection = db.collection('chunks')
for _, row in text_units.iterrows():
    chunk_doc = {field: row[field] for field in ('id', 'human_readable_id', 'text')}
    # Only the first entry of `document_ids` is recorded on the chunk itself
    chunk_doc['document_id'] = row['document_ids'][0]
    chunk_doc['n_tokens'] = row['n_tokens']
    create_update_document(chunks_collection, chunk_doc)

### Insert Entities

In [8]:
# Lookup tables between entity ids and titles, built in a single pass:
#   entity_id_mapping   : id    -> title
#   entity_id_mapping_r : title -> id (a repeated title keeps the last id seen)
# NOTE(review): these are built from the unfiltered `entities` frame, so they
# can contain entities that the insert cell below filters out — confirm that
# is intended for the relationship edges that use the reverse mapping.
entity_id_mapping = {}
entity_id_mapping_r = {}
for _, row in entities.iterrows():
    entity_id_mapping[row['id']] = row['title']
    entity_id_mapping_r[row['title']] = row['id']

In [9]:
# Keep only entities that have both a non-empty title and a non-empty type
has_title = entities['title'].notna() & (entities['title'] != '')
has_type = entities['type'].notna() & (entities['type'] != '')
entities = entities[has_title & has_type]

# Store one ArangoDB document per remaining entity, keyed by its `id`
entities_collection = db.collection('entities')
entity_fields = ('id', 'human_readable_id', 'title', 'type', 'description')
for _, row in entities.iterrows():
    entity_doc = {field: row[field] for field in entity_fields}
    create_update_document(entities_collection, entity_doc)

### Insert Communities

In [10]:
# Store one ArangoDB document per merged community/report row
communities_collection = db.collection('communities')
community_passthrough = ('id', 'human_readable_id', 'community', 'parent', 'level', 'period')
for _, row in communities_merged.iterrows():
    # Structural fields are copied as they are ...
    community_doc = {field: row[field] for field in community_passthrough}
    # ... report fields are converted to plain Python types where needed.
    # The stored title is the report title (`title_reports`), not the
    # community's own `title` column.
    community_doc.update(
        rank=float(row['rank']),
        size=int(row['size']),
        title=row['title_reports'],
        summary=row['summary'],
        full_content=row['full_content'],
        rank_explanation=row['rank_explanation'],
        findings=list(row['findings']),
    )
    create_update_document(communities_collection, community_doc)

# Edges

### Relationships

In [11]:
# Create entity -> entity edges from the relationships table
relationships_collection = db.collection('relationships')
for _, row in relationships.iterrows():
    # `source` / `target` are looked up in the title -> id mapping to obtain
    # the entity keys; an unknown title raises KeyError.
    source_key = entity_id_mapping_r[row['source']]
    target_key = entity_id_mapping_r[row['target']]
    rel_doc = dict(
        _from=f"entities/{source_key}",
        _to=f"entities/{target_key}",
        id=row['id'],
        human_readable_id=row['human_readable_id'],
        type='relationship',
        description=row['description'],
        weight=float(row['weight']),
        combined_degree=int(row['combined_degree']),
    )
    create_update_document(relationships_collection, rel_doc)

### Chunks

In [12]:
def create_hash(edge):
    """Return a SHA-256 hex digest identifying the content of ``edge``.

    The dictionary is serialised to JSON with sorted keys before hashing, so
    two dictionaries with the same content give the same digest regardless of
    key insertion order.

    Args:
        edge: JSON-serialisable dictionary.

    Returns:
        The 64-character hexadecimal digest as a string.
    """
    canonical_json = json.dumps(edge, sort_keys=True)
    return hashlib.sha256(canonical_json.encode()).hexdigest()

In [13]:
# Link every chunk to the entities it contains and to its document(s).
# Chunks without relationship ids are dropped first.
text_units = text_units.dropna(subset=['relationship_ids'])

chunk_contains = db.collection('chunk_contains')
for _, row in text_units.iterrows():
    chunk_ref = f"chunks/{row['id']}"

    # chunk -> entity
    for entity_id in row['entity_ids']:
        edge = dict(
            _from=chunk_ref,
            _to=f"entities/{entity_id}",
            type='contains_entity',
            weight=1.0,
        )
        # The edge id is a hash of its content; it is then used as the key
        edge['id'] = create_hash(edge)
        create_update_document(chunk_contains, edge)

    # chunk -> document
    # NOTE(review): these edges are stored in `chunk_contains`, while the graph
    # cell's edge definition for `chunk_contains` lists only `entities` as
    # target collection — confirm which of the two is intended.
    for doc_id in row['document_ids']:
        edge = dict(
            _from=chunk_ref,
            _to=f"documents/{doc_id}",
            type='part_of_document',
            weight=1.0,
        )
        edge['id'] = create_hash(edge)
        create_update_document(chunk_contains, edge)

### Communities

In [14]:
# Link every community to its member entities and to its chunks

community_contains = db.collection('community_contains')
for _, row in communities_merged.iterrows():
    community_ref = f"communities/{row['id']}"

    # community -> entity
    for entity_id in row['entity_ids']:
        edge = dict(
            _from=community_ref,
            _to=f"entities/{entity_id}",
            type='community_entity',
            weight=1.0,
        )
        # The edge id is a hash of its content; it is then used as the key
        edge['id'] = create_hash(edge)
        create_update_document(community_contains, edge)

    # community -> chunk
    for chunk in row['text_unit_ids']:
        edge = dict(
            _from=community_ref,
            _to=f"chunks/{chunk}",
            type='community_chunk',
            weight=1.0,
        )
        edge['id'] = create_hash(edge)
        create_update_document(community_contains, edge)

### Documents

In [15]:
# Link chunks, entities and relationships to the document they come from.
# Only the first entry of each chunk's `document_ids` is used as the target.
document_contains = db.collection('document_contains')
for _, row in text_units.iterrows():
    document_ref = f"documents/{row['document_ids'][0]}"

    # chunk -> document
    edge = dict(
        _from=f"chunks/{row['id']}",
        _to=document_ref,
        type='part_of_document',
        weight=1.0,
    )
    # The edge id is a hash of its content; it is then used as the key
    edge['id'] = create_hash(edge)
    create_update_document(document_contains, edge)

    # entity -> document
    for entity_id in row['entity_ids']:
        edge = dict(
            _from=f"entities/{entity_id}",
            _to=document_ref,
            type='part_of_document',
            weight=1.0,
        )
        edge['id'] = create_hash(edge)
        create_update_document(document_contains, edge)

    # relationship -> document
    # NOTE(review): the source of these edges is a document of the
    # `relationships` edge collection, which the graph cell's edge definition
    # for `document_contains` does not list — confirm this is intended.
    for relationship_id in row['relationship_ids']:
        edge = dict(
            _from=f"relationships/{relationship_id}",
            _to=document_ref,
            type='part_of_document',
            weight=1.0,
        )
        edge['id'] = create_hash(edge)
        create_update_document(document_contains, edge)

# Create Graph

In [16]:
# Fetch the "benchmark" graph, creating it on first use
graph = db.graph("benchmark") if db.has_graph("benchmark") else db.create_graph("benchmark")

# Edge definitions for the graph, written as
# (edge collection, source vertex collections, target vertex collections).
# NOTE(review): the edge-insert cells also write chunk -> document edges into
# `chunk_contains` and relationship -> document edges into `document_contains`,
# which are not covered by the lists below — confirm which side is intended.
edge_definitions = [
    {
        "edge_collection": edge_name,
        "from_vertex_collections": source_collections,
        "to_vertex_collections": target_collections,
    }
    for edge_name, source_collections, target_collections in (
        # Relationships between entities
        ("relationships", ["entities"], ["entities"]),
        # Chunks pointing at the entities they contain
        ("chunk_contains", ["chunks"], ["entities"]),
        # Communities pointing at their entities and chunks
        ("community_contains", ["communities"], ["entities", "chunks"]),
        # Vertices pointing at the document they belong to
        ("document_contains", ["communities", "chunks", "entities"], ["documents"]),
    )
]

# Replace a definition the graph already has, otherwise create it
for edge_def in edge_definitions:
    if graph.has_edge_definition(edge_def["edge_collection"]):
        apply_definition = graph.replace_edge_definition
    else:
        apply_definition = graph.create_edge_definition
    apply_definition(**edge_def)

## Export Graph

In [17]:
# Query ArangoDB and create Gephi-compatible CSV files
def export_graph_for_gephi(db, output_dir='../gephi'):
    """
    Dump the ArangoDB graph into two CSV files for Gephi.

    Files written into ``output_dir`` (created when missing):
    - nodes_benchmark.csv: one row per document of the entities, chunks,
      documents and communities collections (Id, Label, Category, Type, Weight)
    - edges_benchmark.csv: one row per edge of the relationships,
      chunk_contains and community_contains collections
      (Source, Target, Type, Weight, Category)

    Args:
        db: Database handle exposing ``aql.execute(query)``.
        output_dir: Folder receiving the two CSV files.

    Returns:
        Tuple ``(nodes, edges)`` holding the query results as lists.
    """
    import csv
    from pathlib import Path

    def _dump_rows(csv_path, columns, rows):
        # newline='' leaves line termination to the csv module
        with open(csv_path, 'w', newline='', encoding='utf-8') as handle:
            table = csv.DictWriter(handle, fieldnames=columns)
            table.writeheader()
            table.writerows(rows)

    # Make sure the target folder exists before anything else happens
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Vertices: every vertex collection tagged with a category
    nodes_query = """
    LET entities = (FOR v IN entities RETURN MERGE(v, {category: 'entity'}))
    LET chunks = (FOR v IN chunks RETURN MERGE(v, {category: 'chunk'}))
    LET documents = (FOR v IN documents RETURN MERGE(v, {category: 'document'}))
    LET communities = (FOR v IN communities RETURN MERGE(v, {category: 'community'}))
    
    FOR v IN APPEND(entities, APPEND(chunks, APPEND(documents, communities)))
        RETURN {
            Id: v._key,
            Label: v.title || v.text || v.id,
            Category: v.category,
            Type: v.type || '',
            Weight: v.weight || 1.0
        }
    """

    # Edges: three of the edge collections tagged with a category;
    # Source/Target keep only the key part of `_from` / `_to`
    edges_query = """
    LET relationships = (
        FOR e IN relationships 
        RETURN MERGE(e, {category: 'relationship'})
    )
    LET chunk_contains = (
        FOR e IN chunk_contains 
        RETURN MERGE(e, {category: 'chunk_contains'})
    )
    LET community_contains = (
        FOR e IN community_contains 
        RETURN MERGE(e, {category: 'community_contains'})
    )
    
    FOR e IN APPEND(relationships, APPEND(chunk_contains, community_contains))
        RETURN {
            Source: SPLIT(e._from, '/')[1],
            Target: SPLIT(e._to, '/')[1],
            Type: e.type,
            Weight: e.weight || 1.0,
            Category: e.category
        }
    """

    # Run both queries before writing any file
    nodes = list(db.aql.execute(nodes_query))
    edges = list(db.aql.execute(edges_query))

    _dump_rows(
        f'{output_dir}/nodes_benchmark.csv',
        ['Id', 'Label', 'Category', 'Type', 'Weight'],
        nodes,
    )
    _dump_rows(
        f'{output_dir}/edges_benchmark.csv',
        ['Source', 'Target', 'Type', 'Weight', 'Category'],
        edges,
    )

    print(f"Exported {len(nodes)} nodes and {len(edges)} edges to {output_dir}")
    return nodes, edges

# Run the export against the connected database, writing into the default folder
nodes, edges = export_graph_for_gephi(db, output_dir='../gephi')

Exported 1350 nodes and 3134 edges to ../gephi


# Semantic Embeddings


In [21]:
chroma_client = chromadb.PersistentClient(path="../chroma_db")
COLLECTION_NAME = 'knowledge_graph'

# Always start from an empty collection: drop a previous version when there is
# one, then create the collection anew.
try:
    # Only the existence check is guarded, so that a failure while deleting or
    # creating is reported as such instead of being mistaken for "not found".
    chroma_client.get_collection(name=COLLECTION_NAME)
except Exception:
    # NOTE(review): the exact exception class raised for a missing collection
    # appears to depend on the chromadb version, hence the broad (but no
    # longer bare) handler; KeyboardInterrupt/SystemExit now propagate.
    print('No Collection Found - Create New Collection')
else:
    chroma_client.delete_collection(name=COLLECTION_NAME)
    print('Old Version Deleted - Create New Collection')

collection = chroma_client.create_collection(name=COLLECTION_NAME)

Old Version Deleted - Create New Collection


In [22]:
# One embedding model is used for every kind of text
embedding_model = 'all-minilm:33m'

# Chunk texts
chunks = text_units['text'].to_list()
chunk_embeddings = ollama.embed(model=embedding_model, input=chunks)

# Community report summaries
communities_summary = communities_merged['summary'].to_list()
# communities_full_content = communities_merged['full_content'].tolist()
communities_summary__embeddings = ollama.embed(model=embedding_model, input=communities_summary)
# communities_full_content__embeddings = ollama.embed(model='all-minilm:33m', input=communities_full_content)

# Relationship descriptions
relationships_description = relationships['description'].to_list()
relationships_description__embeddings = ollama.embed(model=embedding_model, input=relationships_description)

In [23]:
# Metadata records, one per embedded text and in the same row order as the
# frames the texts were taken from.
chunk_metadata = [
    {
        "id": row['id'],
        'human_readable_id': row['human_readable_id'],
        'n_tokens': row['n_tokens'],
    }
    for idx, row in text_units.iterrows()
]

communities_metadata = [
    {
        "id": row['id'],
        'title': row['title_reports'],
        'rank': row['rank'],
    }
    for idx, row in communities_merged.iterrows()
]

relationships_metadata = [
    {
        "id": row['id'],
        'source': row['source'],
        'target': row['target'],
    }
    for idx, row in relationships.iterrows()
]

In [24]:
# Each batch is (texts, embedding response, metadata records); the record ids
# double as the ids of the stored items.
embedding_batches = (
    (chunks, chunk_embeddings, chunk_metadata),
    (communities_summary, communities_summary__embeddings, communities_metadata),
    (relationships_description, relationships_description__embeddings, relationships_metadata),
)
for d, e, m in embedding_batches:
    item_ids = [record['id'] for record in m]
    collection.add(
        documents=d,
        embeddings=e.embeddings,
        metadatas=m,
        ids=item_ids,
    )

### Query Semantic

In [25]:
chroma_client = chromadb.PersistentClient(path="../chroma_db")
COLLECTION_NAME = 'knowledge_graph'

try:
    collection = chroma_client.get_collection(name=COLLECTION_NAME)
except Exception:
    # Report the problem, then stop: continuing would run the query against an
    # undefined name or a stale handle left over from an earlier cell.
    print('No Collection Found')
    raise
print(f'Loaded collection ({COLLECTION_NAME})')

# Embed the question with the same model used for the stored texts and fetch
# the closest stored items.
query = "CoRAG framework and the KILT benchmark"
query_embedding = ollama.embed(model='all-minilm:33m', input=query)
results = collection.query(
    query_embeddings=query_embedding.embeddings,
    n_results=15,
)
# A single query was sent, so its hits sit at index 0 of every result list
for result_id, result_text in zip(results['ids'][0], results['documents'][0]):
    print(result_id, result_text)

Loaded collection (knowledge_graph)
52279d41-3e65-4cbf-af78-fca24980fb56 CoRAG is a newly established model that demonstrates state-of-the-art performance on the KILT benchmark, showcasing its effectiveness in handling knowledge-intensive tasks. The evaluation of CoRAG using the KILT benchmark highlights its capabilities across a variety of such tasks, confirming its advanced performance in this domain.
f8ce92bc-9661-414d-b2bf-c6fa567561df KILT-RAG is evaluated on the KILT benchmark, showcasing its performance across various tasks
77a4ce0c-85cc-41a1-a1ad-87afc791cad5 Tim Rocktäschel is one of the authors of the KILT benchmark paper
37237528-964b-429f-a6f6-356576598fb8 James Thorne is one of the authors of the KILT benchmark paper
4c797b87-54b3-4af4-b037-16749d687630 Nicola De Cao is one of the authors of the KILT benchmark paper
a21f4792-6029-479d-8532-2a4821bbf706 The KILT training set is part of the KILT benchmark used for training models
a81d336b-2d27-4d90-bcdf-aa0645a0435b Patrick 