In [31]:
import requests
import pandas as pd

In [32]:
import sys, os
# Put the parent of the current working directory at the front of sys.path so
# that `utils` (imported just below) can be resolved from inside the notebook.
SRC_PATH = os.path.join(os.getcwd(), "..")  # adjust depending on notebook location
# The membership guard keeps this cell idempotent: re-running it does not
# insert the same path a second time.
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
# NOTE(review): the star import hides which names come from utils.config.
# Only DATA_PATH is used in this cell — consider importing it explicitly once
# it is confirmed that no other cell relies on further names from the module.
from utils.config import *

# Sanity checks: show the directory containing DATA_PATH, whether DATA_PATH
# exists, and whether the sample CSV is present inside it.
print("Project root:", os.path.dirname(DATA_PATH))
print("Data folder exists?", os.path.exists(DATA_PATH))
print("Sample file exists?", os.path.exists(os.path.join(DATA_PATH, "abstract_sample_data.csv")))


Project root: c:\Users\Pau\Files\Estudis\UOC\TFM\Graph_Rag
Data folder exists? True
Sample file exists? True


In [33]:
# ----------------------------------------
# Notebook: DBpedia Pipeline Debugging
# Start from load_local_data()
# ----------------------------------------
import pandas as pd
from data_intake import load_local_data
from text_preprocessing import chunk_dataframe
from dbpedia_pipeline import (
    annotate_text_spotlight,
    enrich_annotations,
    clean_parsed_entities
)


In [34]:
# ----------------------------------------
# Step 0 — Load local data
# ----------------------------------------
# load_local_data() is called without arguments and returns a pair, unpacked
# here as a metadata table and a text table (presumably pandas DataFrames,
# given the .head() calls below — confirm in data_intake).
metadata_df, text_df = load_local_data()
# Report the row count and preview the first rows of the metadata table.
print(f"Metadata entries: {len(metadata_df)}")
display(metadata_df.head())

# Same report for the text table.
print(f"Text entries: {len(text_df)}")
display(text_df.head())


2025-10-20 22:03:06,237 | INFO | Loading dataset from c:\Users\Pau\Files\Estudis\UOC\TFM\Graph_Rag\data\abstract_sample_data.csv...
2025-10-20 22:03:06,258 | INFO | Sample affiliations_clean: [{'@_fa': 'true', 'affilname': 'Shanghai University of International Business and Economics', 'affiliation-city': 'Shanghai', 'affiliation-country': 'China'}]
2025-10-20 22:03:06,260 | INFO | Sample links_clean: {}
2025-10-20 22:03:06,261 | INFO | Loaded 23 metadata entries.


Metadata entries: 23


Unnamed: 0,scopus_id,title,creator,publication_name,cover_date,subtype_desc,affiliations,links
0,85195362067,Optimization Study of Higher Education Data Go...,Chen J.,ACM International Conference Proceeding Series,2023-12-22,Conference Paper,"[{'@_fa': 'true', 'affilname': 'Shanghai Unive...",{}
1,85181155120,Educational Missions and Tactics in the EU Art...,Xiong Y.,Advances in Transdisciplinary Engineering,2023-12-15,Conference Paper,"[{'@_fa': 'true', 'affilname': 'Wenzhou Univer...",{}
2,85183323712,The European AI Tango: Balancing Regulation In...,Todorova C.,ACM International Conference Proceeding Series,2023-12-14,Conference Paper,"[{'@_fa': 'true', 'affilname': 'European Softw...",{}
3,85179647045,Medicine’s Lessons for AI Regulation,Stark L.,New England Journal of Medicine,2023-12-14,Article,"[{'@_fa': 'true', 'affilname': 'Vanderbilt Uni...",{}
4,85181147353,Application of AI Technology in Online Platfor...,Zhou Q.,Frontiers in Artificial Intelligence and Appli...,2023-12-12,Conference Paper,"[{'@_fa': 'true', 'affilname': 'Anhui Sanlian ...",{}


Text entries: 23


Unnamed: 0,scopus_id,description,abstract
0,85195362067,Artificial intelligence technology has opened ...,
1,85181155120,"In the era of digital transformation, AI, clou...",
2,85183323712,"In the past few years, the EU has shown a grow...",
3,85179647045,There are various possible futures for regulat...,
4,85181147353,In order to cater to the reform of online plat...,


In [35]:
# ----------------------------------------
# Step 1 — Chunk text
# ----------------------------------------
# Split the "description" column of text_df into chunks, keyed by "scopus_id".
# The chunking rule itself lives in text_preprocessing.chunk_dataframe and is
# not visible here; later cells read the columns scopus_id, chunk_id and
# chunk_text from the result.
chunks_df = chunk_dataframe(text_df, text_col="description", id_col="scopus_id")
# Report how many chunks were produced and preview the first rows.
print(f"Chunks created: {len(chunks_df)}")
display(chunks_df.head())


Chunks created: 41


Unnamed: 0,scopus_id,chunk_id,chunk_text
0,85195362067,0,Artificial intelligence technology has opened ...
1,85195362067,1,seven evaluation indicators and three stages o...
2,85181155120,0,"In the era of digital transformation, AI, clou..."
3,85181155120,1,the EU has adopted various education tactics i...
4,85183323712,0,"In the past few years, the EU has shown a grow..."


In [36]:
# ----------------------------------------
# Step 2 — Annotate chunk text with DBpedia Spotlight
# ----------------------------------------
# Only a sample is annotated to keep the debugging run short.
sample_chunks = chunks_df.head(20)  # testing first 20 chunks

print("\nStep 2 — Annotating chunks with DBpedia Spotlight...")
# Flat list of annotations across all sampled chunks (initialised once so
# that re-running the cell starts from an empty list).
all_annotations = []

# Number the chunks by position rather than by index label, so the printed
# counter stays 1..N even if the frame's index is not a 0-based integer range.
for chunk_number, (row_label, row) in enumerate(sample_chunks.iterrows(), start=1):
    article_id = row['scopus_id']
    chunk_id = row['chunk_id']
    text = row['chunk_text']

    print(f"\nChunk {chunk_number}: {text[:80]}...")
    annotations = annotate_text_spotlight(article_id, chunk_id, text, confidence=0.5)
    print(f"Annotations ({len(annotations)}): {annotations}")
    all_annotations.extend(annotations)  # flatten here to simplify enrichment


Step 2 — Annotating chunks with DBpedia Spotlight...

Chunk 1: Artificial intelligence technology has opened up new horizons for higher educati...
Annotations (10): [(85195362067, 0, 'http://dbpedia.org/resource/Artificial_intelligence', 'Artificial intelligence', 0.9999998141935466), (85195362067, 0, 'http://dbpedia.org/resource/Data_governance', 'data governance', 1.0), (85195362067, 0, 'http://dbpedia.org/resource/Data_dependency', 'data dependency', 1.0), (85195362067, 0, 'http://dbpedia.org/resource/Hinder', 'hinder', 0.9999999184987052), (85195362067, 0, 'http://dbpedia.org/resource/Data_sharing', 'data sharing', 1.0), (85195362067, 0, 'http://dbpedia.org/resource/Privacy', 'privacy', 0.9973295876647092), (85195362067, 0, 'http://dbpedia.org/resource/Data_sharing', 'data sharing', 1.0), (85195362067, 0, 'http://dbpedia.org/resource/Data_governance', 'data governance', 1.0), (85195362067, 0, 'http://dbpedia.org/resource/Artificial_intelligence', 'AI', 0.9999903076865347), (851953

In [37]:
# Preview only the first annotations; dumping the full list floods the cell
# output. Widen the slice (or drop it) when the complete list is needed.
display(all_annotations[:10])

[(85195362067,
  0,
  'http://dbpedia.org/resource/Artificial_intelligence',
  'Artificial intelligence',
  0.9999998141935466),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Data_governance',
  'data governance',
  1.0),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Data_dependency',
  'data dependency',
  1.0),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Hinder',
  'hinder',
  0.9999999184987052),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Data_sharing',
  'data sharing',
  1.0),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Privacy',
  'privacy',
  0.9973295876647092),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Data_sharing',
  'data sharing',
  1.0),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Data_governance',
  'data governance',
  1.0),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Artificial_intelligence',
  'AI',
  0.9999903076865347),
 (85195362067,
  0,
  'http://dbpedia.org/resource/Artificial_intelligence',
  'AI',
  0.9999

In [38]:
# ----------------------------------------
# Step 3 — Enrich annotations with DBpedia JSON
# ----------------------------------------
from dbpedia_pipeline import enrich_annotations

print("\nStep 3 — Enriching annotations with DBpedia entity info...")
# A single call over the flattened annotation list from Step 2.
all_enriched = enrich_annotations(all_annotations)

print(f"Total enriched entities: {len(all_enriched)}")

# Quick look at the first five enriched entities, one summary line each.
for entity in all_enriched[:5]:
    label = entity['label']
    entity_types = entity['types']
    relation_names = list(entity['relations'].keys())
    score = entity['similarity_score']
    print(f"- {label} | Types: {entity_types} | Relations: {relation_names} | Score: {score}")



Step 3 — Enriching annotations with DBpedia entity info...


2025-10-20 22:03:32,526 | ERROR | Failed to fetch entity http://dbpedia.org/resource/United_States: HTTPSConnectionPool(host='dbpedia.org', port=443): Read timed out. (read timeout=10)
2025-10-20 22:03:41,922 | ERROR | Failed to fetch entity http://dbpedia.org/resource/China: 500 Server Error: SPARQL Request Failed for url: https://dbpedia.org/data/China.json


Total enriched entities: 159
- Artificial intelligence | Types: ['http://dbpedia.org/ontology/MusicGenre', 'http://dbpedia.org/ontology/Organisation', 'http://www.w3.org/2002/07/owl#Thing'] | Relations: ['categories'] | Score: 0.9999998141935466
- Data governance | Types: ['http://dbpedia.org/ontology/Place'] | Relations: ['categories'] | Score: 1.0
- Data dependency | Types: ['http://dbpedia.org/class/yago/Person100007846', 'http://dbpedia.org/ontology/ProgrammingLanguage', 'http://dbpedia.org/class/yago/Writer110794014', 'http://dbpedia.org/class/yago/Organism100004475', 'http://dbpedia.org/class/yago/LivingThing100004258', 'http://dbpedia.org/class/yago/Object100002684', 'http://dbpedia.org/class/yago/PhysicalEntity100001930', 'http://dbpedia.org/class/yago/WikicatCompilers', 'http://dbpedia.org/class/yago/YagoLegalActorGeo', 'http://dbpedia.org/class/yago/YagoLegalActor', 'http://dbpedia.org/class/yago/Compiler109946957', 'http://dbpedia.org/class/yago/Whole100003553', 'http://dbpe

In [39]:
# Union of every type URI attached to any enriched entity; entities without a
# "types" key contribute nothing.
all_types_seen = set().union(
    *(entity.get("types", []) for entity in all_enriched)
)

type_count = len(all_types_seen)
print(f"Types seen across dataset ({type_count}):")
print(sorted(all_types_seen))

Types seen across dataset (274):
['http://dbpedia.org/class/yago/Ability105616246', 'http://dbpedia.org/class/yago/Abstraction100002137', 'http://dbpedia.org/class/yago/Act100030358', 'http://dbpedia.org/class/yago/Activity100407535', 'http://dbpedia.org/class/yago/AdministrativeDistrict108491826', 'http://dbpedia.org/class/yago/AdministrativeUnit108077292', 'http://dbpedia.org/class/yago/Administrator109770949', 'http://dbpedia.org/class/yago/Album106591815', 'http://dbpedia.org/class/yago/Algorithm105847438', 'http://dbpedia.org/class/yago/Alliance108293982', 'http://dbpedia.org/class/yago/Application100949134', 'http://dbpedia.org/class/yago/Application106570110', 'http://dbpedia.org/class/yago/Artifact100021939', 'http://dbpedia.org/class/yago/ArtificialLanguage106894544', 'http://dbpedia.org/class/yago/Attribute100024264', 'http://dbpedia.org/class/yago/Bloc108171094', 'http://dbpedia.org/class/yago/Branch108401248', 'http://dbpedia.org/class/yago/BroadcastingStation102903405', 'h

In [40]:
# ----------------------------------------
# Step 4 — Clean & validate parsed entities
# ----------------------------------------
from dbpedia_pipeline import clean_parsed_entities

# The step number in the progress message matches this section's header.
print("\nStep 4 — Cleaning & validating entities...")
# One call over the whole enriched list; the accept/reject rules live in
# dbpedia_pipeline.clean_parsed_entities and are logged by that function.
all_cleaned = clean_parsed_entities(all_enriched)  # single call

print(f"Total cleaned entities: {len(all_cleaned)}")

2025-10-20 22:03:43,966 | INFO | ✅ Accepted: Artificial intelligence (http://dbpedia.org/resource/Artificial_intelligence) | Reason: All heuristics passed
2025-10-20 22:03:43,968 | INFO | ❌ Rejected: Data governance (http://dbpedia.org/resource/Data_governance) | Reason: Types not in whitelist
2025-10-20 22:03:43,970 | INFO | ❌ Rejected: Data dependency (http://dbpedia.org/resource/Data_dependency) | Reason: Types not in whitelist
2025-10-20 22:03:43,972 | INFO | ✅ Accepted: Hinder (http://dbpedia.org/resource/Hinder) | Reason: All heuristics passed
2025-10-20 22:03:43,973 | INFO | ✅ Accepted: Data sharing (http://dbpedia.org/resource/Data_sharing) | Reason: All heuristics passed
2025-10-20 22:03:43,975 | INFO | ✅ Accepted: Privacy (http://dbpedia.org/resource/Privacy) | Reason: All heuristics passed
2025-10-20 22:03:43,976 | INFO | ✅ Accepted: Data sharing (http://dbpedia.org/resource/Data_sharing) | Reason: All heuristics passed
2025-10-20 22:03:43,977 | INFO | ❌ Rejected: Data gover


Step 5 — Cleaning & validating entities...


2025-10-20 22:03:44,092 | INFO | ✅ Accepted: KCMP (http://dbpedia.org/resource/KCMP) | Reason: Acronym accepted
2025-10-20 22:03:44,093 | INFO | ✅ Accepted: Artificial intelligence (http://dbpedia.org/resource/Artificial_intelligence) | Reason: All heuristics passed
2025-10-20 22:03:44,095 | INFO | ✅ Accepted: Artificial intelligence (http://dbpedia.org/resource/Artificial_intelligence) | Reason: All heuristics passed
2025-10-20 22:03:44,098 | INFO | ✅ Accepted: Artificial intelligence (http://dbpedia.org/resource/Artificial_intelligence) | Reason: All heuristics passed
2025-10-20 22:03:44,103 | INFO | ✅ Accepted: European Union (http://dbpedia.org/resource/European_Union) | Reason: All heuristics passed
2025-10-20 22:03:44,105 | INFO | ✅ Accepted: Artificial intelligence (http://dbpedia.org/resource/Artificial_intelligence) | Reason: All heuristics passed
2025-10-20 22:03:44,107 | INFO | ✅ Accepted: Artificial intelligence (http://dbpedia.org/resource/Artificial_intelligence) | Reason

Total cleaned entities: 148


In [41]:
# Preview a few cleaned entities only; each one carries a long abstract, so
# dumping the full list buries the rest of the notebook. Widen the slice when
# a specific entity needs inspecting.
display(all_cleaned[:3])

[{'label': 'Artificial intelligence',
  'abstract': 'Artificial intelligence (AI) is intelligence—perceiving, synthesizing, and infering information—demonstrated by machines, as opposed to intelligence displayed by animals and humans. Example tasks in which this is done include speech recognition, computer vision, translation between (natural) languages, as well as other mappings of inputs. The Oxford English Dictionary of Oxford University Press defines artificial intelligence as: the theory and development of computer systems able to perform tasks that normally require human intelligence, such as visual perception, speech recognition, decision-making, and translation between languages. AI applications include advanced web search engines (e.g., Google), recommendation systems (used by YouTube, Amazon and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Tesla), automated decision-making and competing at the highest level in strategic game systems 

In [42]:
# Tabular view of the cleaned entities (first five rows): pandas builds the
# frame from all_cleaned, and the bare expression is the cell's output.
pd.DataFrame(all_cleaned).head()

Unnamed: 0,label,abstract,types,relations,wikidata_quids,uri,scopus_id,chunk_id,surface_form,similarity_score
0,Artificial intelligence,Artificial intelligence (AI) is intelligence—p...,"[http://dbpedia.org/ontology/MusicGenre, http:...",{'categories': ['http://dbpedia.org/resource/C...,[Q11660],http://dbpedia.org/resource/Artificial_intelli...,85195362067,0,Artificial intelligence,1.0
1,Hinder,Hinder is an American rock band from Oklahoma ...,"[http://dbpedia.org/ontology/Band, http://dbpe...",{'categories': ['http://dbpedia.org/resource/C...,[Q907370],http://dbpedia.org/resource/Hinder,85195362067,0,hinder,1.0
2,Data sharing,Data sharing is the practice of making data us...,[http://dbpedia.org/ontology/Company],{'categories': ['http://dbpedia.org/resource/C...,[Q5227350],http://dbpedia.org/resource/Data_sharing,85195362067,0,data sharing,1.0
3,Privacy,"Privacy (UK: /ˈprɪvəsiː/, US: /ˈpraɪ-/) is the...","[http://dbpedia.org/ontology/Disease, http://d...",{'categories': ['http://dbpedia.org/resource/C...,[Q188728],http://dbpedia.org/resource/Privacy,85195362067,0,privacy,0.99733
4,Data sharing,Data sharing is the practice of making data us...,[http://dbpedia.org/ontology/Company],{'categories': ['http://dbpedia.org/resource/C...,[Q5227350],http://dbpedia.org/resource/Data_sharing,85195362067,0,data sharing,1.0


In [43]:
import requests

def fetch_dbpedia_entity(uri, timeout=10):
    """
    Fetch the JSON description of a single DBpedia resource.

    The resource name is taken from ``uri`` and requested from
    ``https://dbpedia.org/data/<name>.json``. The response is a mapping keyed
    by resource URI; only the entry for the requested resource is returned.

    Args:
        uri: DBpedia resource URI, e.g.
            "http://dbpedia.org/resource/Artificial_intelligence".
        timeout: Seconds to wait for the HTTP request (default 10).

    Returns:
        The inner JSON block for the entity, or None when the URI is unusable,
        the request fails, or the response has no entry for the entity. The
        failure is printed rather than raised, so callers can keep iterating
        over many URIs.
    """
    resource_prefixes = (
        "http://dbpedia.org/resource/",
        "https://dbpedia.org/resource/",
    )
    try:
        if not isinstance(uri, str) or not uri.strip():
            raise ValueError("URI must be a non-empty string")
        cleaned_uri = uri.strip()

        # Strip the known resource prefix instead of keeping only the last
        # path segment: resource names may themselves contain "/" (such as
        # ".../resource/OS/2"), and truncating them fetches a different entity.
        for prefix in resource_prefixes:
            if cleaned_uri.startswith(prefix):
                entity_name = cleaned_uri[len(prefix):]
                break
        else:
            # Unknown prefix: fall back to the last non-empty path segment.
            entity_name = cleaned_uri.rstrip("/").split("/")[-1]

        if not entity_name:
            raise ValueError("URI does not contain an entity name")

        url = f"https://dbpedia.org/data/{entity_name}.json"
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()

        data = resp.json()
        # The payload is looked up under the canonical http:// resource URI.
        entity_key = f"http://dbpedia.org/resource/{entity_name}"
        if entity_key not in data:
            raise KeyError(f"No data found for {entity_name}")
        return data[entity_key]
    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(f"Error fetching {uri}: {e}")
        return None