In [None]:
import os
import sys
from openai import OpenAI
from dotenv import load_dotenv

sys.path.append("../src")  # Add the directory containing `src`
from data_loader import DataLoader
from schema_generator import SchemaGenerator
from semantic_annotation import archetype_annotation
from join_discoverer import JoinDiscoverer

# Location of the .env file holding the credentials -- replace with the path to your own .env file.
dotenv_path = "/Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/.env"
load_dotenv(dotenv_path)

# Read the key from the environment (or substitute your actual API key here)
# and build the OpenAI client that the rest of the notebook uses.
openai_api_key = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_api_key)

<h1>Eurostat</h1>

In [None]:
# Eurostat demo folders: input tables, their metadata (same folder as the data), and the output root.
data_dir = "/Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/data"
metadata_dir = "/Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/data"
output_dir = "/Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output"

# Initialization: every pipeline component is constructed with the same OpenAI client.
shared_client = {"openai_client": openai_client}
data_loader = DataLoader(**shared_client)
schema_gen = SchemaGenerator(**shared_client)
join_dis = JoinDiscoverer(**shared_client)

In [None]:
# The JSON produced by this step is saved under <output_dir>/datalake.
datalake_output_dir = os.path.join(output_dir, "datalake")

load_options = dict(
    data_directory=data_dir,               # directory with the CSV or TSV files
    metadata_directory=metadata_dir,       # optional directory with metadata in JSON format
    llm=True,                              # produce a description of each column with an LLM
    sample_size=10,                        # how many values to sample from a column
    output_directory=datalake_output_dir,  # directory to save the output JSON
)
datalake = data_loader.load_and_describe_datalake(**load_options)

print(f"\nProcessed {len(datalake)} files")

In [None]:
# Generate the LinkML schema; the output YAML is saved under <output_dir>/schema.
schema_output_dir = os.path.join(output_dir, "schema")
schema = schema_gen.generate_linkml_schema(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it produced.
    data_lake_list=datalake,
    output_directory=schema_output_dir,
)

In [None]:
# Prune the LinkML schema; the output YAML is saved under <output_dir>/schema.
pruned_schema_output_dir = os.path.join(output_dir, "schema")
pruned_schema = schema_gen.prune_schema(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it produced.
    data_lake_list=datalake,
    # Either the dict produced by generate_linkml_schema
    # or the path to the YAML file it produced.
    yaml_schema=schema,
    output_directory=pruned_schema_output_dir,
)

In [None]:
# Semantic column annotation with ArcheType; the output JSON is saved under <output_dir>/datalake.
annotation_output_dir = os.path.join(output_dir, "datalake")
annotation_options = dict(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it produced.
    data_lake_list=datalake,
    # Either the dict produced by generate_linkml_schema/prune_schema
    # or the path to the YAML file it produced.
    yaml_schema=pruned_schema,
    # How many samples of each column are passed to ArcheType for the annotation.
    sample_size=10,
    output_directory=annotation_output_dir,
)
datalake_annotated = archetype_annotation(**annotation_options)

In [None]:
# Generate embeddings for each column; the output JSON is saved under <output_dir>/embeddings.
embeddings_output_dir = os.path.join(output_dir, "embeddings")
embeddings = join_dis.generate_embeddings(
    # Either the list of dicts produced by load_and_describe_datalake/archetype_annotation
    # or the path to the JSON file it produced.
    data_lake_list=datalake_annotated,
    output_directory=embeddings_output_dir,
)

In [None]:
# Generate the Neo4j graph; the output CSVs are saved under <output_dir>/neo4j.
neo4j_output_dir = os.path.join(output_dir, "neo4j")
neo4j_options = dict(
    # Either the list of dicts produced by generate_embeddings
    # or the path to the JSON file it produced.
    embeddings=embeddings,
    cosine_sim_threshold=0.5,
    anns_threshold=0.2,
    output_directory=neo4j_output_dir,
)
join_dis.compute_distances_and_export_neo4j(**neo4j_options)

<h1>PKT</h1>

In [None]:
# PKT demo folders: input tables and the output root (no metadata directory is set for this dataset).
data_dir = "/Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/data"
output_dir = "/Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output"

# Initialization: every pipeline component is constructed with the same OpenAI client.
shared_client = {"openai_client": openai_client}
data_loader = DataLoader(**shared_client)
schema_gen = SchemaGenerator(**shared_client)
join_dis = JoinDiscoverer(**shared_client)

In [None]:
# The JSON produced by this step is saved under <output_dir>/datalake.
datalake_output_dir = os.path.join(output_dir, "datalake")

load_options = dict(
    data_directory=data_dir,               # directory with the CSV or TSV files
    llm=True,                              # produce a description of each column with an LLM
    sample_size=10,                        # how many values to sample from a column
    output_directory=datalake_output_dir,  # directory to save the output JSON
)
datalake = data_loader.load_and_describe_datalake(**load_options)

print(f"\nProcessed {len(datalake)} files")

In [None]:
# Generate the LinkML schema; the output YAML is saved under <output_dir>/schema.
schema_output_dir = os.path.join(output_dir, "schema")
schema = schema_gen.generate_linkml_schema(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it produced.
    data_lake_list=datalake,
    output_directory=schema_output_dir,
)

In [None]:
# Prune the LinkML schema; the output YAML is saved under <output_dir>/schema.
pruned_schema_output_dir = os.path.join(output_dir, "schema")
pruned_schema = schema_gen.prune_schema(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it produced.
    data_lake_list=datalake,
    # Either the dict produced by generate_linkml_schema
    # or the path to the YAML file it produced.
    yaml_schema=schema,
    output_directory=pruned_schema_output_dir,
)

In [None]:
# Semantic column annotation with ArcheType; the output JSON is saved under <output_dir>/datalake.
annotation_output_dir = os.path.join(output_dir, "datalake")
annotation_options = dict(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it produced.
    data_lake_list=datalake,
    # Either the dict produced by generate_linkml_schema/prune_schema
    # or the path to the YAML file it produced.
    yaml_schema=pruned_schema,
    # How many samples of each column are passed to ArcheType for the annotation.
    sample_size=10,
    output_directory=annotation_output_dir,
)
datalake_annotated = archetype_annotation(**annotation_options)

In [None]:
# Generate embeddings for each column; the output JSON is saved under <output_dir>/embeddings.
embeddings_output_dir = os.path.join(output_dir, "embeddings")
embeddings = join_dis.generate_embeddings(
    # Either the list of dicts produced by load_and_describe_datalake/archetype_annotation
    # or the path to the JSON file it produced.
    data_lake_list=datalake_annotated,
    output_directory=embeddings_output_dir,
)

In [None]:
# Generate the Neo4j graph; the output CSVs are saved under <output_dir>/neo4j.
neo4j_output_dir = os.path.join(output_dir, "neo4j")
neo4j_options = dict(
    # Either the list of dicts produced by generate_embeddings
    # or the path to the JSON file it produced.
    embeddings=embeddings,
    cosine_sim_threshold=0.5,
    anns_threshold=0.2,
    output_directory=neo4j_output_dir,
)
join_dis.compute_distances_and_export_neo4j(**neo4j_options)