In [1]:
import os
import sys
from openai import OpenAI
from dotenv import load_dotenv

sys.path.append("../src")  # Add the directory containing `src`
from data_loader import DataLoader
from schema_generator import SchemaGenerator
from semantic_annotation import archetype_annotation
from join_discoverer import JoinDiscoverer

# Load environment variables (including OPENAI_API_KEY) from the project's .env file.
# The project root defaults to the parent of the notebook's working directory -- the same
# layout `sys.path.append("../src")` relies on -- instead of one user's absolute path.
# Export SEMLINK_ROOT before launching Jupyter to point at a different checkout.
# NOTE(review): assumes the .env file sits directly in the repository root -- confirm.
load_dotenv(os.path.join(os.environ.get("SEMLINK_ROOT", os.path.abspath("..")), ".env"))

# The API key is read from the environment only; never paste the key into the notebook.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: <0B7EB158-53DC-3403-8A49-22178CAB4612> /Users/matteocastagna/anaconda3/envs/archetype2/lib/python3.10/site-packages/torchvision/image.so
  warn(


<h1>Eurostat</h1>

In [2]:
# Locate the Eurostat demo relative to the repository root instead of hard-coding one
# user's absolute home-directory path. The root defaults to the parent of the notebook's
# working directory (the layout `sys.path.append("../src")` already relies on); export
# SEMLINK_ROOT before launching Jupyter to point somewhere else.
project_root = os.environ.get("SEMLINK_ROOT", os.path.abspath(".."))
eurostat_dir = os.path.join(project_root, "demo", "eurostat")

data_dir = os.path.join(eurostat_dir, "data")
metadata_dir = data_dir  # the *_metadata.json files sit next to the data files
output_dir = os.path.join(eurostat_dir, "output")

# Initialization: every pipeline stage shares the same OpenAI client
data_loader = DataLoader(openai_client=openai_client)
schema_gen = SchemaGenerator(openai_client=openai_client)
join_dis = JoinDiscoverer(openai_client=openai_client)

In [3]:
# Load the data files and produce a description of their columns.
datalake = data_loader.load_and_describe_datalake(
    # Directory with the CSV or TSV files
    data_directory=data_dir,
    # Optional directory with metadata in JSON format
    metadata_directory=metadata_dir,
    # Produce a description of each column with an LLM
    llm=True,
    # How many values to sample from each column
    sample_size=10,
    # Directory where the output JSON is saved
    output_directory=os.path.join(output_dir, "datalake"),
)

print(f"\nProcessed {len(datalake)} files")

[17:41:50] [35mStarting data lake loading and description from /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/data[0m


Processing data files:   0%|          | 0/4 [00:00<?, ?it/s]

[17:41:50] [34mFound metadata for AACT_ALI02_data.csv: AACT_ALI02_metadata.json[0m


Processing data files:  25%|██▌       | 1/4 [00:18<00:55, 18.43s/it]

[17:42:09] [34mFound metadata for AACT_EAA06_data.csv: AACT_EAA06_metadata.json[0m


Processing data files:  50%|█████     | 2/4 [00:32<00:31, 15.86s/it]

[17:42:23] [34mFound metadata for ACF_D_EQ1_data.csv: ACF_D_EQ1_metadata.json[0m


Processing data files:  75%|███████▌  | 3/4 [00:45<00:14, 14.62s/it]

[17:42:36] [34mFound metadata for AACT_ALI01_data.csv: AACT_ALI01_metadata.json[0m


Processing data files: 100%|██████████| 4/4 [00:57<00:00, 14.36s/it]


[17:42:48] [33mUnifying column data...[0m


Unifying descriptions with LLM: 100%|██████████| 12/12 [00:16<00:00,  1.36s/it]

[17:43:04] [32mColumn data unification complete.[0m
[17:43:04] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/datalake[0m
[17:43:04] [32mData lake JSON saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/datalake/data_lake.json[0m
[17:43:04] [32mFinished loading and describing 4 data files.[0m

Processed 4 files





In [4]:
# Generate the LinkML schema for the data lake
schema = schema_gen.generate_linkml_schema(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it saved
    data_lake_list=datalake,
    # Directory where the output YAML is saved
    output_directory=os.path.join(output_dir, "schema"),
)

[17:43:04] [33mGenerating LinkML schema with LLM...[0m
[17:43:18] [32mSuccessfully generated LinkML schema.[0m
[17:43:18] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/schema[0m
[17:43:18] [32mLinkML schema saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/schema/linkml_schema.yaml[0m


In [5]:
# Prune the generated LinkML schema
pruned_schema = schema_gen.prune_schema(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it saved
    data_lake_list=datalake,
    # Either the dict produced by generate_linkml_schema
    # or the path to the YAML file it saved
    yaml_schema=schema,
    # Directory where the output YAML is saved
    output_directory=os.path.join(output_dir, "schema"),
)

[17:43:18] [33mStarting schema pruning...[0m
[17:43:18] [32mLoaded YAML file with 12 class.attribute pairs and 14 descriptions.[0m


Classifying datasets for pruning: 100%|██████████| 4/4 [00:18<00:00,  4.52s/it]

[17:43:36] [34mTotal unique relevant classes identified: 1[0m
[17:43:36] [34mRelevant classes: ['Document'][0m
[17:43:36] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/schema[0m
[17:43:36] [32mPruned LinkML schema saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/schema/linkml_schema_pruned.yaml[0m





In [6]:
# Semantic column annotation with ArcheType
datalake_annotated = archetype_annotation(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it saved
    data_lake_list=datalake,
    # Either the dict produced by generate_linkml_schema/prune_schema
    # or the path to the YAML file it saved
    yaml_schema=pruned_schema,
    # How many samples of each column are passed to ArcheType for the annotation
    sample_size=10,
    # Directory where the output JSON is saved
    output_directory=os.path.join(output_dir, "datalake"),
)

[17:43:36] [35mStarting semantic annotation with Archetype...[0m
[17:43:36] [32mLoaded YAML file with 12 class.attribute pairs and 14 descriptions.[0m
[17:43:36] [34mLoaded YAML schema from for Archetype.[0m


Annotating files:   0%|          | 0/4 [00:00<?, ?it/s]

[17:43:36] [36mProcessing file: AACT_ALI02_data.csv[0m


Annotating files:  25%|██▌       | 1/4 [00:05<00:16,  5.62s/it]

[17:43:41] [36mProcessing file: AACT_EAA06_data.csv[0m


Annotating files:  50%|█████     | 2/4 [00:12<00:12,  6.29s/it]

[17:43:48] [36mProcessing file: ACF_D_EQ1_data.csv[0m


Annotating files:  75%|███████▌  | 3/4 [00:21<00:07,  7.69s/it]

[17:43:58] [36mProcessing file: AACT_ALI01_data.csv[0m


Annotating files: 100%|██████████| 4/4 [00:28<00:00,  7.01s/it]

[17:44:04] [32mFinished semantic annotation for 4 files.[0m
[17:44:04] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/datalake[0m
[17:44:04] [32mAnnotated data lake JSON saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/datalake/data_lake_annotated.json[0m





In [7]:
# Generate one embedding per column
embeddings = join_dis.generate_embeddings(
    # Either the list of dicts produced by load_and_describe_datalake/archetype_annotation
    # or the path to the JSON file it saved
    data_lake_list=datalake_annotated,
    # Directory where the output JSON is saved
    output_directory=os.path.join(output_dir, "embeddings"),
)

[17:44:04] Generating Embeddings: 100%|██████████| 37/37 [00:17<00:00,  2.09it/s]

[17:44:22] [32mFinished generating embeddings for 37 columns.[0m
[17:44:22] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/embeddings[0m
[17:44:22] [32mEmbeddings saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/embeddings/embeddings.json[0m





In [8]:
# Generate Neo4j graph
# The returned tuple is kept under a regular name: unpacking it into `_, _, _` rebinds
# IPython's last-output variable `_`, which IPython stops updating once user code assigns it.
# NOTE(review): the two thresholds are unexplained magic numbers -- presumably they decide
# which column pairs are exported; confirm against JoinDiscoverer and document the choice.
neo4j_export = join_dis.compute_distances_and_export_neo4j(
    # Either the list of dicts produced by generate_embeddings
    # or the path to the JSON file it saved
    embeddings=embeddings,
    cosine_sim_threshold=0.5,
    anns_threshold=0.2,
    # Directory where the output CSVs are saved
    output_directory=os.path.join(output_dir, "neo4j"),
)

[17:44:22] [36mCalculating cosine similarity and euclidean distances...[0m
[17:44:22] [36mCalculating ANNS distances...[0m
[17:44:22] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/neo4j[0m
[17:44:22] [32mDistances saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/neo4j/distances.csv[0m
[17:44:22] [36mGenerating Neo4j nodes CSV...[0m
[17:44:22] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/neo4j[0m
[17:44:22] [32mNeo4j nodes saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/neo4j/nodes.csv[0m
[17:44:22] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/eurostat/output/neo4j[0m
[17:44:22] [32mNeo4j edges saved to: /Users

<h1>PKT</h1>

In [9]:
# Locate the PKT demo relative to the repository root instead of hard-coding one user's
# absolute home-directory path. The root defaults to the parent of the notebook's working
# directory (the layout `sys.path.append("../src")` already relies on); export
# SEMLINK_ROOT before launching Jupyter to point somewhere else.
project_root = os.environ.get("SEMLINK_ROOT", os.path.abspath(".."))
pkt_dir = os.path.join(project_root, "demo", "pkt")

data_dir = os.path.join(pkt_dir, "data")
output_dir = os.path.join(pkt_dir, "output")

# Initialization: every pipeline stage shares the same OpenAI client
data_loader = DataLoader(openai_client=openai_client)
schema_gen = SchemaGenerator(openai_client=openai_client)
join_dis = JoinDiscoverer(openai_client=openai_client)

In [10]:
# Load the data files and produce a description of their columns
# (no metadata directory is passed for this data lake).
datalake = data_loader.load_and_describe_datalake(
    # Directory with the CSV or TSV files
    data_directory=data_dir,
    # Produce a description of each column with an LLM
    llm=True,
    # How many values to sample from each column
    sample_size=10,
    # Directory where the output JSON is saved
    output_directory=os.path.join(output_dir, "datalake"),
)

print(f"\nProcessed {len(datalake)} files")

[17:44:22] [35mStarting data lake loading and description from /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/data[0m


Processing data files: 100%|██████████| 7/7 [00:58<00:00,  8.32s/it]


[17:45:20] [33mUnifying column data...[0m


Unifying descriptions with LLM: 100%|██████████| 16/16 [00:01<00:00,  9.64it/s]

[17:45:22] [32mColumn data unification complete.[0m
[17:45:22] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/datalake[0m
[17:45:22] [32mData lake JSON saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/datalake/data_lake.json[0m
[17:45:22] [32mFinished loading and describing 7 data files.[0m

Processed 7 files





In [11]:
# Generate the LinkML schema for the data lake
schema = schema_gen.generate_linkml_schema(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it saved
    data_lake_list=datalake,
    # Directory where the output YAML is saved
    output_directory=os.path.join(output_dir, "schema"),
)

[17:45:22] [33mGenerating LinkML schema with LLM...[0m
[17:45:55] [32mSuccessfully generated LinkML schema.[0m
[17:45:55] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/schema[0m
[17:45:55] [32mLinkML schema saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/schema/linkml_schema.yaml[0m


In [12]:
# Prune the generated LinkML schema
pruned_schema = schema_gen.prune_schema(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it saved
    data_lake_list=datalake,
    # Either the dict produced by generate_linkml_schema
    # or the path to the YAML file it saved
    yaml_schema=schema,
    # Directory where the output YAML is saved
    output_directory=os.path.join(output_dir, "schema"),
)

[17:45:55] [33mStarting schema pruning...[0m
[17:45:55] [32mLoaded YAML file with 16 class.attribute pairs and 24 descriptions.[0m


Classifying datasets for pruning: 100%|██████████| 7/7 [00:42<00:00,  6.03s/it]

[17:46:37] [34mTotal unique relevant classes identified: 8[0m
[17:46:37] [34mRelevant classes: ['Chemical', 'NamedEntity', 'Cofactor', 'Gene', 'Protein', 'Catalyst', 'Phenotype', 'Document'][0m
[17:46:37] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/schema[0m
[17:46:37] [32mPruned LinkML schema saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/schema/linkml_schema_pruned.yaml[0m





In [13]:
# Semantic column annotation with ArcheType
datalake_annotated = archetype_annotation(
    # Either the list of dicts produced by load_and_describe_datalake
    # or the path to the JSON file it saved
    data_lake_list=datalake,
    # Either the dict produced by generate_linkml_schema/prune_schema
    # or the path to the YAML file it saved
    yaml_schema=pruned_schema,
    # How many samples of each column are passed to ArcheType for the annotation
    sample_size=10,
    # Directory where the output JSON is saved
    output_directory=os.path.join(output_dir, "datalake"),
)

[17:46:37] [35mStarting semantic annotation with Archetype...[0m
[17:46:37] [32mLoaded YAML file with 16 class.attribute pairs and 24 descriptions.[0m
[17:46:37] [34mLoaded YAML schema from for Archetype.[0m


Annotating files:   0%|          | 0/7 [00:00<?, ?it/s]

[17:46:37] [36mProcessing file: ENSEMBL_GENE_ENTREZ_GENE_MAP.tsv[0m


Annotating files:  14%|█▍        | 1/7 [00:05<00:30,  5.16s/it]

[17:46:43] [36mProcessing file: UNIPROT_PROTEIN_COFACTOR.tsv[0m


Annotating files:  29%|██▊       | 2/7 [00:06<00:15,  3.07s/it]

[17:46:44] [36mProcessing file: UNIPROT_PROTEIN_CATALYST.tsv[0m


Annotating files:  43%|████▎     | 3/7 [00:08<00:09,  2.38s/it]

[17:46:46] [36mProcessing file: UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.tsv[0m


Annotating files:  57%|█████▋    | 4/7 [00:10<00:06,  2.21s/it]

[17:46:48] [36mProcessing file: MESH_CHEBI_MAP.tsv[0m


Annotating files:  71%|███████▏  | 5/7 [00:12<00:04,  2.19s/it]

[17:46:50] [36mProcessing file: STRING_PRO_ONTOLOGY_MAP.tsv[0m


Annotating files:  86%|████████▌ | 6/7 [00:14<00:02,  2.00s/it]

[17:46:51] [36mProcessing file: PHENOTYPE_HPO_MAP.tsv[0m


Annotating files: 100%|██████████| 7/7 [00:15<00:00,  2.26s/it]

[17:46:53] [32mFinished semantic annotation for 7 files.[0m
[17:46:53] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/datalake[0m
[17:46:53] [32mAnnotated data lake JSON saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/datalake/data_lake_annotated.json[0m





In [14]:
# Generate one embedding per column
embeddings = join_dis.generate_embeddings(
    # Either the list of dicts produced by load_and_describe_datalake/archetype_annotation
    # or the path to the JSON file it saved
    data_lake_list=datalake_annotated,
    # Directory where the output JSON is saved
    output_directory=os.path.join(output_dir, "embeddings"),
)

[17:46:53] Generating Embeddings: 100%|██████████| 19/19 [00:08<00:00,  2.22it/s]

[17:47:02] [32mFinished generating embeddings for 19 columns.[0m
[17:47:02] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/embeddings[0m
[17:47:02] [32mEmbeddings saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/embeddings/embeddings.json[0m





In [15]:
# Generate Neo4j graph
# The returned tuple is kept under a regular name: unpacking it into `_, _, _` rebinds
# IPython's last-output variable `_`, which IPython stops updating once user code assigns it.
# NOTE(review): the two thresholds are unexplained magic numbers -- presumably they decide
# which column pairs are exported; confirm against JoinDiscoverer and document the choice.
neo4j_export = join_dis.compute_distances_and_export_neo4j(
    # Either the list of dicts produced by generate_embeddings
    # or the path to the JSON file it saved
    embeddings=embeddings,
    cosine_sim_threshold=0.5,
    anns_threshold=0.2,
    # Directory where the output CSVs are saved
    output_directory=os.path.join(output_dir, "neo4j"),
)

[17:47:02] [36mCalculating cosine similarity and euclidean distances...[0m
[17:47:02] [36mCalculating ANNS distances...[0m
[17:47:02] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/neo4j[0m
[17:47:02] [32mDistances saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/neo4j/distances.csv[0m
[17:47:02] [36mGenerating Neo4j nodes CSV...[0m
[17:47:02] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/neo4j[0m
[17:47:02] [32mNeo4j nodes saved to: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/neo4j/nodes.csv[0m
[17:47:02] [32mEnsured directory exists: /Users/matteocastagna/Documents/Università/Assegno di ricerca 2024:2025/SemLink/demo/pkt/output/neo4j[0m
[17:47:02] [32mNeo4j edges saved to: /Users/matteocastagna/Documents