In [148]:
import re
from difflib import SequenceMatcher
from pathlib import Path
import json
import os
from collections import OrderedDict
import re
from collections import defaultdict

## Taking the names of all the databases

In [149]:
# Define the folder path
folder_path = '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases'

# Get and sort the list of file names in the folder
file_names = list(sorted(os.listdir(folder_path)))

file_names.remove('.DS_Store')

# Output the list of file names
file_names

['ADVENTUREWORKS',
 'AIRLINES',
 'AMAZON_VENDOR_ANALYTICS__SAMPLE_DATASET',
 'AUSTIN',
 'BANK_SALES_TRADING',
 'BASEBALL',
 'BBC',
 'BLS',
 'BOWLINGLEAGUE',
 'BRAZE_USER_EVENT_DEMO_DATASET',
 'BRAZILIAN_E_COMMERCE',
 'CALIFORNIA_TRAFFIC_COLLISION',
 'CENSUS_BUREAU_ACS_1',
 'CENSUS_BUREAU_ACS_2',
 'CENSUS_BUREAU_INTERNATIONAL',
 'CENSUS_BUREAU_USA',
 'CENSUS_GALAXY__AIML_MODEL_DATA_ENRICHMENT_SAMPLE',
 'CENSUS_GALAXY__ZIP_CODE_TO_BLOCK_GROUP_SAMPLE',
 'CHICAGO',
 'CHINOOK',
 'CITY_LEGISLATION',
 'CMS_DATA',
 'COMPLEX_ORACLE',
 'COVID19_JHU_WORLD_BANK',
 'COVID19_NYT',
 'COVID19_OPEN_DATA',
 'COVID19_OPEN_WORLD_BANK',
 'COVID19_SYMPTOM_SEARCH',
 'COVID19_USA',
 'CPTAC_PDC',
 'CRYPTO',
 'CYMBAL_INVESTMENTS',
 'DB_IMDB',
 'DEATH',
 'DELIVERY_CENTER',
 'DEPS_DEV_V1',
 'DIMENSIONS_AI_COVID19',
 'EBI_CHEMBL',
 'ECLIPSE_MEGAMOVIE',
 'ECOMMERCE',
 'EDUCATION_BUSINESS',
 'ELECTRONIC_SALES',
 'ENTERTAINMENTAGENCY',
 'EPA_HISTORICAL_AIR_QUALITY',
 'ETHEREUM_BLOCKCHAIN',
 'EU_SOCCER',
 'E_COMMERCE'

In [150]:
# Assuming file_names is the list of file names obtained earlier
print(f"Number of items in file_names: {len(file_names)}")

Number of items in file_names: 152


## Taking all json files for all the database folders

In [None]:
# Define the folder path
folder_path = '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases'

# Function to build a nested dictionary representing the folder structure
def build_file_structure_dict(folder_path, allowed_extensions=None):
    file_structure = {}

    for root, _, files in os.walk(folder_path):
        relative_path = os.path.relpath(root, folder_path)
        parts = relative_path.split(os.sep)
        current_level = file_structure
        for part in parts:
            if part == '.':
                continue
            if part not in current_level:
                current_level[part] = {}
            current_level = current_level[part]

        for file_name in files:
            if allowed_extensions:
                if not any(file_name.endswith(ext) for ext in allowed_extensions):
                    continue
            current_level[file_name] = os.path.join(root, file_name)

    # Recursively sort all dictionaries
    def sort_dict(d):
        return OrderedDict(
            sorted(
                ((k, sort_dict(v)) if isinstance(v, dict) else (k, v)) 
                for k, v in d.items()
            )
        )

    return sort_dict(file_structure)

# Build the dictionary
file_structure_dict = build_file_structure_dict(folder_path, allowed_extensions=['.json'])

# Compare with list of database folders (alphabetically sorted)
file_names = sorted([
    name for name in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, name)) and name != '.DS_Store'
])

print(f'The file structure of the dictionary contains all the databases: {set(file_structure_dict.keys()) == set(file_names)}')

with open("0_filesystem_structure.json", "w") as f:
    json.dump(file_structure_dict, f, indent=2)


The file structure of the dictionary contains all the databases: True


In [152]:
# Taking a look at the file structure
file_structure_dict

OrderedDict([('ADVENTUREWORKS',
              OrderedDict([('ADVENTUREWORKS',
                            OrderedDict([('COUNTRYREGIONCURRENCY.json',
                                          '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/ADVENTUREWORKS/ADVENTUREWORKS/COUNTRYREGIONCURRENCY.json'),
                                         ('CURRENCYRATE.json',
                                          '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/ADVENTUREWORKS/ADVENTUREWORKS/CURRENCYRATE.json'),
                                         ('PRODUCT.json',
                                          '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/ADVENTUREWORKS/ADVENTUREWORKS/PRODUCT.json'),
                                         ('PRODUCTCATEGORY.json',
                                          '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/AD

## Transforming the dictionary

In [153]:
def flatten_structure(nested_dict):
    flat_dict = {}
    for db_name, tables in nested_dict.items():
        if isinstance(tables, dict):
            for table_name, files in tables.items():
                if isinstance(files, dict):
                    key = f"{db_name}/{table_name}"
                    # Just get the file names without paths
                    flat_dict[key] = [
                        os.path.basename(file_path) for file_path in files.values()
                    ]
    return flat_dict

flat_file_structure = flatten_structure(file_structure_dict)

flat_file_structure

{'ADVENTUREWORKS/ADVENTUREWORKS': ['COUNTRYREGIONCURRENCY.json',
  'CURRENCYRATE.json',
  'PRODUCT.json',
  'PRODUCTCATEGORY.json',
  'PRODUCTDESCRIPTION.json',
  'PRODUCTMODELPRODUCTDESCRIPTIONCULTURE.json',
  'PRODUCTREVIEW.json',
  'PRODUCTSUBCATEGORY.json',
  'SALESORDERDETAIL.json',
  'SALESORDERHEADER.json',
  'SALESPERSON.json',
  'SALESPERSONQUOTAHISTORY.json',
  'SALESTERRITORY.json'],
 'AIRLINES/AIRLINES': ['AIRCRAFTS_DATA.json',
  'AIRPORTS_DATA.json',
  'BOARDING_PASSES.json',
  'BOOKINGS.json',
  'FLIGHTS.json',
  'SEATS.json',
  'TICKETS.json',
  'TICKET_FLIGHTS.json'],
 'AMAZON_VENDOR_ANALYTICS__SAMPLE_DATASET/PUBLIC': ['ADS_SPONSORED_BRANDS_CAMPAIGN_VC.json',
  'ADS_SPONSORED_BRANDS_KEYWORD_VC.json',
  'ADS_SPONSORED_BRANDS_SEARCH_TERM_VC.json',
  'ADS_SPONSORED_BRANDS_VIDEO_CAMPAIGN_VC.json',
  'ADS_SPONSORED_BRANDS_VIDEO_KEYWORD_VC.json',
  'ADS_SPONSORED_BRANDS_VIDEO_SEARCH_TERM_VC.json',
  'ADS_SPONSORED_DISPLAY_CAMPAIGN_VC.json',
  'ADS_SPONSORED_DISPLAY_TARGETING_

## Checking which json files could be the same

This is to reduce the amount of checks we have to do. We check the similarity of the names based on removing either 4 or more digits or 1 digit

In [154]:
# Function to group JSON files based on similarity (considering prefixes like PLACES_)
def group_json_files_by_custom_similarity(json_files_dict):
    similar_files = defaultdict(list)
    unique_files = {}

    for path, files in json_files_dict.items():
        # Group files by their "base name" (name after removing variable parts)
        base_name_to_files = defaultdict(list)
        for file_name in files:
            # Handle specific prefixes like "PLACES_"
            if file_name.startswith("PLACES_"):
                base_name = "PLACES_"
            elif file_name.startswith("CENSUS_TRACTS_"):
                base_name = "CENSUS_TRACTS"
            else:
                # Remove sequences of digits (1 digit or more than 4 digits)
                base_name = re.sub(r'\d{4,}', '', file_name)  # Remove sequences of 4 or more digits
                base_name = re.sub(r'\d', '', base_name)      # Remove single digits
                
                # Normalize underscores or other separators
                base_name = re.sub(r'_+', '_', base_name).strip('_')  # Remove extra underscores
            
            base_name_to_files[base_name].append(file_name)

        # Separate similar and unique files
        for base_name, grouped_files in base_name_to_files.items():
            if len(grouped_files) > 1:  # If more than one file has the same base name
                similar_files[path].append(grouped_files)
            else:  # If only one file has this base name
                if path not in unique_files:
                    unique_files[path] = []
                unique_files[path].extend(grouped_files)

    return dict(similar_files), unique_files

# Group JSON files by similarity
similar_files_dict, unique_files_dict = group_json_files_by_custom_similarity(flat_file_structure)

# Print the results
print("Similar Files:")
for path, groups in similar_files_dict.items():
    print(f"Path: {path}")
    for group in groups:
        print(f"  Group: {group}")

print("\nUnique Files:")
for path, files in unique_files_dict.items():
    print(f"Path: {path}")
    for file in files:
        print(f"  File: {file}")

Similar Files:
Path: BLS/GEO_US_BOUNDARIES
  Group: ['CONGRESS_DISTRICT_115.json', 'CONGRESS_DISTRICT_116.json']
Path: CENSUS_BUREAU_ACS_1/CENSUS_BUREAU_ACS
  Group: ['BLOCKGROUP_2010_5YR.json', 'BLOCKGROUP_2011_5YR.json', 'BLOCKGROUP_2012_5YR.json', 'BLOCKGROUP_2013_5YR.json', 'BLOCKGROUP_2014_5YR.json', 'BLOCKGROUP_2015_5YR.json', 'BLOCKGROUP_2016_5YR.json', 'BLOCKGROUP_2017_5YR.json', 'BLOCKGROUP_2018_5YR.json']
  Group: ['CBSA_2007_1YR.json', 'CBSA_2007_3YR.json', 'CBSA_2008_1YR.json', 'CBSA_2008_3YR.json', 'CBSA_2009_1YR.json', 'CBSA_2009_3YR.json', 'CBSA_2010_1YR.json', 'CBSA_2010_3YR.json', 'CBSA_2010_5YR.json', 'CBSA_2011_1YR.json', 'CBSA_2011_3YR.json', 'CBSA_2011_5YR.json', 'CBSA_2012_1YR.json', 'CBSA_2012_3YR.json', 'CBSA_2012_5YR.json', 'CBSA_2013_1YR.json', 'CBSA_2013_3YR.json', 'CBSA_2013_5YR.json', 'CBSA_2014_1YR.json', 'CBSA_2014_5YR.json', 'CBSA_2015_1YR.json', 'CBSA_2015_5YR.json', 'CBSA_2016_1YR.json', 'CBSA_2016_5YR.json', 'CBSA_2017_1YR.json', 'CBSA_2017_5YR.json',

## Validating that everything is in there

In [155]:
def check_structure_integrity(original_structure, similar_files_dict, unique_files_dict):
    # Build a set of all original paths
    original_paths = set()
    for path, files in original_structure.items():
        for file in files:
            original_paths.add(f"{path}/{file}")

    # Build a set of all grouped paths (similar + unique)
    grouped_paths = set()
    for path, groups in similar_files_dict.items():
        for group in groups:
            for file in group:
                grouped_paths.add(f"{path}/{file}")
    for path, files in unique_files_dict.items():
        for file in files:
            grouped_paths.add(f"{path}/{file}")

    # Compare sets
    missing_from_grouped = original_paths - grouped_paths
    extra_in_grouped = grouped_paths - original_paths

    print("Check result:")
    print(f"- Total original files: {len(original_paths)}")
    print(f"- Total grouped files: {len(grouped_paths)}")
    print(f"- Missing from grouped: {len(missing_from_grouped)}")
    print(f"- Extra in grouped: {len(extra_in_grouped)}")

    if missing_from_grouped:
        print("\nFiles missing from grouped:")
        for path in sorted(missing_from_grouped):
            print(f"  {path}")

    if extra_in_grouped:
        print("\nFiles that shouldn't be in grouped:")
        for path in sorted(extra_in_grouped):
            print(f"  {path}")

    return not missing_from_grouped and not extra_in_grouped

is_valid = check_structure_integrity(flat_file_structure, similar_files_dict, unique_files_dict)
print(f"\n✅ Structure is valid: {is_valid}")


Check result:
- Total original files: 5114
- Total grouped files: 5114
- Missing from grouped: 0
- Extra in grouped: 0

✅ Structure is valid: True


## Checking which json files match

We can see if some tables in the databases match by checking if they have the same:
- column names
- column types
- column descriptions (stripped and 0.9 match)

### 1st Step

Here we get a list of all the individual matches between the different tables

In [156]:
import re
import json
from pathlib import Path
from difflib import SequenceMatcher
from collections import defaultdict

# --- Step 1: Integrity Check ---

def check_structure_integrity(original_structure, similar_files_dict, unique_files_dict):
    original_paths = set()
    for path, files in original_structure.items():
        for file in files:
            original_paths.add(f"{path}/{file}")

    grouped_paths = set()
    for path, groups in similar_files_dict.items():
        for group in groups:
            for file in group:
                grouped_paths.add(f"{path}/{file}")
    for path, files in unique_files_dict.items():
        for file in files:
            grouped_paths.add(f"{path}/{file}")

    missing_from_grouped = original_paths - grouped_paths
    extra_in_grouped = grouped_paths - original_paths

    print("📦 Structure Integrity Check:")
    print(f"- Total original files: {len(original_paths)}")
    print(f"- Total grouped files: {len(grouped_paths)}")
    print(f"- Missing from grouped: {len(missing_from_grouped)}")
    print(f"- Extra in grouped: {len(extra_in_grouped)}")

    if missing_from_grouped:
        print("\n⚠️ Files missing from grouped:")
        for path in sorted(missing_from_grouped):
            print(f"  - {path}")

    if extra_in_grouped:
        print("\n⚠️ Extra files not in original structure:")
        for path in sorted(extra_in_grouped):
            print(f"  - {path}")

    return not missing_from_grouped and not extra_in_grouped

# --- Step 2: Structural Similarity Check ---

def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def clean_description(description):
    if not isinstance(description, str):
        return ""
    return re.sub(r'[^\w\s]', '', description).strip().lower()

def is_similar_description(desc1, desc2, threshold=0.9):
    similarity = SequenceMatcher(None, desc1, desc2).ratio()
    return similarity >= threshold, similarity

def sort_structure_by_column_name(column_names, column_types, descriptions):
    combined = sorted(zip(column_names, column_types, descriptions), key=lambda x: x[0])
    sorted_column_names, sorted_column_types, sorted_descriptions = zip(*combined) if combined else ([], [], [])
    return list(sorted_column_names), list(sorted_column_types), list(sorted_descriptions)

def have_similar_structure(json1, json2):
    column_names1, column_types1, descriptions1 = sort_structure_by_column_name(
        json1.get("column_names", []),
        json1.get("column_types", []),
        [clean_description(desc) for desc in json1.get("description", [])]
    )
    column_names2, column_types2, descriptions2 = sort_structure_by_column_name(
        json2.get("column_names", []),
        json2.get("column_types", []),
        [clean_description(desc) for desc in json2.get("description", [])]
    )

    structure1 = list(zip(column_names1, column_types1, descriptions1))
    structure2 = list(zip(column_names2, column_types2, descriptions2))

    mismatched_columns = [
        (col1, typ1, col2, typ2)
        for (col1, typ1, _), (col2, typ2, _) in zip(structure1, structure2)
        if col1 != col2 or typ1 != typ2
    ]

    if mismatched_columns:
        return False, f"Mismatched columns/types: {mismatched_columns}"

    unmatched_descriptions = []
    empty_description_mismatches = 0

    for (col1, typ1, desc1), (col2, typ2, desc2) in zip(structure1, structure2):
        if col1 == col2 and typ1 == typ2:
            if desc1 != desc2:
                if desc1 == "" or desc2 == "":
                    empty_description_mismatches += 1
                    unmatched_descriptions.append((col1, typ1, desc1, desc2, "Empty description mismatch"))
                else:
                    similar, sim_val = is_similar_description(desc1, desc2)
                    if not similar:
                        unmatched_descriptions.append((col1, typ1, desc1, desc2, f"Similarity: {sim_val:.2f}"))

    if empty_description_mismatches > 5 or len(unmatched_descriptions) > empty_description_mismatches:
        return False, f"Unmatched descriptions: {unmatched_descriptions}"

    return True, "Files are similar"

def check_grouped_json_structures(similar_files_dict, base_folder):
    results_by_group = {}
    mismatched_groups = []

    for path, file_groups in similar_files_dict.items():
        group_results = []
        for file_list in file_groups:
            full_paths = [str(Path(base_folder) / path / file) for file in file_list]
            full_paths = [p for p in full_paths if Path(p).exists()]
            if not full_paths:
                group_results.append(([], [], []))
                continue

            matches = []
            mismatch_reasons = []

            for i, file_path1 in enumerate(full_paths):
                json1 = load_json(file_path1)
                for j, file_path2 in enumerate(full_paths):
                    if i >= j:
                        continue
                    json2 = load_json(file_path2)
                    is_similar, reason = have_similar_structure(json1, json2)
                    if is_similar:
                        matches.append((file_path1, file_path2))
                    else:
                        mismatch_reasons.append(f"{Path(file_path1).name} vs {Path(file_path2).name} — {reason}")

            group_results.append((full_paths, matches, mismatch_reasons))
            if mismatch_reasons:
                mismatched_groups.append((path, file_list, mismatch_reasons))

        results_by_group[path] = group_results

    return results_by_group, mismatched_groups

# --- RUN BOTH CHECKS ---

# These need to be already defined above in your environment:
# - flat_file_structure
# - similar_files_dict
# - unique_files_dict
# - folder_path (root path to your /databases directory)

print("\n====== 🔍 FULL VALIDATION STARTED ======\n")

# 1. Check integrity
integrity_ok = check_structure_integrity(flat_file_structure, similar_files_dict, unique_files_dict)

# 2. Check structure
structure_results, mismatches = check_grouped_json_structures(similar_files_dict, folder_path)

# --- Updated Final Summary ---

grouped_units = sum(len(groups) for groups in similar_files_dict.values())
unique_units = sum(len(files) for files in unique_files_dict.values())
total_units = grouped_units + unique_units

original_file_count = sum(len(files) for files in flat_file_structure.values())
reduction = original_file_count - total_units
reduction_pct = (reduction / original_file_count) * 100 if original_file_count > 0 else 0


print("\n====== ✅ FINAL SUMMARY ======")
print(f"✔️ Integrity Check Passed: {integrity_ok}")
print(f"✔️ Grouped Structure Check Passed: {len(mismatches) == 0}")
print(f"📦 Total Final File Units: {total_units} (Grouped: {grouped_units}, Unique: {unique_units})")
print(f"📁 Original File Count: {original_file_count}")
print(f"📉 Reduced by: {reduction} files ({reduction_pct:.2f}%)")
print(f"⚠️ Groups with Mismatches: {len(mismatches)}")

'''if mismatches:
    print("\n--- Detailed Mismatches ---")
    for path, files, reasons in mismatches:
        print(f"\n📁 Path: {path}")
        print(f"  Files: {files}")
        for reason in reasons:
            print(f"    - {reason}")'''




📦 Structure Integrity Check:
- Total original files: 5114
- Total grouped files: 5114
- Missing from grouped: 0
- Extra in grouped: 0

✔️ Integrity Check Passed: True
✔️ Grouped Structure Check Passed: False
📦 Total Final File Units: 2579 (Grouped: 221, Unique: 2358)
📁 Original File Count: 5114
📉 Reduced by: 2535 files (49.57%)
⚠️ Groups with Mismatches: 191


'if mismatches:\n    print("\n--- Detailed Mismatches ---")\n    for path, files, reasons in mismatches:\n        print(f"\n📁 Path: {path}")\n        print(f"  Files: {files}")\n        for reason in reasons:\n            print(f"    - {reason}")'

### 2nd Step

Putting all of it together

In [157]:
from collections import defaultdict

def group_perfectly_matched_files(structure_check_results_with_files):
    graph = defaultdict(set)
    file_to_key = {}
    all_valid_files = set()

    # Build graph of matches and record all valid files
    for group, results in structure_check_results_with_files.items():
        for file_list, matches, _ in results:
            for file in file_list:
                file_to_key[file] = group
                all_valid_files.add(file)

            for file1, file2 in matches:
                prefix1 = "/".join(file1.split("/")[:2])
                prefix2 = "/".join(file2.split("/")[:2])
                if prefix1 == prefix2:
                    graph[file1].add(file2)
                    graph[file2].add(file1)

    # Extract connected components (including singletons)
    visited = set()
    grouped_files_with_keys = []

    def dfs(node, group):
        visited.add(node)
        group.append(node)
        for neighbor in graph[node]:
            if neighbor not in visited:
                dfs(neighbor, group)

    for node in all_valid_files:
        if node not in visited:
            group = []
            dfs(node, group)
            group_key = file_to_key.get(node, "UNKNOWN_GROUP")
            grouped_files_with_keys.append((group_key, group))

    return grouped_files_with_keys


# Example usage
grouped_files_with_keys = group_perfectly_matched_files(structure_results)

# View result
grouped_files_with_keys


[('FEC/CENSUS_BUREAU_ACS',
  ['/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2013_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2017_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2012_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2015_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2010_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2014_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2016_5YR.json',
   '/Users/pao

## Checking if the tables have been correctly assigned to schema and database

In [None]:
from pathlib import Path
from collections import defaultdict
import json

def validate_grouped_and_unique_file_coverage(grouped_files_with_keys, unique_files_dict, file_structure_dict):
    def extract_all_files(structure, path=None):
        if path is None:
            path = []
        files = set()
        for k, v in structure.items():
            current_path = path + [k]
            if isinstance(v, dict):
                files.update(extract_all_files(v, current_path))
            else:
                files.add("/".join(current_path))
        return files

    # Extract all file paths from the saved file structure
    actual_files_on_disk = extract_all_files(file_structure_dict)

    expected_files_combined = set()
    print("\n🔍 Building list of expected files from grouped + unique files...")

    # Add grouped files (using relative paths)
    for folder_path, file_list in grouped_files_with_keys:
        for full_path in file_list:
            parts = Path(full_path).parts
            if len(parts) >= 3:
                rel_path = "/".join(parts[-3:-1])  # database/table
                file_name = parts[-1]             # file.json
                expected_files_combined.add(f"{rel_path}/{file_name}")
            else:
                print(f"⚠️ Skipped malformed grouped path: {full_path}")

    # Add ungrouped (unique) files
    for path, files in unique_files_dict.items():
        for file_name in files:
            expected_files_combined.add(f"{path}/{file_name}")

    print("✅ Done building expectations.")

    # Compare expected vs actual
    missing_from_disk = expected_files_combined - actual_files_on_disk
    missing_from_combined = actual_files_on_disk - expected_files_combined

    # Summary
    print("\n📊 SUMMARY")
    print("----------")
    print(f"📁 Files expected from groups + unique: {len(expected_files_combined)}")
    print(f"💾 Files found in file structure (on disk): {len(actual_files_on_disk)}")
    print(f"❌ Files missing on disk (expected but not found): {len(missing_from_disk)}")
    print(f"❌ Files missing from expected set (found on disk but not grouped/unique): {len(missing_from_combined)}")

    if missing_from_disk:
        print("\n🔸 These files were expected but NOT found on disk (showing first 15):")
        for path in list(missing_from_disk)[:15]:
            print("   -", path)

    if missing_from_combined:
        print("\n🔹 These files were found on disk but NOT included in grouped or unique sets (showing first 10):")
        for path in list(missing_from_combined)[:10]:
            print("   -", path)

    print("\n✅ Validation complete.\n")

    return {
        "expected_files_combined": expected_files_combined,
        "actual_files_on_disk": actual_files_on_disk,
        "missing_from_disk": missing_from_disk,
        "missing_from_combined": missing_from_combined
    }


# Load the saved file structure
structure_path = "0_filesystem_structure.json"
with open(structure_path, "r") as f:
    file_structure_dict = json.load(f)

# Run the validation
result = validate_grouped_and_unique_file_coverage(
    grouped_files_with_keys,
    unique_files_dict,
    file_structure_dict
)



🔍 Building list of expected files from grouped + unique files...
✅ Done building expectations.

📊 SUMMARY
----------
📁 Files expected from groups + unique: 5114
💾 Files found in file structure (on disk): 5114
❌ Files missing on disk (expected but not found): 0
❌ Files missing from expected set (found on disk but not grouped/unique): 0

✅ Validation complete.



In [159]:
grouped_files_with_keys

[('FEC/CENSUS_BUREAU_ACS',
  ['/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2013_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2017_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2012_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2015_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2010_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2014_5YR.json',
   '/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2016_5YR.json',
   '/Users/pao

## Processing the names to remove redundancy

The object of the following code is to "compress" the files that have been grouped into one single file with a variable in it and the different values that can go in the variable.

Key: NOAA_DATA_PLUS/NOAA_GSOD
  Template: GSOD{variable0}.json
  Variables:
    variable0: ['1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']


In [160]:
import re
from pathlib import Path
from collections import defaultdict

### UTILITY FUNCTIONS ###

def extract_variable_tokens(variable_strs):
    split_parts = [[s] for s in variable_strs]  # Treat whole variation as one unit
    transposed = list(zip(*split_parts)) if split_parts else []
    variable_indices = [
        i for i, tokens in enumerate(transposed)
        if len(set(tokens)) > 1
    ]
    return split_parts, transposed, variable_indices


def find_common_parts(filenames):
    if not filenames:
        return "", ""
    prefix = filenames[0]
    suffix = filenames[0]
    for name in filenames[1:]:
        # Find common prefix
        i = 0
        while i < len(prefix) and i < len(name) and prefix[i] == name[i]:
            i += 1
        prefix = prefix[:i]
        # Find common suffix
        i = 1
        while i <= len(suffix) and i <= len(name) and suffix[-i] == name[-i]:
            i += 1
        suffix = suffix[-(i-1):] if i > 1 else ""
    return prefix, suffix

def get_rel_path(path):
    parts = Path(path).parts
    return "/".join(parts[-3:])  # DB/TABLE/FILENAME

### MAIN FUNCTION ###

def compress_grouped_files(grouped_files_with_keys):
    unique_files_dict = defaultdict(list)
    results = []

    for key, full_paths in grouped_files_with_keys:
        file_names = [Path(path).name for path in full_paths]
        rel_paths = [get_rel_path(path) for path in full_paths]

        if len(file_names) == 1:
            results.append({
                "Key": key,
                "Files": file_names,
                "Grouped": False,
                "RelPaths": rel_paths
            })
            continue

        prefix, suffix = find_common_parts(file_names)
        variable_strs = [f[len(prefix):len(f)-len(suffix)] for f in file_names]
        split_parts, transposed, variable_indices = extract_variable_tokens(variable_strs)

        if not variable_indices:
            unique_files_dict.setdefault(key, []).extend(zip(file_names, rel_paths))
            continue

        template_parts = []
        variable_map = defaultdict(set)
        variable_counter = 0
        index_to_var = {}
        variable_keys_ordered = []

        for i, token in enumerate(split_parts[0]):
            if i in variable_indices:
                var_name = f"variable{variable_counter}"
                index_to_var[i] = var_name
                template_parts.append(f"{{{var_name}}}")
                variable_keys_ordered.append(var_name)
                variable_counter += 1
            else:
                template_parts.append(token)

        combinations = []
        unmatched_files = []

        for idx, sp in enumerate(split_parts):
            combo = []
            skip = False
            for i in variable_indices:
                var_name = index_to_var[i]
                val = sp[i]
                if val == "":
                    skip = True
                    break
                variable_map[var_name].add(val)
                combo.append(val)
            if skip:
                unmatched_files.append((file_names[idx], rel_paths[idx]))
            else:
                combinations.append(tuple(combo))

        variable_map_sorted = {k: sorted(v) for k, v in variable_map.items()}
        template = prefix + "".join(template_parts) + suffix
        table_prefix = Path(key).name
        if template.startswith(table_prefix + "_") and len(template) > len(table_prefix) + 1:
            template = template[len(table_prefix) + 1:]

        if not combinations:
            unique_files_dict.setdefault(key, []).extend(zip(file_names, rel_paths))
            continue

        results.append({
            "Key": key,
            "Template": template,
            "Variables": variable_map_sorted,
            "Grouped": True,
            "RelPaths": rel_paths,
            "Combinations": combinations,
            "VariableOrder": variable_keys_ordered
        })

        if unmatched_files:
            unique_files_dict.setdefault(key, []).extend(unmatched_files)

    for key, file_list in unique_files_dict.items():
        if not file_list:
            continue
        rel_paths = [rel for _, rel in file_list]
        file_names = [f for f, _ in file_list]
        results.append({
            "Key": key,
            "Files": sorted(file_names),
            "Grouped": False,
            "RelPaths": rel_paths
        })

    return results


compressed_list = compress_grouped_files(grouped_files_with_keys)

compressed_list

[{'Key': 'FEC/CENSUS_BUREAU_ACS',
  'Template': 'CENSUSTRACT_201{variable0}_5YR.json',
  'Variables': {'variable0': ['0', '1', '2', '3', '4', '5', '6', '7']},
  'Grouped': True,
  'RelPaths': ['FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2013_5YR.json',
   'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2017_5YR.json',
   'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2012_5YR.json',
   'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2015_5YR.json',
   'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2010_5YR.json',
   'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2014_5YR.json',
   'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2016_5YR.json',
   'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2011_5YR.json'],
  'Combinations': [('3',),
   ('7',),
   ('2',),
   ('5',),
   ('0',),
   ('4',),
   ('6',),
   ('1',)],
  'VariableOrder': ['variable0']},
 {'Key': 'SDOH/CENSUS_BUREAU_ACS',
  'Files': ['PUMA_2014_5YR.json'],
  'Grouped': False,
  'RelPaths': ['SDOH/CENSUS_BUREAU_ACS/PUMA_2014_5YR.json']},
 {'Key': 'HTAN_2/HTAN',
  'Files': ['BULKWES_LEVEL3_METADATA_CURRENT.json'],
  'Group

In [None]:
def extract_all_rel_paths(data):
    all_rel_paths = []

    for entry in data:
        if entry.get('Grouped'):
            all_rel_paths.extend(entry.get('RelPaths', []))
        else:
            key = entry['Key']
            files = entry.get('Files', [])
            for file in files:
                all_rel_paths.append(f"{key}/{file}")

    return all_rel_paths

rel_paths = extract_all_rel_paths(compressed_list)

unique_paths = [f'{k}/{v}' for k in unique_files_dict.keys() for v in unique_files_dict[k]]

all_file_paths = rel_paths + unique_paths

with open("/Users/paolocadei/Documents/Masters/Thesis/Spider2/0_filesystem_structure.json", "r") as f:
    ground_truth = json.load(f)

ground_truth_paths = [f'{k1}/{k2}/{v}' for k1 in ground_truth.keys() for k2 in ground_truth[k1].keys() for v in ground_truth[k1][k2].keys()]

print(len(rel_paths))
print(len(ground_truth_paths), len(all_file_paths))

set(ground_truth_paths) - set(all_file_paths)

2756
5114 5114


set()

In [162]:
rel_paths

['FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2013_5YR.json',
 'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2017_5YR.json',
 'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2012_5YR.json',
 'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2015_5YR.json',
 'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2010_5YR.json',
 'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2014_5YR.json',
 'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2016_5YR.json',
 'FEC/CENSUS_BUREAU_ACS/CENSUSTRACT_2011_5YR.json',
 'SDOH/CENSUS_BUREAU_ACS/PUMA_2014_5YR.json',
 'HTAN_2/HTAN/BULKWES_LEVEL3_METADATA_CURRENT.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1974.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1957.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1952.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1982.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD2004.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1956.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1950.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1941.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1964.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD2009.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD2013.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD1949.json',
 'NOAA_GSOD/NOAA_GSOD/GSOD199

In [163]:
def nested_dict(compressed_list):
    output = {}

    for entry in compressed_list:
        database, table = entry['Key'].split("/")
        grouped = entry['Grouped']

        if database not in output:
            output[database] = {}

        if table not in output[database]:
            output[database][table] = {'grouped': {}, 'ungrouped': []}

        if grouped:
            template = entry['Template']
            combinations = entry['Combinations']
            variable_order = entry['VariableOrder']
            rel_paths = entry['RelPaths']

            matched_combos = []
            
            for combo, path in zip(combinations, rel_paths):

                combo_path = database + "/" + table + "/" + template

                for count, variable in enumerate(variable_order):

                    replaced = combo_path.replace('{'+ f'{variable}' + '}', str(combo[count]))

                    combo_path = replaced

                if combo_path != path:

                    print(combo_path, path)

                else:

                    matched_combos.append(combo_path)

            if template not in output[database][table]['grouped']:
                output[database][table]['grouped'][template] = []


            output[database][table]['grouped'][template].append({
                'variables': entry['Variables'],
                'combinations': entry['Combinations'],
                'variable_order': variable_order
            })
            
        else:
            output[database][table]['ungrouped'].extend(entry['Files'])

    return output

nested_structure = nested_dict(compressed_list)

In [164]:
def extract_all_generated_paths(nested):
    all_paths = []

    for database, tables in nested.items():
        for table, content in tables.items():
            # Grouped files
            grouped_entries = content.get("grouped", {})
            for template, template_entries in grouped_entries.items():  # template_entries is now a list
                for entry in template_entries:
                    combinations = entry["combinations"]
                    variable_order = entry["variable_order"]

                    for combo in combinations:
                        subs = dict(zip(variable_order, combo))

                        try:
                            filename = template.format(**subs)
                            full_path = f"{database}/{table}/{filename}"
                            all_paths.append(full_path)
                        except KeyError as e:
                            print(f"❌ Missing key in format for {database}/{table} with template '{template}'")
                            print(f"   combo: {combo}")
                            print(f"   subs: {subs}")
                        except Exception as e:
                            print(f"❌ Error formatting template '{template}' with combo {combo}: {e}")

            # Ungrouped files
            ungrouped_files = content.get("ungrouped", [])
            for filename in ungrouped_files:
                full_path = f"{database}/{table}/{filename}"
                all_paths.append(full_path)

    return all_paths



resolved_paths = extract_all_generated_paths(nested_structure)
print(len(resolved_paths))

2756


In [None]:
unique_paths = [f'{k}/{v}' for k in unique_files_dict.keys() for v in unique_files_dict[k]]

all_file_paths = resolved_paths + unique_paths

with open("/Users/paolocadei/Documents/Masters/Thesis/Spider2/0_filesystem_structure.json", "r") as f:
    ground_truth = json.load(f)

ground_truth_paths = [f'{k1}/{k2}/{v}' for k1 in ground_truth.keys() for k2 in ground_truth[k1].keys() for v in ground_truth[k1][k2].keys()]

print(len(ground_truth_paths), len(all_file_paths))

set(ground_truth_paths) - set(all_file_paths)

5114 5114


set()

In [166]:
nested_structure

{'FEC': {'CENSUS_BUREAU_ACS': {'grouped': {'CENSUSTRACT_201{variable0}_5YR.json': [{'variables': {'variable0': ['0',
        '1',
        '2',
        '3',
        '4',
        '5',
        '6',
        '7']},
      'combinations': [('3',),
       ('7',),
       ('2',),
       ('5',),
       ('0',),
       ('4',),
       ('6',),
       ('1',)],
      'variable_order': ['variable0']}],
    'SCHOOLDISTRICTSECONDARY_20{variable0}YR.json': [{'variables': {'variable0': ['16_1',
        '16_5',
        '17_1',
        '17_5',
        '18_1',
        '18_5',
        '19_1',
        '19_5',
        '20_5',
        '21_1']},
      'combinations': [('16_1',),
       ('17_1',),
       ('19_1',),
       ('18_1',),
       ('20_5',),
       ('18_5',),
       ('21_1',),
       ('17_5',),
       ('19_5',),
       ('16_5',)],
      'variable_order': ['variable0']}],
    'COUNTY_20{variable0}YR.json': [{'variables': {'variable0': ['07_1',
        '07_3',
        '08_1',
        '08_3',
        '09_1',
 

## Extending the dictionary

Include:
- column names
- column types
- column descriptions into the dictionary

This step also includes the matching of tables that have the exact same columns but have different names. This is done to reduce schema complexity and if they are to be
passed to the LLM together, then we can also reduce the context.

### Adding the unique files that we haven't processed from the beginning

In [167]:
for database in unique_files_dict.keys():

    print(database)
    key1, key2 = database.split("/")

    if key1 not in nested_structure:
        nested_structure[key1] = {key2: {'grouped': {}, 'ungrouped': []}}
        nested_structure[key1][key2] = {'grouped': {}, 'ungrouped': []}

    else:
        if key2 not in nested_structure[key1]:
            nested_structure[key1][key2] = {'grouped': {}, 'ungrouped': []}

    nested_structure[key1][key2]['ungrouped'].extend(unique_files_dict[database])

ADVENTUREWORKS/ADVENTUREWORKS
AIRLINES/AIRLINES
AMAZON_VENDOR_ANALYTICS__SAMPLE_DATASET/PUBLIC
AUSTIN/AUSTIN_311
AUSTIN/AUSTIN_BIKESHARE
AUSTIN/AUSTIN_CRIME
AUSTIN/AUSTIN_WASTE
BANK_SALES_TRADING/BANK_SALES_TRADING
BASEBALL/BASEBALL
BBC/BBC_NEWS
BLS/BLS
BLS/GEO_US_BOUNDARIES
BOWLINGLEAGUE/BOWLINGLEAGUE
BRAZE_USER_EVENT_DEMO_DATASET/PUBLIC
BRAZILIAN_E_COMMERCE/BRAZILIAN_E_COMMERCE
CALIFORNIA_TRAFFIC_COLLISION/CALIFORNIA_TRAFFIC_COLLISION
CENSUS_BUREAU_ACS_1/GEO_CENSUS_TRACTS
CENSUS_BUREAU_ACS_1/GEO_US_BOUNDARIES
CENSUS_BUREAU_ACS_2/CYCLISTIC
CENSUS_BUREAU_ACS_2/GEO_US_BOUNDARIES
CENSUS_BUREAU_INTERNATIONAL/CENSUS_BUREAU_INTERNATIONAL
CENSUS_BUREAU_USA/UTILITY_US
CENSUS_GALAXY__AIML_MODEL_DATA_ENRICHMENT_SAMPLE/PUBLIC
CENSUS_GALAXY__ZIP_CODE_TO_BLOCK_GROUP_SAMPLE/PUBLIC
CHICAGO/CHICAGO_CRIME
CHICAGO/CHICAGO_TAXI_TRIPS
CHINOOK/CHINOOK
CITY_LEGISLATION/CITY_LEGISLATION
CMS_DATA/CMS_CODES
CMS_DATA/CMS_MEDICARE
CMS_DATA/CMS_SYNTHETIC_PATIENT_DATA_OMOP
COMPLEX_ORACLE/COMPLEX_ORACLE
COVID19_JH

## Final check

In [None]:
def extract_all_generated_paths(nested):
    all_paths = []

    for database, tables in nested.items():
        for table, content in tables.items():
            # Grouped files
            grouped_entries = content.get("grouped", {})
            for template, template_entries in grouped_entries.items():  # template_entries is now a list
                for entry in template_entries:
                    combinations = entry["combinations"]
                    variable_order = entry["variable_order"]

                    for combo in combinations:
                        subs = dict(zip(variable_order, combo))

                        try:
                            filename = template.format(**subs)
                            full_path = f"{database}/{table}/{filename}"
                            all_paths.append(full_path)
                        except KeyError as e:
                            print(f"❌ Missing key in format for {database}/{table} with template '{template}'")
                            print(f"   combo: {combo}")
                            print(f"   subs: {subs}")
                        except Exception as e:
                            print(f"❌ Error formatting template '{template}' with combo {combo}: {e}")

            # Ungrouped files
            ungrouped_files = content.get("ungrouped", [])
            for filename in ungrouped_files:
                full_path = f"{database}/{table}/{filename}"
                all_paths.append(full_path)

    return all_paths



final_paths = extract_all_generated_paths(nested_structure)

unique_paths = [f'{k}/{v}' for k in unique_files_dict.keys() for v in unique_files_dict[k]]

all_file_paths = final_paths #+ unique_paths

with open("/Users/paolocadei/Documents/Masters/Thesis/Spider2/0_filesystem_structure.json", "r") as f:
    ground_truth = json.load(f)

ground_truth_paths = [f'{k1}/{k2}/{v}' for k1 in ground_truth.keys() for k2 in ground_truth[k1].keys() for v in ground_truth[k1][k2].keys()]

print(len(ground_truth_paths), len(all_file_paths))

set(ground_truth_paths) - set(all_file_paths)

5114 5114


set()

## Adding information from the json file

Here we had:
- column names: column types
- column descriptions
- sample data <- only 1 row

In [169]:
new_nested_structure = nested_structure.copy()

for database in new_nested_structure.keys():

    for table in new_nested_structure[database].keys():

        for template in new_nested_structure[database][table]['grouped'].keys():

            for t in range(len(new_nested_structure[database][table]['grouped'][template])):

                combination = new_nested_structure[database][table]['grouped'][template][t]['combinations'][0]

                new_nested_structure[database][table]['grouped'][template][t]
                
                variable_order = new_nested_structure[database][table]['grouped'][template][t]['variable_order']

                subs = dict(zip(variable_order, combination))

                filename = template.format(**subs)

                full_path = f"{database}/{table}/{filename}"
                
                with open("/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/" + full_path, "r") as f:
                    ground_truth = json.load(f)

                # Combine column names and types into a dictionary: {name: type}
                column_name_types = {
                    name: col_type
                    for name, col_type in zip(
                        ground_truth.get("column_names", []),
                        ground_truth.get("column_types", [])
                    )
                }

                sample_rows = ground_truth.get("sample_rows", [])
                sample_row = sample_rows[:2] if sample_rows else None

                # Assign to your nested structure
                new_nested_structure[database][table]['grouped'][template][t]['details'] = {
                    'columns': column_name_types,
                    'description': ground_truth.get('description'),
                    'sample_row': sample_row
                }

        ungrouped_dictionary = {}

        for t in new_nested_structure[database][table]['ungrouped']:

            full_path = f"{database}/{table}/{t}"

            with open("/Users/paolocadei/Documents/Masters/Thesis/Spider2/spider2-snow/resource/databases/" + full_path, "r") as f:
                    ground_truth = json.load(f)

            column_name_types = {
                    name: col_type
                    for name, col_type in zip(
                        ground_truth.get("column_names", []),
                        ground_truth.get("column_types", [])
                    )
                }

            sample_rows = ground_truth.get("sample_rows", [])
            sample_row = sample_rows[:2] if sample_rows else None


            ungrouped_dictionary[t] = {
                    'columns': column_name_types,
                    'description': ground_truth.get('description'),
                    'sample_row': sample_row
                }
        
        new_nested_structure[database][table]['ungrouped'] = ungrouped_dictionary
        
 
    

In [None]:
import json

def save_dict_to_json(data: dict, output_path: str):
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

# Example usage:
save_dict_to_json(new_nested_structure, "0_final_preprocessed.json")
