In [1]:
import json

In [None]:
# Read the cleaned 4-digit mapping records from the step-2 JSON file
with open('data/processed-output/isic-nace-step2.json', mode='r', encoding='utf-8') as f:
    # Parse the whole file content in one go
    cleaned_data = json.loads(f.read())

def extract_parent_code(code, level):
    """Return the ancestor of a classification code truncated to `level` digits.

    Codes containing a '.' are treated as NACE codes (e.g. "01.11"), which are
    assumed to carry the separator right after the second digit. Every other
    code is treated as a plain ISIC digit string (e.g. "0111").

    Args:
        code: ISIC or NACE code string.
        level: Number of digits to keep (typically 2 or 3).

    Returns:
        The leading part of `code` holding `level` digits. For NACE codes the
        dot is kept whenever more than two digits are requested, e.g.
        ("01.11", 3) -> "01.1" and ("01.11", 2) -> "01". For ISIC codes,
        ("0111", 3) -> "011" and ("0111", 2) -> "01".
    """
    if '.' in code and level > 2:
        # Take one extra character so the '.' that follows the first two
        # digits does not count toward the requested number of digits.
        return code[:level + 1]
    # ISIC codes, and NACE prefixes that end before the dot, slice directly.
    return code[:level]

def determine_match_type(isic_count, nace_count):
    """Classify a mapping by how many codes sit on each side.

    Returns "one-to-one", "one-to-many" or "many-to-one" for the matching
    count combinations, and "many-to-many" for every other combination.
    """
    single_isic = isic_count == 1
    single_nace = nace_count == 1

    if single_isic and single_nace:
        return "one-to-one"
    if single_isic and nace_count > 1:
        return "one-to-many"
    if single_nace and isic_count > 1:
        return "many-to-one"
    # Anything left (both sides plural, or a zero count) falls through here
    return "many-to-many"

# Build 3-digit mappings
print("Building 3-digit records...")

# Every distinct (ISIC 3-digit parent, NACE 3-digit parent) pair that co-occurs
# in at least one input record. The dict is used as a set of pairs.
three_digit_map = {}

for record in cleaned_data:
    # Only 4-character ISIC codes ("0111") and 5-character NACE codes
    # ("01.11") are rolled up; anything else in the record is skipped.
    isic_3digit = {extract_parent_code(code, 3) for code in record['isic_codes'] if len(code) == 4}
    nace_3digit = {extract_parent_code(code, 3) for code in record['nace_codes'] if len(code) == 5}

    # Group by parent combinations
    for isic_parent in isic_3digit:
        for nace_parent in nace_3digit:
            three_digit_map[(isic_parent, nace_parent)] = True

# Index the pairs in BOTH directions. The match type of a pair depends on how
# many NACE parents its ISIC code maps to and on how many ISIC parents its
# NACE code maps to; counting only the ISIC side can never yield
# "many-to-one" or "many-to-many".
isic_3digit_groups = {}  # ISIC parent -> list of NACE parents paired with it
nace_3digit_groups = {}  # NACE parent -> list of ISIC parents paired with it
for isic_parent, nace_parent in sorted(three_digit_map):
    isic_3digit_groups.setdefault(isic_parent, []).append(nace_parent)
    nace_3digit_groups.setdefault(nace_parent, []).append(isic_parent)

# Convert to records, one per pair, in sorted (ISIC, NACE) order.
# Pairs are unique, so the group lists contain no duplicates and their
# lengths are the partner counts.
three_digit_records = []
for isic_parent, nace_parent in sorted(three_digit_map):
    three_digit_records.append({
        'isic_codes': [isic_parent],
        'nace_codes': [nace_parent],
        'confidence': 'Manual',
        'match_type': determine_match_type(
            len(nace_3digit_groups[nace_parent]),  # ISIC codes sharing this NACE code
            len(isic_3digit_groups[isic_parent]),  # NACE codes sharing this ISIC code
        ),
    })

# Build 2-digit mappings
print("Building 2-digit records...")

# Every distinct (ISIC 2-digit parent, NACE 2-digit parent) pair that co-occurs
# in at least one 3-digit record. The dict is used as a set of pairs.
two_digit_map = {}

for record in three_digit_records:
    # Only 3-character ISIC codes ("011") and 4-character NACE codes
    # ("01.1") are rolled up; anything else in the record is skipped.
    isic_2digit = {extract_parent_code(code, 2) for code in record['isic_codes'] if len(code) == 3}
    nace_2digit = {extract_parent_code(code, 2) for code in record['nace_codes'] if len(code) == 4}

    # Group by parent combinations
    for isic_parent in isic_2digit:
        for nace_parent in nace_2digit:
            two_digit_map[(isic_parent, nace_parent)] = True

# Index the pairs in BOTH directions. The match type of a pair depends on how
# many NACE parents its ISIC code maps to and on how many ISIC parents its
# NACE code maps to; counting only the ISIC side can never yield
# "many-to-one" or "many-to-many".
isic_2digit_groups = {}  # ISIC parent -> list of NACE parents paired with it
nace_2digit_groups = {}  # NACE parent -> list of ISIC parents paired with it
for isic_parent, nace_parent in sorted(two_digit_map):
    isic_2digit_groups.setdefault(isic_parent, []).append(nace_parent)
    nace_2digit_groups.setdefault(nace_parent, []).append(isic_parent)

# Convert to records, one per pair, in sorted (ISIC, NACE) order.
# Pairs are unique, so the group lists contain no duplicates and their
# lengths are the partner counts.
two_digit_records = []
for isic_parent, nace_parent in sorted(two_digit_map):
    two_digit_records.append({
        'isic_codes': [isic_parent],
        'nace_codes': [nace_parent],
        'confidence': 'Manual',
        'match_type': determine_match_type(
            len(nace_2digit_groups[nace_parent]),  # ISIC codes sharing this NACE code
            len(isic_2digit_groups[isic_parent]),  # NACE codes sharing this ISIC code
        ),
    })

# Merge the 4-digit, 3-digit and 2-digit record lists into one list
combined_data = cleaned_data + three_digit_records + two_digit_records

# Write the merged records to the step-3 JSON file
with open('data/processed-output/isic-nace-step3.json', 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=2, ensure_ascii=False)

# Report how many records each level contributed, then the output location
for label, records in (
    ("4-digit records", cleaned_data),
    ("3-digit records", three_digit_records),
    ("2-digit records", two_digit_records),
    ("Total records", combined_data),
):
    print(f"{label}: {len(records)}")
print("\nSaved to: data/processed-output/isic-nace-step3.json")

Building 3-digit records...
Building 2-digit records...
4-digit records: 473
3-digit records: 291
2-digit records: 87
Total records: 851

Saved to: data/processed-output/isic-nace-step3.json
