In [3]:
import json
import pandas as pd

import sys
sys.path.append('../src')  # Add src to Python path

from loader import load_isic_nace_master

In [None]:
# Explode the step-3 correspondence records into flat [ISIC, NACE] pairs,
# prefixing each multi-character code with its category letter, and save
# the result as step 4.

# The master tables are loaded here because this cell comes before the cell
# that otherwise defines `isic_df` / `nace_df`; without this line a
# Restart Kernel -> Run All fails with a NameError.
isic_df, nace_df = load_isic_nace_master()

# NOTE(review): these paths are relative to 'data/...' while the export cells
# further down write to '../data/...'. Confirm which working directory the
# notebook is meant to run from.
with open('data/processed-output/isic-nace-step3.json', 'r', encoding='utf-8') as f:
    step3_data = json.load(f)

# Build code -> category lookups once instead of scanning the whole DataFrame
# for every pair. Iterating in reverse lets the FIRST occurrence of a
# duplicated code win, which matches taking element [0] of a filtered column.
isic_cat_by_code = dict(zip(isic_df['ISIC5_Code'][::-1], isic_df['CAT'][::-1]))
nace_cat_by_code = dict(zip(nace_df['CODE'][::-1], nace_df['CAT'][::-1]))


def _with_category(code, cat_by_code):
    """Return `code` prefixed with its category, or `code` unchanged.

    The prefix is only added when the code is present in the lookup and is
    longer than one character (single-character codes are left as-is).
    """
    if code in cat_by_code and len(code) > 1:
        return f"{cat_by_code[code]}{code}"
    return code


exploded_pairs = []

for record in step3_data:
    # Prefix each side once per record rather than once per pair.
    isic_labels = [_with_category(code, isic_cat_by_code) for code in record['isic_codes']]
    nace_labels = [_with_category(code, nace_cat_by_code) for code in record['nace_codes']]

    # One [isic, nace] pair for every ISIC x NACE combination in the record.
    exploded_pairs.extend(
        [isic_label, nace_label]
        for isic_label in isic_labels
        for nace_label in nace_labels
    )

# Save exploded format
with open('data/processed-output/isic-nace-step4.json', 'w', encoding='utf-8') as f:
    json.dump(exploded_pairs, f, indent=2, ensure_ascii=False)

print(f"Original records: {len(step3_data)}")
print(f"Exploded pairs: {len(exploded_pairs)}")
print("\nSaved to: data/processed-output/isic-nace-step4.json")

# Show sample
print("\nSample pairs:")
for pair in exploded_pairs[:10]:
    print(f"  {pair}")

Original records: 851
Exploded pairs: 1088

Saved to: data/processed-output/isic-nace-step4.json

Sample pairs:
  ['A', 'A']
  ['A0111', 'A01.11']
  ['A0112', 'A01.12']
  ['A0113', 'A01.13']
  ['A0114', 'A01.14']
  ['A0115', 'A01.15']
  ['A0116', 'A01.16']
  ['A0119', 'A01.19']
  ['A0121', 'A01.21']
  ['A0122', 'A01.22']


In [None]:
# Re-read the exploded step-4 pairs from disk.
with open('data/processed-output/isic-nace-step4.json', mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

# Serialise the same structure compactly: passing (',', ':') as separators
# drops the spaces json would otherwise emit after commas and colons.
minified_text = json.dumps(data, separators=(',', ':'))

# Write the minified JSON into the web-optim directory.
with open('data/web-optim/correspondence.json', mode='w', encoding='utf-8') as f:
    f.write(minified_text)

In [4]:
# Load the two master tables via the project loader. Elsewhere in this notebook
# they are indexed as DataFrames: `isic_df` by 'ISIC5_Code' / 'CAT' and
# `nace_df` by 'CODE' / 'CAT'.
# NOTE(review): the loader's data source and any arguments are not visible
# here — confirm against `loader.load_isic_nace_master`.
isic_df, nace_df = load_isic_nace_master()

In [10]:
from typing import Dict, List

def convert_master_to_array_json(
    df: pd.DataFrame,
    output_path: str,
    cat_col: str = 'CAT',
    code_col: str = 'code',
    title_col: str = 'title',
    desc_col: str = 'description'
) -> Dict[str, List]:
    """Convert a master classification table to a compact ``{code: [title, desc]}`` JSON file.

    Each row becomes one entry keyed by its code. Codes longer than one
    character are prefixed with the row's category value; single-character
    codes are used unchanged. Titles and descriptions are stripped of
    surrounding whitespace, and missing values become empty strings. A row
    with an empty title gets the placeholder ``"Missing title for <code>"``.
    If two rows produce the same key, the later row overwrites the earlier.

    A short data-quality report is printed, and the mapping is written to
    ``output_path`` as minified JSON.

    Args:
        df: Source table. It is not modified.
        output_path: Destination file for the minified JSON.
        cat_col: Column holding the category prefix.
        code_col: Column holding the code.
        title_col: Column holding the title.
        desc_col: Column holding the description.

    Returns:
        The mapping that was written to disk: ``{code: [title, description]}``.

    Raises:
        ValueError: If any of the four required columns is absent from ``df``.
    """
    # 1. Validate up front: all four columns must exist (no defaults are filled in).
    for col in (cat_col, code_col, title_col, desc_col):
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # Clean text (NaN -> '', coerce to str, trim whitespace) on derived
    # Series so the caller's frame is left untouched.
    titles = df[title_col].fillna('').astype(str).str.strip()
    descs = df[desc_col].fillna('').astype(str).str.strip()

    # 2. Convert to array format
    array_data: Dict[str, List] = {}
    missing_counts = {'title': 0, 'desc': 0}

    for cat, raw_code, title, desc in zip(df[cat_col], df[code_col], titles, descs):
        code_text = str(raw_code)
        # Single-character codes stay bare; everything else gets the category prefix.
        code = str(cat) + code_text if len(code_text) > 1 else code_text

        # Validate data quality
        if not title:
            missing_counts['title'] += 1
        if not desc:
            missing_counts['desc'] += 1

        # Store as array [title, description]
        array_data[code] = [
            title or f"Missing title for {code}",
            desc,
        ]

    # 3. Report data quality
    print("\nData Quality Report:")
    print(f"  Total records: {len(array_data)}")
    for field, count in missing_counts.items():
        if count > 0:
            print(f"  WARNING: {count} records missing {field}")

    # Save minified version (production)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(array_data, f, separators=(',', ':'))

    # Return the mapping so callers that assign the result get the data, as
    # the return annotation promises.
    return array_data

In [11]:
# Export the ISIC master table to '../data/web-optim/isic-master.json',
# mapping the ISIC-specific column names onto the converter's parameters.
isic_array = convert_master_to_array_json(
    df=isic_df,
    output_path='../data/web-optim/isic-master.json',
    cat_col = 'CAT',
    code_col='ISIC5_Code',
    title_col='ISIC5_Title',
    desc_col='ISIC5_Desc'
    # NOTE: the converter takes no parent/level columns; each entry holds only title and description.
)


Data Quality Report:
  Total records: 830


In [12]:
# Export the NACE master table to '../data/web-optim/nace-master.json',
# mapping the NACE-specific column names onto the converter's parameters.
nace_array = convert_master_to_array_json(
    df=nace_df,
    output_path='../data/web-optim/nace-master.json',
    cat_col = 'CAT',
    code_col='CODE',
    title_col='HEADING',
    desc_col='DESC'
    # NOTE: the converter takes no parent/level columns; each entry holds only title and description.
)


Data Quality Report:
  Total records: 1047
