# Edit JSON output
- Keep only the 1-character (section letter) and 4-digit codes; remove the 2- and 3-digit codes, because the AI mixes 3- and 4-digit codes in a single record.
- The 2- and 3-digit levels can be rebuilt later from the 4-digit codes.
- If needed, clean up `isic-nace-step1.json` manually, then run this process again.

In [10]:
import json



import sys
sys.path.append('../src')  # Add src to Python path

from loader import load_isic_nace_master
from prompt_util import create_code_description

In [8]:
# Load the ISIC and NACE master tables through the project loader,
# then preview the first ISIC rows as a quick sanity check.
isic_df, nace_df = load_isic_nace_master()
isic_df.head()

Unnamed: 0,ISIC5_Section,CAT,ISIC5_Code,ISIC5_Title,ISIC5_Desc
0,A,A,A,"Agriculture, forestry and fishing",This section includes the exploitation of vege...
1,A01,A,01,"Crop and animal production, hunting and relate...","This division includes two basic activities, n..."
2,A011,A,011,Growing of non-perennial crops,This group includes the growing of non-perenni...
3,A0111,A,0111,"Growing of cereals (except rice), leguminous c...",This class includes all forms of growing of ce...
4,A0112,A,0112,Growing of rice,This class includes:\n- growing of rice\n


In [9]:
# Preview the first rows of the NACE master table (note the dotted CODE format, e.g. "01.11").
nace_df.head()

Unnamed: 0,ID,CAT,CODE,HEADING,PARENT_ID,PARENT_CODE,LEVEL,DESC
0,A,A,A,"AGRICULTURE, FORESTRY AND FISHING",,,1,This section includes the exploitation of vege...
1,01,A,01,"Crop and animal production, hunting and relate...",A,A,2,"This division includes two basic activities, n..."
2,011,A,01.1,Growing of non-perennial crops,01,01,3,This group includes the growing of non-perenni...
3,0111,A,01.11,"Growing of cereals, other than rice, leguminou...",011,01.1,4,This class includes all forms of growing of ce...
4,0112,A,01.12,Growing of rice,011,01.1,4,This class includes:\n- growing of rice


In [None]:
# Step 1 -> step 2: strip the intermediate-level codes from every mapping record
# and drop records that end up with no codes at all.


def _without_lengths(codes, banned_lengths):
    """Return the codes whose string length is not in banned_lengths, order preserved."""
    return [code for code in codes if len(code) not in banned_lengths]


# ISIC intermediate levels look like "01" / "011"; NACE ones like "01" / "011" / "01.1".
isic_banned_lengths = [2, 3]
nace_banned_lengths = [2, 3, 4]

with open('data/processed-output/isic-nace-step1.json', 'r', encoding='utf-8') as step1_file:
    data = json.load(step1_file)

cleaned_data = []
for entry in data:
    kept_isic = _without_lengths(entry['isic_codes'], isic_banned_lengths)
    kept_nace = _without_lengths(entry['nace_codes'], nace_banned_lengths)

    # A record survives as long as one side still has a code.
    if not (kept_isic or kept_nace):
        continue

    cleaned_data.append({
        'isic_codes': kept_isic,
        'nace_codes': kept_nace,
        'confidence': entry['confidence'],
        'match_type': entry['match_type']
    })

# Persist the filtered records for the next step.
with open('data/processed-output/isic-nace-step2.json', 'w', encoding='utf-8') as step2_file:
    json.dump(cleaned_data, step2_file, indent=2, ensure_ascii=False)

print(f"Original records: {len(data)}")
print(f"Cleaned records: {len(cleaned_data)}")
print(f"Removed {len(data) - len(cleaned_data)} records")

Original records: 691
Cleaned records: 473
Removed 218 records


In [None]:
# Coverage check: compare the 4-digit codes present in the cleaned JSON
# against the 4-digit codes of the ISIC / NACE master tables.

# Master reference sets. A NACE 4-digit code is written like "01.11", i.e. 5 characters.
isic_is_4digit = isic_df['ISIC5_Code'].str.len() == 4
nace_is_4digit = nace_df['CODE'].str.len() == 5
isic_master_4digit = set(isic_df.loc[isic_is_4digit, 'ISIC5_Code'])
nace_master_4digit = set(nace_df.loc[nace_is_4digit, 'CODE'])

print(f"ISIC master 4-digit codes: {len(isic_master_4digit)}")
print(f"NACE master 4-digit codes: {len(nace_master_4digit)}")

# Reload the cleaned mapping from disk so the check reflects what was actually saved.
with open('data/processed-output/isic-nace-step2.json', 'r', encoding='utf-8') as step2_file:
    cleaned_data = json.load(step2_file)

# Every code mentioned anywhere in the mapping, per classification.
json_isic_codes = {code for record in cleaned_data for code in record['isic_codes']}
json_nace_codes = {code for record in cleaned_data for code in record['nace_codes']}

# Restrict to the 4-digit level (same length rule as for the master sets).
json_isic_4digit = set(filter(lambda code: len(code) == 4, json_isic_codes))
json_nace_4digit = set(filter(lambda code: len(code) == 5, json_nace_codes))

print(f"\nJSON ISIC 4-digit codes: {len(json_isic_4digit)}")
print(f"JSON NACE 4-digit codes: {len(json_nace_4digit)}")

# Missing = in master but not in the JSON; extra = in the JSON but not in master.
missing_isic = isic_master_4digit.difference(json_isic_4digit)
missing_nace = nace_master_4digit.difference(json_nace_4digit)
extra_isic = json_isic_4digit.difference(isic_master_4digit)
extra_nace = json_nace_4digit.difference(nace_master_4digit)

# Print each headline, followed by the sorted offending codes when there are any.
for headline, codes in (
    (f"\n❌ Missing ISIC 4-digit codes: {len(missing_isic)}", missing_isic),
    (f"\n❌ Missing NACE 4-digit codes: {len(missing_nace)}", missing_nace),
    (f"\n⚠️ Extra ISIC codes in JSON: {len(extra_isic)}", extra_isic),
    (f"\n⚠️ Extra NACE codes in JSON: {len(extra_nace)}", extra_nace),
):
    print(headline)
    if codes:
        print(sorted(codes))

ISIC master 4-digit codes: 463
NACE master 4-digit codes: 651

JSON ISIC 4-digit codes: 463
JSON NACE 4-digit codes: 651

❌ Missing ISIC 4-digit codes: 0

❌ Missing NACE 4-digit codes: 0

⚠️ Extra ISIC codes in JSON: 0

⚠️ Extra NACE codes in JSON: 0


In [None]:
# Ask the model to map the NACE codes that are still unmapped after the first pass.
# Only category C has leftovers, so only the category-C descriptions are sent as context.
# create_code_description is a project helper; its result is joined with newlines below,
# so it is presumably a list of per-code description strings — confirm in prompt_util.
isic_descriptions = create_code_description(isic_df[isic_df['CAT'].isin(['C'])], "ISIC")
nace_descriptions = create_code_description(nace_df[nace_df['CAT'].isin(['C'])], "NACE")

# NOTE: the "UNMAPPED NACE CODES" list inside the prompt is hard-coded; update it by hand
# if the set of leftover codes changes.
# chr(10) is "\n" (keeps a backslash out of the f-string expression), and the doubled
# braces {{ }} render as literal braces in the JSON example.
prompt = f"""
TASK: Map the remaining UNMAPPED NACE codes to their most appropriate ISIC codes.

BACKGROUND:
- You previously mapped many NACE codes to ISIC codes, but some codes remain unmapped.
- These NACE codes might be: 
  1. More granular than ISIC equivalents
  2. European-specific classifications not in ISIC
  3. Codes you may have missed in initial analysis
  4. New or specialized economic activities

UNMAPPED NACE CODES:
['10.52', '24.20', '24.31', '24.32', '24.33', '24.34', '25.12', '28.14']

RELEVANT ISIC CODES (for context):
{chr(10).join(isic_descriptions)}

RELEVANT NACE CODES (for context):
{chr(10).join(nace_descriptions)}



Start by identifying obvious matches, then tackle difficult cases.Please analyze these classification systems and provide a mapping between them. For each mapping, indicate:
1. The ISIC code(s) and NACE code(s) that correspond
2. Confidence level (High/Medium/Low)
3. Type of match (one-to-one, one-to-many, many-to-one, many-to-many)

Return the results in a structured JSON format like:
{{
  "mappings": [
    {{
      "isic_codes": ["A"],
      "nace_codes": ["A"],
      "confidence": "High",
      "match_type": "one-to-one"
    }},
    {{
      "isic_codes": ["0141"],
      "nace_codes": ["01.41", "01.42"],
      "confidence": "High",
      "match_type": "one-to-many"
    }}
  ],
  "unmatched_codes": {{
    "nace": ["05", "90.11"]
  }}
}}

Focus on accuracy over speed.
"""
    
# NOTE(review): `client` is not defined in the cells visible here — presumably an
# OpenAI-compatible client configured elsewhere; confirm it exists before running.
# The whole prompt is sent as a single user message.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": prompt}],
    temperature=0.1,  # Low temperature for consistent output
    max_tokens=4000  # NOTE(review): caps the reply length; a long mapping could be cut off — confirm
)

In [None]:
# Show the raw text of the first choice; the reply wraps the JSON in prose and a
# fenced code block, so the mapping still has to be extracted before it can be parsed.
print(response.choices[0].message.content)

Based on my analysis of the NACE and ISIC classification systems, here is the mapping for the remaining unmapped NACE codes.

```json
{
  "mappings": [
    {
      "isic_codes": ["1050"],
      "nace_codes": ["10.52"],
      "confidence": "High",
      "match_type": "one-to-one",
      "notes": "NACE 10.52 'Manufacture of ice cream and other edible ice' is a direct subset of ISIC 1050 'Manufacture of dairy products', which explicitly includes 'manufacture of ice cream'."
    },
    {
      "isic_codes": ["2410"],
      "nace_codes": ["24.20"],
      "confidence": "High",
      "match_type": "one-to-one",
      "notes": "NACE 24.20 'Manufacture of tubes, pipes, hollow profiles and related fittings, of steel' is a specific product line within the broader ISIC 2410 'Manufacture of basic iron and steel', which includes the manufacture of seamless and welded tubes and pipes of steel."
    },
    {
      "isic_codes": ["2410"],
      "nace_codes": ["24.31"],
      "confidence": "High",
     