In [78]:
import os
import json

import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from tqdm.notebook import tqdm

In [57]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
api_key = os.getenv("DEEPSEEK_API_KEY")
# Fail fast when the key is missing: with api_key=None the OpenAI client falls
# back to the OPENAI_API_KEY environment variable, which would then be sent to
# the DeepSeek endpoint used by this notebook (or fail later with a less
# obvious error).
if not api_key:
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; add it to your environment or .env file."
    )

In [28]:
# Read the ISIC code table from the Excel workbook under data/.
isic_df = pd.read_excel('data/isic5codes.xlsx')
# Preview the first rows to check the loaded columns.
isic_df.head()

Unnamed: 0,ISIC5_Section,CAT,ISIC5_Code,ISIC5_Title
0,A,A,A,"Agriculture, forestry and fishing"
1,A01,A,01,"Crop and animal production, hunting and relate..."
2,A011,A,011,Growing of non-perennial crops
3,A0111,A,0111,"Growing of cereals (except rice), leguminous c..."
4,A0112,A,0112,Growing of rice


In [14]:
# Read the NACE code table from the Excel workbook under data/.
nace_df = pd.read_excel('data/nace21codes.xlsx')
# Preview the first rows to check the loaded columns.
nace_df.head()

Unnamed: 0,ID,CAT,CODE,HEADING,PARENT_ID,PARENT_CODE,LEVEL
0,A,A,A,"AGRICULTURE, FORESTRY AND FISHING",,,1
1,01,A,01,"Crop and animal production, hunting and relate...",A,A,2
2,011,A,01.1,Growing of non-perennial crops,01,01,3
3,0111,A,01.11,"Growing of cereals, other than rice, leguminou...",011,01.1,4
4,0112,A,01.12,Growing of rice,011,01.1,4


# Check that the pipeline works on a small sample (sections A and B)

In [47]:
def create_code_description(df, system_name):
    """Render each row of a classification table as a one-line text label.

    Args:
        df: DataFrame holding the code and title columns of the chosen system
            ("ISIC5_Code"/"ISIC5_Title" for ISIC, "CODE"/"HEADING" otherwise).
        system_name: Prefix placed at the start of every line. The exact string
            "ISIC" selects the ISIC column names; any other value selects the
            CODE/HEADING pair.

    Returns:
        A list with one string per row: the system name, the code and the
        title separated by single spaces, followed by a trailing space and a
        newline character.
    """
    if system_name == "ISIC":
        code_col, title_col = "ISIC5_Code", "ISIC5_Title"
    else:
        code_col, title_col = "CODE", "HEADING"

    descriptions = []
    for _, record in df.iterrows():
        descriptions.append(f"{system_name} {record[code_col]} {record[title_col]} \n")
    return descriptions

# Trial run on sections A and B only: filter both tables on the CAT column,
# then build the per-code text lines for each classification system.
cat_prompted = ['A', 'B']
isic_descriptions = create_code_description(isic_df[isic_df['CAT'].isin(cat_prompted)], "ISIC")
nace_descriptions = create_code_description(nace_df[nace_df['CAT'].isin(cat_prompted)], "NACE")

In [48]:
# Show the first five generated ISIC lines to check their format.
isic_descriptions[:5]

['ISIC A Agriculture, forestry and fishing \n',
 'ISIC 01 Crop and animal production, hunting and related service activities \n',
 'ISIC 011 Growing of non-perennial crops \n',
 'ISIC 0111 Growing of cereals (except rice), leguminous crops and oil seeds \n',
 'ISIC 0112 Growing of rice \n']

In [None]:
# Build the trial prompt: task description, the ISIC and NACE lines generated
# for the selected sections, the requested JSON output shape, and the matching
# principles the model should follow. Doubled braces are literal braces in the
# f-string; chr(10) is the newline character used to join the lines.
prompt = f"""
I need to create a correspondence mapping between two industrial classification systems: ISIC (International Standard Industrial Classification) and NACE (Statistical Classification of Economic Activities in the European Community EU). This is a new classification version so the correspondence is not readily available. Since there is a token limit, I will provide you with the data in parts.

Here are the ISIC codes with their titles:
{chr(10).join(isic_descriptions)}

Here are the NACE codes with their titles:
{chr(10).join(nace_descriptions)}

Please analyze these classification systems and provide a mapping between them. For each mapping, indicate:
1. The ISIC code(s) and NACE code(s) that correspond
2. Confidence level (High/Medium/Low)
3. Type of match (one-to-one, one-to-many, many-to-one, many-to-many)

Return the results in a structured JSON format like:
{{
  "mappings": [
    {{
      "isic_codes": ["A"],
      "nace_codes": ["A"],
      "confidence": "High",
      "match_type": "one-to-one"
    }},
    {{
      "isic_codes": ["01"],
      "nace_codes": ["01.11", "01.12"],
      "confidence": "High",
      "match_type": "one-to-many"
    }}
  ],
  "unmatched_codes": {{
    "isic": ["05", "10", "9000"],
    "nace": ["05", "90.11"]
  }}
}}

Important matching principles:
1. Check hierarchical relationships (ISIC sections/divisions vs NACE sections/divisions)
2. Look for semantic similarity in titles
3. Consider cross-references between systems
4. Note that NACE is based on ISIC but has more granular European adaptations
"""

In [67]:
def estimate_tokens(text):
    """
    Approximate the token count of ``text`` without running a tokenizer.

    Two rule-of-thumb ratios are evaluated: one token per 0.75 whitespace-
    separated words, and one token per 4 characters. Each is truncated to an
    int and the larger of the two is returned.
    """
    by_words = int(len(text.split()) / 0.75)
    by_chars = int(len(text) / 4)
    return by_chars if by_chars > by_words else by_words

# Estimate the size of the trial prompt; the bare name on the last line displays the value.
estimated_tokens = estimate_tokens(prompt)
estimated_tokens

2283

In [68]:
# Display the raw prompt string to inspect its final formatting.
prompt

'\nI need to create a correspondence mapping between two industrial classification systems: ISIC (International Standard Industrial Classification) and NACE (Statistical Classification of Economic Activities in the European Community EU). This is a new classification version so the correspondence is not readily available. Since there is a token limit, I will provide you with the data in parts.\n\nHere are the ISIC codes with their titles:\nISIC A Agriculture, forestry and fishing \n\nISIC 01 Crop and animal production, hunting and related service activities \n\nISIC 011 Growing of non-perennial crops \n\nISIC 0111 Growing of cereals (except rice), leguminous crops and oil seeds \n\nISIC 0112 Growing of rice \n\nISIC 0113 Growing of vegetables and melons, roots and tubers \n\nISIC 0114 Growing of sugar cane \n\nISIC 0115 Growing of tobacco \n\nISIC 0116 Growing of fibre crops \n\nISIC 0119 Growing of other non-perennial crops \n\nISIC 012 Growing of perennial crops \n\nISIC 0121 Growing

In [None]:
# OpenAI SDK client pointed at the DeepSeek endpoint through base_url,
# authenticated with the key read from DEEPSEEK_API_KEY.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com"
)


In [None]:
# Send the trial prompt as a single user message.
# NOTE(review): max_tokens=4000 limits the reply length, so a large mapping may
# be cut off mid-JSON — consider checking response.choices[0].finish_reason.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": prompt}],
    temperature=0.1,  # Low temperature for consistent output
    max_tokens=4000
)

In [70]:
# Print the reply text of the first choice so line breaks render as written.
print(response.choices[0].message.content)

Looking at the provided data for Sections A and B, I can create a detailed mapping. The structures are extremely similar, with NACE being a more granular European adaptation of ISIC. The main differences are in the formatting of codes (ISIC uses 4-digit codes without decimals, NACE uses a decimal system) and some minor title variations and splits.

Here is the structured JSON mapping:

```json
{
  "mappings": [
    {
      "isic_codes": ["A"],
      "nace_codes": ["A"],
      "confidence": "High",
      "match_type": "one-to-one"
    },
    {
      "isic_codes": ["01"],
      "nace_codes": ["01"],
      "confidence": "High",
      "match_type": "one-to-one"
    },
    {
      "isic_codes": ["011"],
      "nace_codes": ["01.1"],
      "confidence": "High",
      "match_type": "one-to-one"
    },
    {
      "isic_codes": ["0111"],
      "nace_codes": ["01.11"],
      "confidence": "High",
      "match_type": "one-to-one"
    },
    {
      "isic_codes": ["0112"],
      "nace_codes": ["0

In [72]:
def save_response_basic(response_content, filename, folder="raw-output"):
    """Write ``response_content`` to ``folder/filename`` as UTF-8 text.

    The folder is created when it does not exist, and an existing file with
    the same name is overwritten. A confirmation line is printed.

    Returns:
        The path of the written file, as built by ``os.path.join``.
    """
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, filename)

    with open(target, mode='w', encoding='utf-8') as out_file:
        out_file.write(response_content)

    print(f"✅ Response saved to: {target}")
    return target


In [73]:
# Name the output file after the concatenated section letters (here "AB.txt")
# and save the raw reply text.
nametxt = ''.join(cat_prompted)
save_response_basic(response.choices[0].message.content, filename=f"{nametxt}.txt")

✅ Response saved to: raw-output\AB.txt


'raw-output\\AB.txt'

# Full batch run over the remaining sections

In [71]:
def create_prompt(isic_desc, nace_desc):
    """Assemble the ISIC-to-NACE correspondence prompt for one batch of codes.

    Args:
        isic_desc: Iterable of text lines describing ISIC codes.
        nace_desc: Iterable of text lines describing NACE codes.

    Returns:
        The prompt string: task description, both code listings (each joined
        with a newline between entries), the requested JSON output shape, and
        the matching principles.
    """
    isic_listing = "\n".join(isic_desc)
    nace_listing = "\n".join(nace_desc)
    # Doubled braces below are literal braces in the rendered prompt.
    return f"""
I need to create a correspondence mapping between two industrial classification systems: ISIC (International Standard Industrial Classification) and NACE (Statistical Classification of Economic Activities in the European Community EU). This is a new classification version so the correspondence is not readily available. Since there is a token limit, I will provide you with the data in parts.

Here are the ISIC codes with their titles:
{isic_listing}

Here are the NACE codes with their titles:
{nace_listing}

Please analyze these classification systems and provide a mapping between them. For each mapping, indicate:
1. The ISIC code(s) and NACE code(s) that correspond
2. Confidence level (High/Medium/Low)
3. Type of match (one-to-one, one-to-many, many-to-one, many-to-many)

Return the results in a structured JSON format like:
{{
  "mappings": [
    {{
      "isic_codes": ["A"],
      "nace_codes": ["A"],
      "confidence": "High",
      "match_type": "one-to-one"
    }},
    {{
      "isic_codes": ["01"],
      "nace_codes": ["01.11", "01.12"],
      "confidence": "High",
      "match_type": "one-to-many"
    }}
  ],
  "unmatched_codes": {{
    "isic": ["05", "10", "9000"],
    "nace": ["05", "90.11"]
  }}
}}

Important matching principles:
1. Check hierarchical relationships (ISIC sections/divisions vs NACE sections/divisions)
2. Look for semantic similarity in titles
3. Consider cross-references between systems
4. Note that NACE is based on ISIC but has more granular European adaptations
"""

In [80]:
# Section groups sent as one request each. ['A', 'B'] is commented out because
# it was already covered by the single-prompt trial run saved as AB.txt.
cat_prompted_list = [
    # ['A', 'B'],
    ['C'],
    ['D', 'E', 'F'],
    ['G'],
    ['H', 'I'],
    ['J', 'K'],
    ['L', 'M', 'N'],
    ['O', 'P'],
    ['Q', 'R', 'S'],
    ['T', 'U', 'V']
]

for cat_prompted in tqdm(cat_prompted_list):
    # Build the per-group code listings and the prompt around them.
    isic_descriptions = create_code_description(isic_df[isic_df['CAT'].isin(cat_prompted)], "ISIC")
    nace_descriptions = create_code_description(nace_df[nace_df['CAT'].isin(cat_prompted)], "NACE")
    prompt = create_prompt(isic_descriptions, nace_descriptions)

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,  # Low temperature for consistent output
        max_tokens=4000
    )

    nametxt = ''.join(cat_prompted)
    choice = response.choices[0]

    # finish_reason == "length" means generation stopped at max_tokens, so the
    # JSON in the saved file is cut off; flag it instead of saving silently.
    if choice.finish_reason == "length":
        print(f"⚠️ Response for {nametxt} was truncated at max_tokens; saved output is incomplete.")

    # The API may return no text content; skip the write rather than letting
    # f.write(None) raise and abort the remaining groups.
    if choice.message.content is None:
        print(f"⚠️ No content returned for {nametxt}; nothing saved.")
        continue

    save_response_basic(choice.message.content, filename=f"{nametxt}.txt")

  0%|          | 0/9 [00:00<?, ?it/s]

✅ Response saved to: raw-output\C.txt
✅ Response saved to: raw-output\DEF.txt
✅ Response saved to: raw-output\DEF.txt
✅ Response saved to: raw-output\G.txt
✅ Response saved to: raw-output\G.txt
✅ Response saved to: raw-output\HI.txt
✅ Response saved to: raw-output\HI.txt
✅ Response saved to: raw-output\JK.txt
✅ Response saved to: raw-output\JK.txt
✅ Response saved to: raw-output\LMN.txt
✅ Response saved to: raw-output\LMN.txt
✅ Response saved to: raw-output\OP.txt
✅ Response saved to: raw-output\OP.txt
✅ Response saved to: raw-output\QRS.txt
✅ Response saved to: raw-output\QRS.txt
✅ Response saved to: raw-output\TUV.txt
✅ Response saved to: raw-output\TUV.txt
