In [None]:
import os
import json

from openai import OpenAI
from dotenv import load_dotenv
from tqdm.notebook import tqdm

In [5]:
import sys
sys.path.append('../src')  # Add src to Python path

from loader import load_isic_nace_master
from prompt_util import create_code_description

In [22]:
# Read secrets from a local .env file (if any) into the process environment.
load_dotenv()
api_key = os.getenv("DEEPSEEK_API_KEY")

# Fail fast when the key is missing. With api_key=None the OpenAI client
# falls back to the OPENAI_API_KEY environment variable, which would then be
# sent to the DeepSeek base_url used when the client is created.
if not api_key:
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; define it in the environment or in a .env file"
    )

In [3]:
# Load the ISIC and NACE master tables (the loader module lives in ../src)
# and preview the first ISIC rows.
isic_df, nace_df = load_isic_nace_master()
isic_df.head()

Unnamed: 0,ISIC5_Section,CAT,ISIC5_Code,ISIC5_Title,ISIC5_Desc
0,A,A,A,"Agriculture, forestry and fishing",This section includes the exploitation of vege...
1,A01,A,01,"Crop and animal production, hunting and relate...","This division includes two basic activities, n..."
2,A011,A,011,Growing of non-perennial crops,This group includes the growing of non-perenni...
3,A0111,A,0111,"Growing of cereals (except rice), leguminous c...",This class includes all forms of growing of ce...
4,A0112,A,0112,Growing of rice,This class includes:\n- growing of rice\n


In [4]:
# Preview the first NACE rows for comparison with the ISIC table.
nace_df.head()

Unnamed: 0,ID,CAT,CODE,HEADING,PARENT_ID,PARENT_CODE,LEVEL,DESC
0,A,A,A,"AGRICULTURE, FORESTRY AND FISHING",,,1,This section includes the exploitation of vege...
1,01,A,01,"Crop and animal production, hunting and relate...",A,A,2,"This division includes two basic activities, n..."
2,011,A,01.1,Growing of non-perennial crops,01,01,3,This group includes the growing of non-perenni...
3,0111,A,01.11,"Growing of cereals, other than rice, leguminou...",011,01.1,4,This class includes all forms of growing of ce...
4,0112,A,01.12,Growing of rice,011,01.1,4,This class includes:\n- growing of rice


# Check that the code works on a single batch (sections A and B)

In [None]:
# Keep only the rows whose CAT value is one of the sections under test, then
# turn each remaining row into a description via create_code_description.
cat_prompted = ['A', 'B']
isic_descriptions = create_code_description(isic_df[isic_df['CAT'].isin(cat_prompted)], "ISIC")
nace_descriptions = create_code_description(nace_df[nace_df['CAT'].isin(cat_prompted)], "NACE")

In [11]:
# Spot-check the first few generated ISIC descriptions.
isic_descriptions[:5]

['ISIC Code: A \n Title: Agriculture, forestry and fishing \n Description: This section includes the exploitation of vegetal and animal natural resources, comprising the activities of growing of crops, raising and breeding of animals, harvesting of timber and other plants, production of animal products from a farm or natural habitats.\nThis section includes also:\norganic agriculture, soilless cultivation of crops, hydroponics and aquaponics, aquaculture, the growing of genetically modified crops and the raising of genetically modified animals\nThis section excludes undifferentiated subsistence goods-producing activities of households, which is classified in class 9810. \n',
 'ISIC Code: 01 \n Title: Crop and animal production, hunting and related service activities \n Description: This division includes two basic activities, namely the production of crop products and production of _x000D_animal products. This division includes growing of crops on open land, under cover or in greenhous

In [None]:
# Build the prompt for the single trial batch.
# - chr(10) is the newline character; it is used because f-string expressions
#   could not contain a backslash before Python 3.12.
# - Doubled braces ({{ and }}) produce literal braces in the JSON example.
prompt = f"""
I need to create a correspondence mapping between two industrial classification systems: ISIC (International Standard Industrial Classification) and NACE (Statistical Classification of Economic Activities in the European Community EU). This is a new classification version so the correspondence is not readily available. Since there is a token limit, I will provide you with the data in parts.

Here are the ISIC codes with their titles and descriptions:
{chr(10).join(isic_descriptions)}

Here are the NACE codes with their titles and descriptions:
{chr(10).join(nace_descriptions)}

Please analyze these classification systems and provide a mapping between them. For each mapping, indicate:
1. The ISIC code(s) and NACE code(s) that correspond
2. Confidence level (High/Medium/Low)
3. Type of match (one-to-one, one-to-many, many-to-one, many-to-many)

Return the results in a structured JSON format like:
{{
  "mappings": [
    {{
      "isic_codes": ["A"],
      "nace_codes": ["A"],
      "confidence": "High",
      "match_type": "one-to-one"
    }},
    {{
      "isic_codes": ["0141"],
      "nace_codes": ["01.41", "01.42"],
      "confidence": "High",
      "match_type": "one-to-many"
    }}
  ],
  "unmatched_codes": {{
    "isic": ["05", "10", "9000"],
    "nace": ["05", "90.11"]
  }}
}}

Important matching principles:
1. Check hierarchical relationships
2. Look for semantic similarity in titles and descriptions
3. Consider cross-references between systems
4. Note that NACE is based on ISIC but has more granular European adaptations
5. Avoid suggesting matches for codes not provided in the input data
6. Only map codes of the same level (e.g., 4-digit to 4-digit)
7. if NACE is more granular and ISIC is broader, try to map multiple NACE codes to a single ISIC code; provided that no other ISIC code matches those NACE codes
"""

In [13]:
def estimate_tokens(text):
    """
    Return a coarse token count for ``text``.

    Two heuristics are evaluated and the larger result wins:
      * whitespace-separated words divided by 0.75
      * number of characters divided by 4
    Each figure is truncated to an int before the comparison. The estimate
    is tuned for English prose and is only approximate for other content.
    """
    by_words = int(len(text.split()) / 0.75)
    by_chars = int(len(text) / 4)
    return by_words if by_words >= by_chars else by_chars

# Gauge the size of the current prompt before sending it to the API.
estimated_tokens = estimate_tokens(prompt)
estimated_tokens

23576

In [23]:
# The OpenAI client is pointed at DeepSeek's API by overriding base_url;
# the key comes from the DEEPSEEK_API_KEY environment variable read earlier.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com"
)


In [15]:
# Trial request: send the current prompt as a single user message.
# max_tokens limits the length of the generated reply.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[{"role": "user", "content": prompt}],
    temperature=0.1,  # Low temperature for consistent output
    max_tokens=4000
)

In [70]:
# Show the raw text of the first (only) choice returned by the model.
print(response.choices[0].message.content)

Looking at the provided data for Sections A and B, I can create a detailed mapping. The structures are extremely similar, with NACE being a more granular European adaptation of ISIC. The main differences are in the formatting of codes (ISIC uses 4-digit codes without decimals, NACE uses a decimal system) and some minor title variations and splits.

Here is the structured JSON mapping:

```json
{
  "mappings": [
    {
      "isic_codes": ["A"],
      "nace_codes": ["A"],
      "confidence": "High",
      "match_type": "one-to-one"
    },
    {
      "isic_codes": ["01"],
      "nace_codes": ["01"],
      "confidence": "High",
      "match_type": "one-to-one"
    },
    {
      "isic_codes": ["011"],
      "nace_codes": ["01.1"],
      "confidence": "High",
      "match_type": "one-to-one"
    },
    {
      "isic_codes": ["0111"],
      "nace_codes": ["01.11"],
      "confidence": "High",
      "match_type": "one-to-one"
    },
    {
      "isic_codes": ["0112"],
      "nace_codes": ["0

In [None]:
def save_response_basic(response_content, filename, folder="data/raw-output"):
    """Save response content to a UTF-8 text file and return its path.

    Args:
        response_content: Text to write to the file.
        filename: Name of the file to create; may include sub-directories
            relative to ``folder``.
        folder: Directory that receives the file. An empty string means the
            current working directory.

    Returns:
        The path of the written file, i.e. ``os.path.join(folder, filename)``.
    """
    filepath = os.path.join(folder, filename)

    # Create the directory the file actually lands in. Deriving it from the
    # full path (rather than calling makedirs on ``folder``) also covers
    # filenames that contain sub-directories, and avoids os.makedirs(""),
    # which raises FileNotFoundError when ``folder`` is empty.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the content (an existing file of the same name is overwritten).
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(response_content)

    print(f"✅ Response saved to: {filepath}")
    return filepath


In [17]:
# Name the output file after the concatenated section letters
# (e.g. ['A', 'B'] -> "AB.md").
nametxt = ''.join(cat_prompted)
save_response_basic(response.choices[0].message.content, filename=f"{nametxt}.md")

✅ Response saved to: raw-output\AB.md


'raw-output\\AB.md'

# Full batch run over all section groups

In [None]:
def create_prompt(isic_desc, nace_desc):
    """
    Assemble the ISIC-to-NACE correspondence prompt for one batch.

    ``isic_desc`` and ``nace_desc`` are iterables of description strings.
    Each is joined with newlines and placed in its own section of the
    prompt, followed by the requested JSON output example and the list of
    matching principles. Returns the finished prompt as a single string.
    """
    isic_block = "\n".join(isic_desc)
    nace_block = "\n".join(nace_desc)
    # Doubled braces below render as literal braces in the JSON example.
    return f"""
I need to create a correspondence mapping between two industrial classification systems: ISIC (International Standard Industrial Classification) and NACE (Statistical Classification of Economic Activities in the European Community EU). This is a new classification version so the correspondence is not readily available. Since there is a token limit, I will provide you with the data in parts.

Here are the ISIC codes with their titles and descriptions:
{isic_block}

Here are the NACE codes with their titles and descriptions:
{nace_block}

Please analyze these classification systems and provide a mapping between them. For each mapping, indicate:
1. The ISIC code(s) and NACE code(s) that correspond
2. Confidence level (High/Medium/Low)
3. Type of match (one-to-one, one-to-many, many-to-one, many-to-many)

Return the results in a structured JSON format like:
{{
  "mappings": [
    {{
      "isic_codes": ["A"],
      "nace_codes": ["A"],
      "confidence": "High",
      "match_type": "one-to-one"
    }},
    {{
      "isic_codes": ["0141"],
      "nace_codes": ["01.41", "01.42"],
      "confidence": "High",
      "match_type": "one-to-many"
    }}
  ],
  "unmatched_codes": {{
    "isic": ["05", "10", "9000"],
    "nace": ["05", "90.11"]
  }}
}}

Important matching principles:
1. Check hierarchical relationships
2. Look for semantic similarity in titles and descriptions
3. Consider cross-references between systems
4. Note that NACE is based on ISIC but has more granular European adaptations
5. Avoid suggesting matches for codes not provided in the input data
6. Only map codes of the same level (e.g., 4-digit to 4-digit)
7. if NACE is more granular and ISIC is broader, try to map multiple NACE codes to a single ISIC code; provided that no other ISIC code matches those NACE codes
"""

In [21]:
# Section letters grouped into batches; each group is sent as one prompt.
cat_prompted_list = [
    ['A', 'B'],
    ['C'],
    ['D', 'E', 'F'],
    ['G'],
    ['H', 'I'],
    ['J', 'K'],
    ['L', 'M', 'N'],
    ['O', 'P'],
    ['Q', 'R', 'S'],
    ['T', 'U', 'V']
]

# Batches whose reply stopped at the max_tokens cap and is therefore incomplete.
truncated_batches = []

for cat_prompted in tqdm(cat_prompted_list):
    isic_descriptions = create_code_description(isic_df[isic_df['CAT'].isin(cat_prompted)], "ISIC")
    nace_descriptions = create_code_description(nace_df[nace_df['CAT'].isin(cat_prompted)], "NACE")
    prompt = create_prompt(isic_descriptions, nace_descriptions)
    
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,  # Low temperature for consistent output
        max_tokens=8000
    )
    print(response.usage)
    
    nametxt = ''.join(cat_prompted)

    # finish_reason == "length" means generation stopped at max_tokens, so the
    # JSON in the reply is cut off mid-document. A recorded run of this loop
    # used all 8000 completion tokens for section C. Flag such batches rather
    # than saving them silently; the partial reply is still written to disk.
    if response.choices[0].finish_reason == "length":
        truncated_batches.append(nametxt)
        print(f"⚠️ Response for {nametxt} hit max_tokens and is truncated")

    save_response_basic(response.choices[0].message.content, filename=f"{nametxt}.md")

if truncated_batches:
    print(f"Truncated batches (re-run with smaller inputs): {truncated_batches}")

  0%|          | 0/10 [00:00<?, ?it/s]

CompletionUsage(completion_tokens=3672, prompt_tokens=22586, total_tokens=26258, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=22528), prompt_cache_hit_tokens=22528, prompt_cache_miss_tokens=58)
✅ Response saved to: raw-output\AB.md
CompletionUsage(completion_tokens=8000, prompt_tokens=103093, total_tokens=111093, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=103040), prompt_cache_hit_tokens=103040, prompt_cache_miss_tokens=53)
✅ Response saved to: raw-output\C.md
CompletionUsage(completion_tokens=2234, prompt_tokens=19137, total_tokens=21371, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=19136), prompt_cache_hit_tokens=19136, prompt_cache_miss_tokens=1)
✅ Response saved to: raw-output\DEF.md
CompletionUsage(completion_tokens=3143, prompt_tokens=26077, total_tokens=29220, completion_tokens_details=None, pr