In [3]:
import json
from collections import Counter

from sentence_splitter import SentenceSplitter

In [1]:
import sys
# Must run before the two imports below: it extends the module search path
# with '../src', which is resolved relative to the kernel's working directory.
sys.path.append('../src')  # Add src to Python path

# Project-local helpers (presumably located in ../src — confirm).
from loader_local import load_isic_nace_master, process_and_save_data
from aligner_util_local import copy_files_with_rename

# Check that the correspondence file contains no duplicate pairs

In [4]:

# Read the correspondence pairs (a JSON list of pairs) from disk.
with open('../data/web-optim/correspondence.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Lists are not hashable, so every pair is turned into a tuple before tallying.
data_tuples = list(map(tuple, data))
counts = Counter(data_tuples)

# Keep only the pairs that occur more than once (first-seen order is preserved).
duplicates = {}
for seen_pair, n_seen in counts.items():
    if n_seen > 1:
        duplicates[seen_pair] = n_seen

# Report the outcome; this cell only prints, it does not stop the notebook.
if not duplicates:
    print("No duplicate pairs found!")
else:
    print("Duplicate pairs found:")
    for dup_pair, dup_count in duplicates.items():
        print(f"  {list(dup_pair)} appears {dup_count} times")

No duplicate pairs found!


# Make the input data for SentAlign

In [3]:
# Load the ISIC and NACE master tables through the project-local loader.
# The helper returns two objects; later cells use them as pandas DataFrames
# (column assignment, .apply, .head()).
# NOTE(review): data source and schema are defined in loader_local — confirm there.
isic_df, nace_df = load_isic_nace_master()

In [20]:
# Text per ISIC entry: "<code> <title>" on the first line, then the description.
isic_header = isic_df['ISIC5_Section'] + ' ' + isic_df['ISIC5_Title']
# A missing description becomes an empty string, so the header line still survives.
isic_body = isic_df['ISIC5_Desc'].fillna('')
isic_df['ISIC5_Desc2'] = isic_header + '\n' + isic_body
isic_df.head()

Unnamed: 0,ISIC5_Section,CAT,ISIC5_Code,ISIC5_Title,ISIC5_Desc,ISIC5_Desc2
0,A,A,A,"Agriculture, forestry and fishing",This section includes the exploitation of vege...,"A Agriculture, forestry and fishing\nThis sect..."
1,A01,A,01,"Crop and animal production, hunting and relate...","This division includes two basic activities, n...","A01 Crop and animal production, hunting and re..."
2,A011,A,011,Growing of non-perennial crops,This group includes the growing of non-perenni...,A011 Growing of non-perennial crops\nThis grou...
3,A0111,A,0111,"Growing of cereals (except rice), leguminous c...",This class includes all forms of growing of ce...,"A0111 Growing of cereals (except rice), legumi..."
4,A0112,A,0112,Growing of rice,This class includes:\n- growing of rice\n,A0112 Growing of rice\nThis class includes:\n-...


In [21]:
# Export the ISIC table through the project helper, using ISIC5_Section as the
# code column and the combined ISIC5_Desc2 text as the description column.
# NOTE(review): presumably one text file is written per code — confirm in loader_local.
isic_txt_dir = '../data/txt-isic5'
process_and_save_data(
    dataframe=isic_df,
    code_column='ISIC5_Section',
    desc_column='ISIC5_Desc2',
    output_dir=isic_txt_dir,
)

In [22]:
# Alphanumeric NACE code: CAT followed by CODE (e.g. 'A' + '01.1' -> 'A01.1').
# Rows whose CODE is a single character (e.g. the 'A' row shown in the head below)
# keep CAT alone. Computed column-wise instead of with a row-wise apply(axis=1);
# a missing CODE is treated as "no sub-code" (the row-wise lambda raised
# TypeError on len(NaN)).
has_subcode = nace_df['CODE'].str.len().gt(1).fillna(False)
nace_df['ACODE'] = nace_df['CAT'].mask(has_subcode, nace_df['CAT'] + nace_df['CODE'])

# Text per NACE entry: "<ACODE> <HEADING>" on the first line, then the
# description (empty string when DESC is missing).
nace_df['DESC2'] = nace_df['ACODE'] + ' ' + nace_df['HEADING'] + '\n' + nace_df['DESC'].fillna('')
nace_df.head()

Unnamed: 0,ID,CAT,CODE,HEADING,PARENT_ID,PARENT_CODE,LEVEL,DESC,ACODE,DESC2
0,A,A,A,"AGRICULTURE, FORESTRY AND FISHING",,,1,This section includes the exploitation of vege...,A,"A AGRICULTURE, FORESTRY AND FISHING\nThis sect..."
1,01,A,01,"Crop and animal production, hunting and relate...",A,A,2,"This division includes two basic activities, n...",A01,"A01 Crop and animal production, hunting and re..."
2,011,A,01.1,Growing of non-perennial crops,01,01,3,This group includes the growing of non-perenni...,A01.1,A01.1 Growing of non-perennial crops\nThis gro...
3,0111,A,01.11,"Growing of cereals, other than rice, leguminou...",011,01.1,4,This class includes all forms of growing of ce...,A01.11,"A01.11 Growing of cereals, other than rice, le..."
4,0112,A,01.12,Growing of rice,011,01.1,4,This class includes:\n- growing of rice,A01.12,A01.12 Growing of rice\nThis class includes:\n...


In [23]:
# Export the NACE table through the project helper, using the alphanumeric
# ACODE as the code column and the combined DESC2 text as the description column.
# NOTE(review): presumably one text file is written per code — confirm in loader_local.
nace_txt_dir = '../data/txt-nace21'
process_and_save_data(
    dataframe=nace_df,
    code_column='ACODE',
    desc_column='DESC2',
    output_dir=nace_txt_dir,
)

In [None]:
# Pass the correspondence pairs loaded earlier (`data`) together with the two
# exported text folders and the two SentAlign input folders to the project helper.
# NOTE(review): the copy/rename rules live in aligner_util_local — confirm there.
copy_files_with_rename(
    cor_data=data,
    # ISIC side: source folder -> SentAlign input folder
    isic_dir='../data/txt-isic5/',
    isic_dest='../data/input-sa/isic5/',
    # NACE side: source folder -> SentAlign input folder
    nace_dir='../data/txt-nace21/',
    nace_dest='../data/input-sa/nace21/',
)