In [3]:
import ast
import json
from pathlib import Path

import pandas as pd


In [4]:
# Load the raw tariff export: each non-empty line of the file is parsed as
# one JSON document, giving a list with one record per line.
with open('tariff_info.txt', 'r', encoding='utf-8') as file:
    # Blank / whitespace-only lines (for example stray empty lines at the end
    # of the file) are skipped so json.loads() is never fed an empty string.
    data = [json.loads(line) for line in file if line.strip()]

In [5]:
# Build the measures table from the parsed records, then set the "id" column
# to a 1-based row number derived from the frame's index.
df = pd.DataFrame(data)
df = df.assign(id=df.index + 1)

In [6]:
# Show the measures table as this cell's output.
df

Unnamed: 0,id,hsCode,description,origin,type,measureType,geographicalArea,geographicalSigl,regulationRoleType,regulationId,...,startDate,endDate,additionalCodeId,additionalCodeText,exclusions,tariffFormula,footnotes,conditions,hs_info_code,country
0,1,,,ERGA OMNES,Third country duty,103,1011,TOUT,1,R8726580,...,01/01/2005,,,,,0%,[],[],0501,AT
1,2,,,ERGA OMNES,Third country duty,103,1011,TOUT,1,R8726580,...,15/09/1994,,,,,0%,[],[],0510,AT
2,3,,,ERGA OMNES,Import control - CITES,710,1011,TOUT,4,R2309660,...,20/05/2023,01/01/3000,,,,,"[{'code': 370, 'type': 'CD', 'text': 'If the p...","[{'measureID': 0, 'type': 'Y', 'sequenceNumber...",0510,AT
3,4,,,ERGA OMNES,Third country duty,103,1011,TOUT,4,R9922040,...,01/07/2000,01/01/3000,,,,1.60%,[],[],0814,AT
4,5,,,Viet Nam,Tariff preference,142,VN,VN,1,D2007530,...,01/08/2020,,,,,0%,[],[],0814,AT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,397,,,Viet Nam,Tariff preference,142,VN,VN,1,D2007530,...,01/08/2020,,,,,0%,[],[],081310,AT
397,398,,,ERGA OMNES,Third country duty,103,1011,TOUT,4,R9922040,...,01/07/2000,01/01/3000,,,,9.60%,[],[],081320,AT
398,399,,,Viet Nam,Tariff preference,142,VN,VN,1,D2007530,...,01/08/2020,,,,,0%,[],[],081320,AT
399,400,,,ERGA OMNES,Third country duty,103,1011,TOUT,4,R9922040,...,01/07/2000,01/01/3000,,,,3.20%,[],[],081330,AT


In [7]:
# Measure-level table: a copy of df without the 'hsCode' column and without
# the 'footnotes' / 'conditions' columns, which later cells expand into
# their own tables.
measure_df = df.drop(labels=['hsCode', 'footnotes', 'conditions'], axis='columns')

In [8]:
# Expand the per-measure "footnotes" column into one record per footnote,
# linked back to its parent measure through a "measure_id" key.
footnotes_list = []
# Iterate only over the columns that are needed instead of materialising a
# full Series per row with iterrows(); the index is kept for the warning text.
for index, measure_id, footnotes_raw in zip(df.index, df["id"], df["footnotes"]):
    if isinstance(footnotes_raw, str):
        # Empty cells have nothing to expand.
        if footnotes_raw in ("", "[]"):
            continue
        try:
            # The cell is expected to hold the textual form of a Python list;
            # ast.literal_eval() turns it into the actual list.
            footnotes = ast.literal_eval(footnotes_raw)
        except (ValueError, TypeError, SyntaxError) as e:
            # Report the unparsable cell and carry on with the next measure.
            print(f"⚠️ Lỗi khi parse JSON tại dòng {index}: {footnotes_raw} - {e}")
            continue
    else:
        # Values that are not strings (e.g. an already-parsed list) are used
        # as they are; calling pd.isna() on a list would yield an array whose
        # truth value is ambiguous.
        footnotes = footnotes_raw

    # Anything that is not a list (None, NaN, scalars, dicts, ...) is skipped.
    if not isinstance(footnotes, list):
        continue

    for footnote in footnotes:
        # Build a new dict rather than mutating the parsed one, so objects
        # that live inside df are never modified.
        footnotes_list.append({**footnote, "measure_id": measure_id})

# Convert the collected records into a DataFrame.
footnotes_df = pd.DataFrame(footnotes_list)

In [9]:
# Show the expanded footnotes table as this cell's output.
footnotes_df

Unnamed: 0,code,type,text,measure_id
0,370,CD,If the product is mentioned in the list annexe...,3
1,775,CD,In case the goods are exported directly or ind...,9
2,370,CD,If the product is mentioned in the list annexe...,11
3,775,CD,In case the goods are exported directly or ind...,21
4,775,CD,In case the goods are exported directly or ind...,22
...,...,...,...,...
138,750,CD,Eligibility to benefit from this quota is subj...,346
139,83,CD,Entry into free circulation is subject to the ...,348
140,775,CD,In case the goods are exported directly or ind...,371
141,2,PB,The duty rate can be linked to the entry price...,382


In [10]:
# Expand the per-measure "conditions" column into one record per condition,
# linked back to its parent measure through a "measure_id" key.
conditions_list = []
# Iterate only over the columns that are needed instead of materialising a
# full Series per row with iterrows(); the index is kept for the warning text.
for index, measure_id, conditions_raw in zip(df.index, df["id"], df["conditions"]):
    if isinstance(conditions_raw, str):
        # Empty cells have nothing to expand.
        if conditions_raw in ("", "[]"):
            continue
        try:
            # The cell is expected to hold the textual form of a Python list;
            # ast.literal_eval() turns it into the actual list.
            conditions = ast.literal_eval(conditions_raw)
        except (ValueError, TypeError, SyntaxError) as e:
            # Report the unparsable cell and carry on with the next measure.
            print(f"⚠️ Lỗi khi parse JSON tại dòng {index}: {conditions_raw} - {e}")
            continue
    else:
        # Values that are not strings (e.g. an already-parsed list) are used
        # as they are; calling pd.isna() on a list would yield an array whose
        # truth value is ambiguous.
        conditions = conditions_raw

    # Anything that is not a list (None, NaN, scalars, dicts, ...) is skipped.
    if not isinstance(conditions, list):
        continue

    for condition in conditions:
        # Build a new dict rather than mutating the parsed one, so objects
        # that live inside df are never modified.
        conditions_list.append({**condition, "measure_id": measure_id})

# Convert the collected records into a DataFrame and remove the 'measureID'
# key in favour of 'measure_id'. errors='ignore' keeps the cell from raising
# KeyError when no condition was collected or none of them has that key.
conditions_df = pd.DataFrame(conditions_list)
conditions_df = conditions_df.drop(columns=['measureID'], errors='ignore')


In [11]:
# Show the expanded conditions table as this cell's output.
conditions_df

Unnamed: 0,type,sequenceNumber,description,actionDescription,certificateType,certificateNumber,certificateDescription,conditionExpression,tariffFormula,measure_id
0,Y,1,Other conditions,Import/export allowed after control,C,400,"Presentation of the required ""CITES"" certificate",,,3
1,Y,2,Other conditions,Import/export allowed after control,Y,900,Declared goods do not belong to the Washington...,,,3
2,Y,3,Other conditions,Import/export not allowed after control,,,,,,3
3,B,1,Presentation of a certificate/licence/document,Apply the amount of the action (see components),Y,155,Products exported directly or indirectly from ...,,50.00%,9
4,B,2,Presentation of a certificate/licence/document,Apply the amount of the action (see components),,,,,0%,9
...,...,...,...,...,...,...,...,...,...,...
236,Q,2,Presentation of an endorsed certificate/licence,Measure not applicable,,,,,,346
237,C,1,Presentation of a certificate/licence/document,Entry into free circulation allowed,U,004,Certificate of origin for imports of products ...,,,348
238,C,2,Presentation of a certificate/licence/document,The entry into free circulation is not allowed,,,,,,348
239,B,1,Presentation of a certificate/licence/document,Apply the amount of the action (see components),Y,155,Products exported directly or indirectly from ...,,50.00%,371


In [13]:
# Write the three tables to CSV files under output/.
# DataFrame.to_csv() does not create missing parent directories, so make sure
# the target directory exists before writing.
output_dir = Path('output')
output_dir.mkdir(parents=True, exist_ok=True)

# The DataFrame index is still written as the first (unnamed) column, so the
# file layout is the same as before.
measure_df.to_csv(output_dir / 'tariff_measure.csv')
footnotes_df.to_csv(output_dir / 'tariff_measure_footnotes_df.csv')
conditions_df.to_csv(output_dir / 'tariff_measure_conditions_df.csv')