In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import json
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

In [4]:
countries = pd.read_csv('countries.csv')['code'].tolist()

In [5]:
def fetch_bycode_with_retry(retries=5, delay=5):
    """Gửi request với retry khi lỗi."""
    for attempt in range(retries):
        try:
            url = "https://trade.ec.europa.eu/access-to-markets/api/v2/nomenclature/products"
            params = {
                "country": country,
                "lang": "EN"
            }

            response = requests.get(url, params=params, timeout=60)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print(f"Request error: {e} - Retrying {attempt+1}/{retries}...")
            time.sleep(delay)
    return None

In [6]:
def fetch_byid_with_retry(id, retries=5, delay=5):
    """Gửi request với retry khi lỗi."""
    for attempt in range(retries):
        try:
            url = "https://trade.ec.europa.eu/access-to-markets/api/v2/nomenclature/products"
            params = {
                "parent": id,
                "country": country,
                "lang": "EN"
            }

            response = requests.get(url,params=params,  timeout=60)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print(f"Request error: {e} - Retrying {attempt+1}/{retries}...")
            time.sleep(delay)
    return None

lần đầu chạy đoạn code bên dưới để lấy toàn bộ dữ liệu

In [None]:
# all_sections_df = []
# for country in countries:
#     section_info = fetch_bycode_with_retry().json()
#     section_df = pd.DataFrame(section_info)
#     section_df['index'] = section_df.index
#     section_df['crawled'] = ''
#     section_df['country'] = country
#     all_sections_df.append(section_df)
#     all_dfs =  pd.concat(all_sections_df, ignore_index=True) 
#     all_dfs.to_csv('section_info.csv', index=False)
# df = pd.read_csv('section_info.csv')
# df

từ lần 2 chạy từ ô bên dưới

In [37]:
df = pd.read_csv('hs_info.csv', dtype={'id': str})
df

Unnamed: 0,id,code,description,hasChildren,section,index,crawled,country,parent_id,error_message
0,-1,,Live animals; animal products,True,"{'description': 'SECTION I', 'chapterFrom': '0...",0,,AT,,
1,-2,,Vegetable products,True,"{'description': 'SECTION II', 'chapterFrom': '...",1,,AT,,
2,-3,,Animal or vegetable fats and oils and their cl...,True,"{'description': 'SECTION III', 'chapterFrom': ...",2,,AT,,
3,-4,,"Prepared foodstuffs; beverages, spirits and vi...",True,"{'description': 'SECTION IV', 'chapterFrom': '...",3,,AT,,
4,-5,,Mineral products,True,"{'description': 'SECTION V', 'chapterFrom': '2...",4,,AT,,
...,...,...,...,...,...,...,...,...,...,...
562,-17,,"Vehicles, aircraft, vessels and associated tra...",True,"{'description': 'SECTION XVII', 'chapterFrom':...",16,,SE,,
563,-18,,"Optical, photographic, cinematographic, measur...",True,"{'description': 'SECTION XVIII', 'chapterFrom'...",17,,SE,,
564,-19,,Arms and ammunition; parts and accessories the...,True,"{'description': 'SECTION XIX', 'chapterFrom': ...",18,,SE,,
565,-20,,Miscellaneous manufactured articles,True,"{'description': 'SECTION XX', 'chapterFrom': '...",19,,SE,,


In [None]:
for country in countries:
    while ((df['country'] == country) & (df['hasChildren'] == True) & ((df['crawled'] == '') | pd.isna(df['crawled']) | (df['crawled'] == 'Error'))).any():
        for i in df.index:
            if df.loc[i, 'country'] == country and df.loc[i, 'hasChildren'] and (df.loc[i, 'crawled'] == '' or pd.isna(df.loc[i, 'crawled']) or df.loc[i, 'crawled'] == 'Error'):
                id = df.loc[i, 'id']
                attempts = 0
                success = False
                error_occurred = False  # Biến để kiểm soát lỗi

                while attempts < 5 and not success:
                    try:
                        print(f"Fetching index: {i} id: {id}... (Attempt {attempts}/5)")
                        
                        response = fetch_byid_with_retry(id)  
                        if response and response.status_code == 200:
                            df2 = pd.DataFrame(response.json())
                            df2['country'] = country
                            df2['parent_id'] = id
                            print(df2)
                            
                            df = pd.concat([df, df2], ignore_index=True).reset_index(drop=True)
                            df.loc[i, 'crawled'] = 'Done'
                            df.to_csv('hs_info.csv')
                            print(f"✅ Done fetching index: {i} id: {id}")
                            success = True
                        else:
                            raise Exception(f"Failed request, status code: {response.status_code}")
                    
                    except Exception as e:
                        attempts += 1
                        error_message = str(e)
                        print(f"❌ Error fetching index: {i} id: {id}... (Attempt {attempts}/5) - {e}")
                        if attempts == 5: 
                            error_occurred = True

                if error_occurred: 
                    df.loc[i, 'crawled'] = 'Error'
                    df.loc[i, 'error_message'] = error_message
                    df.to_csv('hs_info.csv')
                    print("⏳ Encountered multiple errors. Sleeping for 1 minutes...")
                    
                    # Mở trình duyệt để nhập captcha
                    print("🔓 Mở trình duyệt để nhập captcha...")
                    driver = webdriver.Chrome()
                    driver.get("https://trade.ec.europa.eu/access-to-markets/en/search?product=0101&origin=VN&destination=AT")  # Thay bằng URL cần mở
                    print("⌨️ Vui lòng nhập captcha, sau đó đóng trình duyệt.")

                    # Đo thời gian bắt đầu
                    start_captcha_time = time.time()
                    wait_time = 600  # 10 phút (600 giây)

                    # Chờ người dùng tự đóng trình duyệt, nếu quá 10 phút thì tự động đóng
                    while True:
                        try:
                            if time.time() - start_captcha_time > wait_time:
                                print("⏳ Quá 10 phút, tự động đóng trình duyệt.")
                                driver.quit()
                                break  # Thoát khỏi vòng lặp

                            # Kiểm tra trình duyệt còn mở không
                            driver.title  
                            
                            # Kiểm tra trang có bị treo không
                            if driver.execute_script("return document.readyState") != "complete":
                                print("⚠️ Trang web có thể bị treo, chờ thêm...")
                            
                        except WebDriverException:
                            print("✅ Trình duyệt đã được đóng.")
                            break  # Nếu trình duyệt bị đóng, thoát vòng lặp
                        
                        time.sleep(5)  # Tránh vòng lặp chạy quá nhanh, gây tải CPU
                    
                    time.sleep(10)  # Đợi 10 giây trước khi tiếp tục
                    break
            


Fetching index: 0 id: -1... (Attempt 0/5)
    id code                                        description  hasChildren  \
0  301   01                                       Live animals         True   
1  302   02                         Meat and edible meat offal         True   
2  303   03  Fish and crustaceans, molluscs and other aquat...         True   
3  304   04  Dairy produce; birds' eggs; natural honey; edi...         True   
4  305   05  Products of animal origin, not elsewhere speci...         True   

  country parent_id  
0      AT        -1  
1      AT        -1  
2      AT        -1  
3      AT        -1  
4      AT        -1  
✅ Done fetching index: 0 id: -1
Fetching index: 1 id: -2... (Attempt 0/5)


  df.loc[i, 'crawled'] = 'Done'


    id code                                        description  hasChildren  \
0  306   06  Live trees and other plants; bulbs, roots and ...         True   
1  307   07     Edible vegetables and certain roots and tubers         True   
2  308   08  Edible fruit and nuts; peel of citrus fruit or...         True   
3  309   09                       Coffee, tea, maté and spices         True   
4  310   10                                            Cereals         True   
5  311   11  Products of the milling industry; malt; starch...         True   
6  312   12  Oil seeds and oleaginous fruits; miscellaneous...         True   
7  313   13  Lac; gums, resins and other vegetable saps and...         True   
8  314   14  Vegetable plaiting materials; vegetable produc...         True   

  country parent_id  
0      AT        -2  
1      AT        -2  
2      AT        -2  
3      AT        -2  
4      AT        -2  
5      AT        -2  
6      AT        -2  
7      AT        -2  
8      AT  

  df.loc[i, 'error_message'] = error_message


⌨️ Vui lòng nhập captcha, sau đó đóng trình duyệt.
✅ Trình duyệt đã được đóng.
Fetching index: 750 id: 1509... (Attempt 0/5)
     id    code intervalMin intervalMax         description  hasChildren  \
0  6809    None  0910110000  0910120000              Ginger         True   
1  6909  091020         NaN         NaN             Saffron         True   
2  7009  091030         NaN         NaN  Turmeric (curcuma)        False   
3  7309    None  0910910000  0910990000        Other spices         True   

  selectable country  parent_id  
0        NaN      AT       1509  
1        NaN      AT       1509  
2       True      AT       1509  
3        NaN      AT       1509  
✅ Done fetching index: 750 id: 1509
Fetching index: 751 id: 410... (Attempt 0/5)
     id  code intervalMin intervalMax  description  hasChildren country  \
0  2510  None  1001110000  1001190000  Durum wheat         True      AT   
1  2710  None  1001910000  1001990000        Other         True      AT   

   parent_id  
0 