In [19]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import json
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

In [20]:
countries = pd.read_csv('countries.csv')['code'].tolist()

In [21]:
def fetch_bycode_with_retry(retries=5, delay=5):
    """Gửi request với retry khi lỗi."""
    for attempt in range(retries):
        try:
            url = "https://trade.ec.europa.eu/access-to-markets/api/v2/nomenclature/products"
            params = {
                "country": country,
                "lang": "EN"
            }

            response = requests.get(url, params=params, timeout=60)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print(f"Request error: {e} - Retrying {attempt+1}/{retries}...")
            time.sleep(delay)
    return None

In [22]:
def fetch_byid_with_retry(id, retries=5, delay=5):
    """Gửi request với retry khi lỗi."""
    for attempt in range(retries):
        try:
            url = "https://trade.ec.europa.eu/access-to-markets/api/v2/nomenclature/products"
            params = {
                "parent": id,
                "country": country,
                "lang": "EN"
            }

            response = requests.get(url,params=params,  timeout=60)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print(f"Request error: {e} - Retrying {attempt+1}/{retries}...")
            time.sleep(delay)
    return None

lần đầu chạy đoạn code bên dưới để lấy toàn bộ dữ liệu

In [23]:
# all_sections_df = []
# for country in countries:
#     section_info = fetch_bycode_with_retry().json()
#     section_df = pd.DataFrame(section_info)
#     section_df['index'] = section_df.index
#     section_df['crawled'] = ''
#     section_df['country'] = country
#     all_sections_df.append(section_df)
#     all_dfs =  pd.concat(all_sections_df, ignore_index=True) 
#     all_dfs.to_csv('section_info.csv', index=False)
# df = pd.read_csv('section_info.csv')
# df

từ lần 2 chạy từ ô bên dưới

In [24]:
df = pd.read_csv('hs_info.csv', dtype={'id': str})
df

Unnamed: 0,id,code,description,hasChildren,section,index,crawled,country,parent_id,error_message,selectable,intervalMin,intervalMax
0,-1,,Live animals; animal products,True,"{'description': 'SECTION I', 'chapterFrom': '0...",0.0,,AT,,,,,
1,-2,,Vegetable products,True,"{'description': 'SECTION II', 'chapterFrom': '...",1.0,,AT,,,,,
2,-3,,Animal or vegetable fats and oils and their cl...,True,"{'description': 'SECTION III', 'chapterFrom': ...",2.0,,AT,,,,,
3,-4,,"Prepared foodstuffs; beverages, spirits and vi...",True,"{'description': 'SECTION IV', 'chapterFrom': '...",3.0,,AT,,,,,
4,-5,,Mineral products,True,"{'description': 'SECTION V', 'chapterFrom': '2...",4.0,,AT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65041,21264,6.406209e+09,Other,False,,1.0,,HR,11364.0,,True,,
65042,21364,6.406903e+09,Hand-made,False,,0.0,,HR,11464.0,,True,,
65043,21464,6.406903e+09,Other,False,,1.0,,HR,11464.0,,True,,
65044,21564,6.406905e+09,Hand-made,False,,0.0,,HR,11564.0,,True,,


In [29]:
for country in countries:
    while ((df['country'] == country) & (df['hasChildren'] == True) & ((df['crawled'] == '') | pd.isna(df['crawled']) | (df['crawled'] == 'Error'))).any():
        for i in df.index:
            if df.loc[i, 'country'] == country and df.loc[i, 'hasChildren'] and (df.loc[i, 'crawled'] == '' or pd.isna(df.loc[i, 'crawled']) or df.loc[i, 'crawled'] == 'Error'):
                id = df.loc[i, 'id']
                attempts = 0
                success = False
                error_occurred = False  # Biến để kiểm soát lỗi

                while attempts < 5 and not success:
                    try:
                        print(f"Fetching index: {i} id: {id}... (Attempt {attempts}/5)")
                        
                        response = fetch_byid_with_retry(id)  
                        if response and response.status_code == 200:
                            df2 = pd.DataFrame(response.json())
                            df2['index'] = df2.index
                            df2['country'] = country
                            df2['parent_id'] = id
                            print(df2)
                            
                            df = pd.concat([df, df2], ignore_index=True).reset_index(drop=True)
                            df.loc[i, 'crawled'] = 'Done'
                            df.to_csv('hs_info.csv', index = False)
                            print(f"✅ Done fetching index: {i} id: {id}")
                            success = True
                        else:
                            raise Exception(f"Failed request, status code: {response.status_code}")
                    
                    except Exception as e:
                        attempts += 1
                        error_message = str(e)
                        print(f"❌ Error fetching index: {i} id: {id}... (Attempt {attempts}/5) - {e}")
                        if attempts == 5: 
                            error_occurred = True

                if error_occurred: 
                    df.loc[i, 'crawled'] = 'Error'
                    df.loc[i, 'error_message'] = error_message
                    df.to_csv('hs_info.csv', index = False)
                    print("⏳ Encountered multiple errors. Sleeping for 1 minutes...")
                    
                    # Mở trình duyệt để nhập captcha
                    print("🔓 Mở trình duyệt để nhập captcha...")
                    driver = webdriver.Chrome()
                    driver.get("https://trade.ec.europa.eu/access-to-markets/en/search?product=0101&origin=VN&destination=AT")  # Thay bằng URL cần mở
                    print("⌨️ Vui lòng nhập captcha, sau đó đóng trình duyệt.")

                    # Đo thời gian bắt đầu
                    start_captcha_time = time.time()
                    wait_time = 1800  # 30 phút 

                    # Chờ người dùng tự đóng trình duyệt, nếu quá 10 phút thì tự động đóng
                    while True:
                        try:
                            if time.time() - start_captcha_time > wait_time:
                                print("⏳ Quá 10 phút, tự động đóng trình duyệt.")
                                driver.quit()
                                break  # Thoát khỏi vòng lặp

                            # Kiểm tra trình duyệt còn mở không
                            driver.title  
                            
                            # Kiểm tra trang có bị treo không
                            if driver.execute_script("return document.readyState") != "complete":
                                print("⚠️ Trang web có thể bị treo, chờ thêm...")
                            
                        except WebDriverException:
                            print("✅ Trình duyệt đã được đóng.")
                            break  # Nếu trình duyệt bị đóng, thoát vòng lặp
                        
                        time.sleep(5)  # Tránh vòng lặp chạy quá nhanh, gây tải CPU
                    
                    time.sleep(10)  # Đợi 10 giây trước khi tiếp tục
                    break
            


Fetching index: 90085 id: 403220... (Attempt 0/5)
       id  code intervalMin intervalMax  \
0  406720  None  2007993951  2007993952   
1  406820  None  2007993954  2007993956   

                                         description  hasChildren  index  \
0  Of tropical fruit (including mixtures containi...         True      0   
1                                              Other         True      1   

  country  parent_id  
0      AT     403220  
1      AT     403220  
✅ Done fetching index: 90085 id: 403220
Fetching index: 90096 id: 406320... (Attempt 0/5)
       id        code                                   description  \
0  406920  2007993943  Containing less than 70 % by weight of sugar   
1  407020  2007993944                                         Other   

   hasChildren  selectable  index country  parent_id  
0        False        True      0      AT     406320  
1        False        True      1      AT     406320  
✅ Done fetching index: 90096 id: 406320
Fetching inde

KeyboardInterrupt: 

In [None]:
len(df)

90489