In [None]:
import requests
import pandas as pd
import time

# Read the proxy list from a file
def load_proxies(file_path="proxies.txt"):
    """Return the non-blank lines of ``file_path`` with whitespace stripped.

    Args:
        file_path: Path of a text file holding one proxy entry per line.

    Returns:
        A list of the stripped, non-empty lines in file order.
    """
    entries = []
    with open(file_path, "r") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            # Lines that are empty or whitespace-only are skipped.
            if cleaned:
                entries.append(cleaned)
    return entries

proxies_list = load_proxies()  # Proxy entries read from the default "proxies.txt"
proxy_index = 0  # Position in proxies_list of the next proxy to hand out (wrapped with modulo by get_proxy)
request_count = 0  # Number of requests that came back with HTTP 200

# Pick the next proxy
def get_proxy():
    """Return the next proxy mapping in round-robin order.

    Reads the module-level ``proxies_list`` and advances the module-level
    ``proxy_index`` by one each time a proxy is handed out.

    Returns:
        A dict with the same proxy under the ``"http"`` and ``"https"``
        keys, or ``None`` when ``proxies_list`` is empty (in which case
        ``proxy_index`` is left untouched).
    """
    global proxy_index
    if not proxies_list:
        return None

    # Modulo wraps the ever-growing counter back onto the list.
    chosen = proxies_list[proxy_index % len(proxies_list)]
    proxy_index += 1
    return {"http": chosen, "https": chosen}

# Send a GET request with retries and proxy rotation
def fetch_data_with_retry(url, params, retries=5, delay=5):
    """Fetch ``url`` through a proxy, retrying until an HTTP 200 arrives.

    Every failed attempt -- a ``requests.RequestException`` or a response
    whose status is not 200 -- is reported on stdout and followed by a
    pause of ``delay`` seconds before the next attempt.

    Args:
        url: Address to request.
        params: Query parameters forwarded to ``requests.get``.
        retries: Maximum number of attempts.
        delay: Seconds to sleep after each failed attempt.

    Returns:
        The response of the first attempt that answers with HTTP 200
        (its ``encoding`` forced to ``'utf-8'``), or ``None`` when all
        attempts fail.
    """
    global request_count
    proxy = get_proxy()  # Always bind a proxy (None when no proxies are loaded)

    for attempt in range(retries):
        # request_count only grows on success, so a new proxy is picked
        # whenever the number of successful requests is a multiple of 5.
        # NOTE(review): the original comment said "every 50 requests" while
        # the code used 5 -- confirm which interval is intended.
        if request_count % 5 == 0 and proxies_list:
            proxy = get_proxy()
            print(f"🔄 Switching to Proxy: {proxy}")

        # Keep the try body down to the one call that can raise.
        try:
            response = requests.get(url, params=params, proxies=proxy, timeout=60)
        except requests.RequestException as e:
            problem = f"Request error: {e}"
        else:
            response.encoding = 'utf-8'
            if response.status_code == 200:
                request_count += 1
                return response
            # A non-200 answer is a failed attempt too: report it and back
            # off instead of silently re-sending the request right away.
            problem = f"Unexpected HTTP status {response.status_code}"

        print(f"⚠️ {problem} - Retrying {attempt+1}/{retries}...")
        time.sleep(delay)

    return None  # All retries exhausted

# Fetch the root-level data
url = "https://trade.ec.europa.eu/access-to-markets/api/v2/nomenclature/products"
root_response = fetch_data_with_retry(url, params={"country": "AT", "lang": "EN"})
if root_response is None:
    # fetch_data_with_retry returns None once its retries are exhausted;
    # stop with a clear message instead of failing on None.json().
    raise RuntimeError(f"Could not fetch the root product list from {url}")

section_info = root_response.json()
section_df = pd.DataFrame(section_info)
section_df["index"] = section_df.index  # Keep each row's position within its own response
section_df["crawled"] = ""  # Empty string marks a row whose children were not fetched yet

# Working frame that the crawl loop extends; section_df stays as fetched.
df = section_df.copy()

# Crawl child data until no row that has children is left uncrawled
max_attempts = 5  # Tries per row, on top of the retries inside fetch_data_with_retry

while ((df["hasChildren"] == True) & ((df["crawled"] == "") | pd.isna(df["crawled"]))).any():
    # pd.concat below re-binds df, but this loop keeps walking the index it
    # started with; rows appended during a pass (their "crawled" is NaN) are
    # picked up by the next pass of the while loop.
    for i in df.index:
        crawl_state = df.loc[i, "crawled"]
        is_pending = crawl_state == "" or pd.isna(crawl_state)
        if not df.loc[i, "hasChildren"] or not is_pending:
            continue

        node_id = df.loc[i, "id"]  # Not named "id" so the builtin stays usable
        attempts = 0
        success = False
        error_message = ""

        while attempts < max_attempts and not success:
            attempts += 1  # 1-based, so both log lines below show the same attempt number
            try:
                print(f"📡 Fetching index: {i} id: {node_id}... (Attempt {attempts}/{max_attempts})")
                response = fetch_data_with_retry(url, params={"parent": node_id, "country": "AT", "lang": "EN"})
                if response is None:
                    raise Exception("Max retries reached")

                df2 = pd.DataFrame(response.json())
                df2["index"] = df2.index
                df2["parent_id"] = node_id
                # ignore_index renumbers the result; the existing rows come
                # first, so label i still refers to the same row afterwards.
                df = pd.concat([df, df2], ignore_index=True)
                df.loc[i, "crawled"] = "Done"
                print(f"✅ Done fetching index: {i} id: {node_id}...")
                success = True
            except Exception as e:
                # Any failure (fetch, JSON decoding, frame construction)
                # counts as one used attempt; the last message is kept.
                error_message = str(e)
                print(f"❌ Error fetching index: {i} id: {node_id}... (Attempt {attempts}/{max_attempts}) - {e}")

        if not success:
            # Mark the row so the while condition stops selecting it.
            df.loc[i, "crawled"] = "Error"
            df.loc[i, "error_message"] = error_message


🔄 Switching to Proxy: {'http': 'http://113.160.132.195:8080', 'https': 'http://113.160.132.195:8080'}
📡 Fetching index: 0 id: -1... (Attempt 0/5)
⚠️ Request error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) - Retrying 1/5...
⚠️ Request error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) - Retrying 2/5...
⚠️ Request error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) - Retrying 3/5...
⚠️ Request error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) - Retrying 4/5...
⚠️ Request error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) - Retrying 5/5...
❌ Error fetching index: 0 id: -1... (Attempt 1/5) - Max retries reached
📡 Fetching index: 0 id: -1... (Attempt 1/5)
✅ Done fetching index: 0 id: -1...
📡 Fetching index: 1 id: -2... (Attempt 0/5)
⚠️ Request err