In [1]:
import requests
from bs4 import BeautifulSoup
import math
import concurrent.futures
import json
import os
import time

In [None]:
# Read the heading links to crawl, one per line, dropping blank lines.
with open("heading_links.txt", "r", encoding="utf-8") as f:
    links = [entry for entry in map(str.strip, f) if entry]
print(f"Total links for craw: {len(links)}")

# Every per-thread file lives in this directory (created if missing).
output_dir = "org"
os.makedirs(output_dir, exist_ok=True)

# Path templates; the "{}" placeholder is filled with the thread index
# through str.format by the functions that use them.
state_file_template, output_file_template, error_file_template = (
    os.path.join(output_dir, pattern)
    for pattern in ("state_{}.json", "output_{}.txt", "error_{}.txt")
)

def save_state(thread_index, start_index):
    """Persist the resume position of one worker to its JSON state file.

    The payload is ``{"start_index": start_index}`` and is stored at the path
    produced by ``state_file_template.format(thread_index)``.

    The JSON is first written to a sibling ``.tmp`` file and then moved over
    the real state file with ``os.replace``. An interruption in the middle of
    a write therefore cannot leave a truncated state file behind, which
    ``load_state`` would otherwise read as "start again from 0".

    Args:
        thread_index: Index of the worker/chunk; selects the state file.
        start_index: Position in the worker's link list to resume from.
    """
    state_path = state_file_template.format(thread_index)
    tmp_path = state_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"start_index": start_index}, f)
    # Swap the finished file into place in a single step.
    os.replace(tmp_path, state_path)

def load_state(thread_index):
    """Return the saved resume position for one worker.

    Reads the JSON file at ``state_file_template.format(thread_index)`` and
    returns its ``"start_index"`` value (0 when the key is absent).

    Args:
        thread_index: Index of the worker/chunk; selects the state file.

    Returns:
        The stored start index, or 0 when the state file does not exist or
        does not contain valid JSON.
    """
    state_path = state_file_template.format(thread_index)
    try:
        with open(state_path, "r") as handle:
            saved = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        # No state file yet, or it could not be parsed: start from the beginning.
        return 0
    return saved.get("start_index", 0)

def fetch_with_retry(url, retries=5, delay=5):
    """GET *url*, retrying on request errors and on non-200 responses.

    Every attempt uses a 10-second timeout. Between two attempts the function
    sleeps for ``delay`` seconds, whether the previous attempt raised or came
    back with a non-200 status; it does not sleep after the final attempt.

    Args:
        url: Absolute URL to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait between consecutive attempts.

    Returns:
        The response object whose status code is 200, with its ``encoding``
        set to ``'utf-8'``, or ``None`` when no attempt succeeded.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException as e:
            print(f"Request error: {e} - Retrying {attempt+1}/{retries}...")
        else:
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response
            # A non-200 answer is a failed attempt too: log it and fall through
            # to the shared back-off instead of retrying immediately.
            print(f"Unexpected status {response.status_code} for {url} - Retrying {attempt+1}/{retries}...")
        # Back off before the next attempt, but do not wait after the last one.
        if attempt + 1 < retries:
            time.sleep(delay)
    return None

def _append_lines(path, lines):
    """Append every string in *lines* to *path*, one per line (no-op if empty)."""
    if not lines:
        return
    with open(path, "a", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")


def scrape_links(sub_links, thread_index, start_index):
    """Scrape one chunk of heading links into the thread's output file.

    For every link from ``start_index`` onward the search page is fetched and
    the result count is read from the ``form`` table to derive the number of
    50-row pages. Each data row of every page's ``zebra`` table then becomes
    one JSON line in the thread's output file. Failures are appended as JSON
    lines to the thread's error file at the moment they happen.

    After a link has been processed, the buffered rows are flushed to disk
    and only then is the resume position saved, so the checkpoint never gets
    ahead of the data that was actually written.

    Args:
        sub_links: Relative URLs handled by this worker. The last four
            characters of each link are stored as the ``"4digit"`` field.
        thread_index: Worker/chunk index; selects the output, error and
            state files.
        start_index: Position in ``sub_links`` to resume from.
    """
    output_path = output_file_template.format(thread_index)
    error_path = error_file_template.format(thread_index)
    batch_results = []
    batch_size = 10

    def record_error(failed_link, message):
        # Written immediately so the record survives a crash later in the run.
        entry = {"link": failed_link, "error": message}
        _append_lines(error_path, [json.dumps(entry, ensure_ascii=False)])

    def flush_batch():
        _append_lines(output_path, batch_results)
        batch_results.clear()

    for idx, link in enumerate(sub_links[start_index:], start=start_index):
        hs4digit = link[-4:]
        print(f"Thread {thread_index} - Processing: {hs4digit}")

        response = fetch_with_retry(f'https://www.transcustoms.com/{link}')
        if response is None:
            record_error(link, "Failed to fetch")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        info = soup.find('table', class_="form")
        if not info:
            # NOTE(review): skipped without a record, as before — presumably a
            # page without a result summary; confirm this is not an error case.
            continue

        try:
            info2 = info.find('b').find('font').text
            pages = math.ceil(int(info2) / 50)
            print(f"Thread {thread_index} - Pages: {pages}")
        except Exception as e:
            print(f"Error parsing page count: {e}")
            # Also keep a record on disk so the skipped link can be retried.
            record_error(link, f"Error parsing page count: {e}")
            continue

        for page in range(pages):
            page_url = f'https://www.transcustoms.com/{link}&selectT=&page={page}'
            response = fetch_with_retry(page_url)
            if response is None:
                record_error(page_url, "Failed to fetch")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table', class_="zebra")
            if not table:
                continue

            # The first <tr> is skipped (treated as the header row).
            for tr in table.find_all('tr')[1:]:
                try:
                    # Plain string: a trailing comma here would turn the value
                    # into a 1-tuple and serialise it as a JSON list.
                    commodity_name = (
                        tr.find_all('td')[1].get_text()
                        .replace('\n', '').replace('\r', '').replace('\t', '')
                        .strip()
                    )
                    anchor = tr.find('a')
                    hs10digit = anchor.text
                    hs10dg_link = anchor['href'].replace('..', 'https://www.transcustoms.com/')
                    trinfo = {
                        "4digit": hs4digit,
                        "10digit": hs10digit,
                        "link": hs10dg_link,
                        # NOTE(review): this key starts with a space. Kept
                        # byte-identical because it is part of the output
                        # schema; confirm with consumers before renaming.
                        " commodity_name": commodity_name,
                    }
                    row_json = json.dumps(trinfo, ensure_ascii=False)
                except Exception as e:
                    print(f"Thread {thread_index} - Error parsing row: {e}")
                    record_error(page_url, str(e))
                    continue

                batch_results.append(row_json)
                # Write in batches to avoid reopening the file for every row.
                if len(batch_results) >= batch_size:
                    flush_batch()

        # Flush what is still buffered BEFORE checkpointing; otherwise an
        # interruption would lose those rows while the state says "done".
        flush_batch()
        save_state(thread_index, idx + 1)

# Split the links into fixed-size chunks. Each chunk is handled by one task
# and is identified by its index, which also selects its state file.
# NOTE: changing chunk_size changes which links fall under which index, so
# state files saved with a different chunk_size would no longer line up.
chunk_size = 100
link_chunks = [links[i:i + chunk_size] for i in range(0, len(links), chunk_size)]

# Load the saved resume position of every chunk.
saved_states = [load_state(i) for i in range(len(link_chunks))]

# Run the chunks on a pool of at most 10 threads. ThreadPoolExecutor raises
# ValueError for max_workers=0, which an empty link list would produce, hence
# the lower bound of 1.
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(len(link_chunks), 10))) as executor:
    futures = {
        executor.submit(scrape_links, chunk, idx, saved_states[idx]): idx
        for idx, chunk in enumerate(link_chunks)
    }
    for future in concurrent.futures.as_completed(futures):
        thread_index = futures[future]
        try:
            # Re-raises anything the worker raised so it is reported here.
            future.result()
        except Exception as e:
            print(f"Thread {thread_index} encountered an error: {e}")


Total links: 39
Thread 0 - Processing: 0506
Request error: HTTPSConnectionPool(host='www.transcustoms.com', port=443): Max retries exceeded with url: /Hscode/HScode_search.asp?word=0506 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001B37689CF20>: Failed to resolve 'www.transcustoms.com' ([Errno 11001] getaddrinfo failed)")) - Retrying 1/3...
Request error: HTTPSConnectionPool(host='www.transcustoms.com', port=443): Max retries exceeded with url: /Hscode/HScode_search.asp?word=0506 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001B376EC1AC0>: Failed to resolve 'www.transcustoms.com' ([Errno 11001] getaddrinfo failed)")) - Retrying 2/3...
Request error: HTTPSConnectionPool(host='www.transcustoms.com', port=443): Max retries exceeded with url: /Hscode/HScode_search.asp?word=0506 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001B376EC2270>: Failed to resolve 'www.transcustoms.com