In [1]:
import requests
from bs4 import BeautifulSoup
import math
import concurrent.futures
import json
import os
import time
import pandas as pd

In [2]:
with open("all_10digit_info.csv", "r", encoding="utf-8") as f:
    links = [line.strip() for line in f if line.strip()]
print(f"Total links for craw: {len(links)-1}")

Total links for craw: 11982


In [3]:
df = pd.read_csv("all_10digit_info.csv" , encoding="utf-8",dtype=str)

In [4]:
linklist = df['link'].tolist()

In [5]:
links = [x.replace('//China_HS_Code/China_HS_code.asp?HS_Code=','//China_HS_Code/China_Tariff.asp?HS_Code=') for x in linklist]

In [6]:
output_dir = "org3"
os.makedirs(output_dir, exist_ok=True)

state_file_template = os.path.join(output_dir, "state_{}.json")
output_file_template = os.path.join(output_dir, "output_{}.txt")
error_file_template = os.path.join(output_dir, "error_{}.txt")

def save_state(thread_index, start_index):
    """Lưu trạng thái hiện tại của thread vào file."""
    with open(state_file_template.format(thread_index), "w") as f:
        json.dump({"start_index": start_index}, f)

def load_state(thread_index):
    """Tải trạng thái đã lưu từ file."""
    try:
        with open(state_file_template.format(thread_index), "r") as f:
            states = json.load(f)
            return states.get("start_index", 0)
    except (FileNotFoundError, json.JSONDecodeError):
        return 0  # Nếu chưa có file hoặc lỗi load state, bắt đầu từ đầu

def fetch_with_retry(url, retries=5, delay=5):
    """Gửi request với retry khi lỗi."""
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=60)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print(f"Request error: {e} - Retrying {attempt+1}/{retries}...")
            time.sleep(delay)
    return None

In [None]:
def scrape_links(sub_links, thread_index, start_index):
    """Xử lý từng nhóm link và lưu kết quả."""
    batch_results = []

    for idx, link in enumerate(sub_links[start_index:], start=start_index):
        hs10digit = link[-10:]
        print(f"Thread {thread_index} - Processing: {hs10digit}")
        
        response = fetch_with_retry(link)
        
        if response is None:
            error_msg = {"link": link, "error": "Failed to fetch after 3 retries"}
            print(f"Thread {thread_index} - {error_msg}")  
            with open(error_file_template.format(thread_index), "a", encoding='utf-8') as f:
                f.write(json.dumps(error_msg, ensure_ascii=False) + "\n")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        tables = soup.find_all('table', class_="zebra")
        centers = soup.find('div', class_="main-container").find_all('div', align="center")

        if len(tables) < 2:
            print(f"Thread {thread_index} - No valid tables found for {hs10digit}")
            continue
        
        try:
            import_taiff_trs = centers[1].find('table').find_all('tr')
        except:
            import_taiff_trs = None
        try:    
            import_conventional_trs = centers[2].find('table').find_all('tr')
        except:
            import_conventional_trs = None
        try:  
            ecommerce_taiff_trs = centers[3].find('table').find_all('tr')
        except:
            ecommerce_taiff_trs = None
        try:  
            export_taiff_trs = tables[-1].find_all('tr')
        except:
            export_taiff_trs = None      
        
        try:
            batch_results = []
            if import_taiff_trs is not None and len(import_taiff_trs) > 1:
                headers = [f'import_{th.get_text().replace('\n','').replace('\r','').replace('\t','').replace('\xa0','').strip().lower().replace(' ','_')}' for th in import_taiff_trs[0].find_all('th')]

                for tr in import_taiff_trs[1:]: 
                    values = [td.get_text().replace('\n','').replace('\r','').replace('\t','').replace('\xa0','').strip() for td in tr.find_all('td')]  
                    hs10_tariff_info = dict(zip(headers, values))  
                    hs10_tariff_info["hs10digit"] = hs10digit  
                    batch_results.append(json.dumps(hs10_tariff_info, ensure_ascii=False))
            if import_conventional_trs is not None and len(import_conventional_trs) > 1:
                headers = [f'conventional_{th.get_text().replace('\n','').replace('\r','').replace('\t','').replace('\xa0','').strip().lower().replace(' ','_')}' for th in import_conventional_trs[0].find_all('th')]
                for tr in import_conventional_trs[1:]: 
                    values = [td.get_text().replace('\n','').replace('\r','').replace('\t','').replace('\xa0','').strip() for td in tr.find_all('td')]  
                    hs10_tariff_info = dict(zip(headers, values))  
                    hs10_tariff_info["hs10digit"] = hs10digit 
                    batch_results.append(json.dumps(hs10_tariff_info, ensure_ascii=False))
            if export_taiff_trs is not None and len(export_taiff_trs) > 1:
                headers = [f'export_{th.get_text().replace('\n','').replace('\r','').replace('\t','').replace('\xa0','').strip().lower().replace(' ','_')}' for th in export_taiff_trs[0].find_all('th')]
                for tr in export_taiff_trs[1:]: 
                    values = [td.get_text().replace('\n','').replace('\r','').replace('\t','').replace('\xa0','').strip() for td in tr.find_all('td')]  
                    hs10_tariff_info = dict(zip(headers, values))  
                    hs10_tariff_info["hs10digit"] = hs10digit  
                    batch_results.append(json.dumps(hs10_tariff_info, ensure_ascii=False))
            if ecommerce_taiff_trs is not None and len(ecommerce_taiff_trs) > 1:
                headers = [f'ecommerce_{th.get_text().replace('\n','').replace('\r','').replace('\t','').replace('\xa0','').strip().lower().replace(' ','_')}' for th in ecommerce_taiff_trs[0].find_all('th')]
                for tr in ecommerce_taiff_trs[1:]: 
                    values = [td.get_text().replace('\n','').replace('\r','').replace('\t','').replace('\xa0','').strip() for td in tr.find_all('td')]  
                    hs10_tariff_info = dict(zip(headers, values))  
                    hs10_tariff_info["hs10digit"] = hs10digit  
                    batch_results.append(json.dumps(hs10_tariff_info, ensure_ascii=False))
                    
            if len(batch_results) > 0:
                with open(output_file_template.format(thread_index), "a", encoding='utf-8') as f:
                    f.write("\n".join(batch_results) + "\n")
                batch_results.clear()
        except Exception as e:
            error_msg = {"link": link, "error": str(e)}
        save_state(thread_index, idx + 1)


In [8]:
# data = [json.loads(item) for item in batch_results]
# df_check = pd.DataFrame(data)

In [9]:
# Chia chunk
chunk_size = 1000
link_chunks = [links[i:i + chunk_size] for i in range(0, len(links), chunk_size)]

# Tải trạng thái đã lưu
saved_states = [load_state(i) for i in range(len(link_chunks))]

# Threading
with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(link_chunks), 15)) as executor:
    futures = {
        executor.submit(scrape_links, chunk, idx, saved_states[idx]): idx
        for idx, chunk in enumerate(link_chunks)
    }
    for future in concurrent.futures.as_completed(futures):
        thread_index = futures[future]
        try:
            future.result()
        except Exception as e:
            print(f"Thread {thread_index} encountered an error: {e}")

Thread 0 - Processing: 0101210010
Thread 1 - Processing: 0805219000
Thread 2 - Processing: 2603000090
Thread 3 - Processing: 2918990024
Thread 4 - Processing: 3004320017
Thread 5 - Processing: 4101901910
Thread 6 - Processing: 5208490010
Thread 7 - Processing: 6204491010
Thread 8 - Processing: 7226992000
Thread 9 - Processing: 8419899010
Thread 10 - Processing: 8504311000
Thread 11 - Processing: 8708295100
Thread 8 - Processing: 7226999001
Thread 7 - Processing: 6204491090
Thread 1 - Processing: 0805220000
Thread 3 - Processing: 2918990025
Thread 6 - Processing: 5208490090
Thread 4 - Processing: 3004320018
Thread 9 - Processing: 8419899021
Thread 10 - Processing: 8504319000
Thread 0 - Processing: 0101210090
Thread 2 - Processing: 2604000001
Thread 11 - Processing: 8708295200
Thread 5 - Processing: 4101901990
Thread 4 - Processing: 3004320019
Thread 8 - Processing: 7226999090
Thread 7 - Processing: 6204499000
Thread 1 - Processing: 0805290000
Thread 3 - Processing: 2918990026Thread 5 - 

In [10]:
import pandas as pd
directory = 'org3/'  

err_file_names = [f'error_stt_{i}.txt' for i in range(len(link_chunks))]
all_err_data = []
try:
    for file_name in err_file_names:
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = [json.loads(line) for line in file]
            all_err_data.extend(data) 

    df = pd.DataFrame(all_err_data)
    print(len(df))

    df.to_csv('all_craw_10digit_tariff_error_stt.csv', index=False, encoding='utf-8')
except Exception as e:
    print(e)

output_file_names = [f'output_{i}.txt' for i in range(len(link_chunks))]
all_output_data = []
try:
# Đọc và xử lý từng file
    for file_name in output_file_names:
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = [json.loads(line) for line in file]
            all_output_data.extend(data)  
    df2 = pd.DataFrame(all_output_data)
    print(len(df2))

    df2.to_csv('all_10digit_tariff_info.csv', index=False, encoding='utf-8')
except Exception as e:
    print(e)

[Errno 2] No such file or directory: 'org3/error_stt_0.txt'
391464


In [11]:
import pandas as pd

# Đọc file CSV với encoding UTF-8
df2 = pd.read_csv("all_10digit_tariff_info.csv", dtype=str, encoding="utf-8")

# Ghi file Excel với `openpyxl`
with pd.ExcelWriter("all_10digit_tariff_info.xlsx", engine="openpyxl") as writer:
    df2.to_excel(writer, sheet_name="Sheet1", index=False)
    
    # Lấy sheet và workbook
    workbook = writer.book
    worksheet = writer.sheets["Sheet1"]
    
    # Định dạng toàn bộ cột là "Text"
    for col in worksheet.iter_cols():
        for cell in col:
            cell.number_format = "@"

print("Đã lưu file Excel với Unicode và định dạng Text thành công!")


Đã lưu file Excel với Unicode và định dạng Text thành công!
