In [1]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://www.transcustoms.com/HS_tree.htm"

# Fetch the webpage
response = requests.get(url)
response.raise_for_status()  # Check for HTTP errors

# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Extract all text from the page
text = soup.get_text()

# Use regex to find all "Heading XXXX" patterns
hs_headings = re.findall(r'Heading (\d{4}):', text)

# Remove duplicates (if any) and sort
unique_hs_headings = sorted(list(set(hs_headings)))

print(f"Extracted {len(unique_hs_headings)} unique HS Headings:", unique_hs_headings)

Extracted 1224 unique HS Headings: ['0101', '0102', '0103', '0104', '0105', '0106', '0201', '0202', '0203', '0204', '0205', '0206', '0207', '0208', '0209', '0210', '0301', '0302', '0303', '0304', '0305', '0306', '0307', '0308', '0401', '0402', '0403', '0404', '0405', '0406', '0407', '0408', '0409', '0410', '0501', '0502', '0504', '0505', '0506', '0507', '0508', '0510', '0511', '0601', '0602', '0603', '0604', '0701', '0702', '0703', '0704', '0705', '0706', '0707', '0708', '0709', '0710', '0711', '0712', '0713', '0714', '0801', '0802', '0803', '0804', '0805', '0806', '0807', '0808', '0809', '0810', '0811', '0812', '0813', '0814', '0901', '0902', '0903', '0904', '0905', '0906', '0907', '0908', '0909', '0910', '1001', '1002', '1003', '1004', '1005', '1006', '1007', '1008', '1101', '1102', '1103', '1104', '1105', '1106', '1107', '1108', '1109', '1201', '1202', '1203', '1204', '1205', '1206', '1207', '1208', '1209', '1210', '1211', '1212', '1213', '1214', '1301', '1302', '1401', '1404', '150

In [3]:
import requests
from bs4 import BeautifulSoup
import re
from time import sleep

def scrape_hs_codes_for_heading(heading, max_pages=4):
    """Scrape HS codes for a given heading across paginated results (pages 0-4)."""
    all_hs_codes = set()

    for page in range(max_pages):
        url = f"https://www.transcustoms.com/Hscode/HScode_search.asp?word={heading}&selectT=&page={page}"
        headers = {"User-Agent": "Mozilla/5.0"}

        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            page_text = soup.get_text()

            # Extract 10-digit HS codes
            hs_codes = re.findall(r'\b\d{10}\b', page_text)

            if not hs_codes:  # Stop if no codes found on this page
                break

            all_hs_codes.update(hs_codes)
            #sleep(1)  # Respectful delay

        except Exception as e:
            print(f"Error scraping {heading} (page {page}): {e}")
            break

    return sorted(all_hs_codes)

# Scrape all HS codes for each heading
all_hs_codes = []
for heading in unique_hs_headings:
    codes = scrape_hs_codes_for_heading(heading)
    print(f"Heading {heading}: Found {len(codes)} HS codes")
    all_hs_codes.extend(codes)

# Remove duplicates (optional, if headings overlap)
final_hs_codes = sorted(list(set(all_hs_codes)))
print("\nTotal unique HS codes:", len(final_hs_codes))

Heading 0101: Found 9 HS codes
Heading 0102: Found 10 HS codes
Heading 0103: Found 8 HS codes
Heading 0104: Found 4 HS codes
Heading 0105: Found 17 HS codes
Heading 0106: Found 57 HS codes
Heading 0201: Found 6 HS codes
Heading 0202: Found 6 HS codes
Heading 0203: Found 16 HS codes
Heading 0204: Found 9 HS codes
Heading 0205: Found 2 HS codes
Heading 0206: Found 11 HS codes
Heading 0207: Found 36 HS codes
Heading 0208: Found 12 HS codes
Heading 0209: Found 2 HS codes
Heading 0210: Found 15 HS codes
Heading 0301: Found 29 HS codes
Heading 0302: Found 69 HS codes
Heading 0303: Found 67 HS codes
Heading 0304: Found 69 HS codes
Heading 0305: Found 42 HS codes
Heading 0306: Found 50 HS codes
Heading 0307: Found 63 HS codes
Heading 0308: Found 28 HS codes
Heading 0401: Found 4 HS codes
Heading 0402: Found 5 HS codes
Heading 0403: Found 2 HS codes
Heading 0404: Found 3 HS codes
Heading 0405: Found 3 HS codes
Heading 0406: Found 5 HS codes
Heading 0407: Found 11 HS codes
Heading 0408: Found 4 

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

def fetch_vat_rate(hs_code):
    """Fetch VAT rate for a single HS code."""
    url = f"https://www.transcustoms.com/China_HS_Code/China_Tariff.asp?HS_Code={hs_code}"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        vat_label = soup.find("td", text="Import VAT (Value-Added Tax)")
        if vat_label:
            return vat_label.find_next("td").text.strip()
        return "Not Found"

    except Exception as e:
        return f"Error: {e}"

# Scrape VAT rates for all HS codes
results = []
for i, code in enumerate(final_hs_codes):
    vat_rate = fetch_vat_rate(code)
    results.append({"HS_Code": code, "VAT_Rate": vat_rate})
    print(f"Processed {i+1}/{len(final_hs_codes)}: {code} → {vat_rate}")
    #sleep(1)  # Avoid rate-limiting

# Export to CSV
df = pd.DataFrame(results)
df.to_csv("hs_codes_vat_rates.csv", index=False)
print("Data saved to 'hs_codes_vat_rates.csv'")

  vat_label = soup.find("td", text="Import VAT (Value-Added Tax)")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed 6948/11946: 6203330000 → 13.0%
Processed 6949/11946: 6203391010 → 13.0%
Processed 6950/11946: 6203391090 → 13.0%
Processed 6951/11946: 6203399000 → 13.0%
Processed 6952/11946: 6203410022 → 13.0%
Processed 6953/11946: 6203410029 → 13.0%
Processed 6954/11946: 6203410090 → 13.0%
Processed 6955/11946: 6203421000 → 13.0%
Processed 6956/11946: 6203429015 → 13.0%
Processed 6957/11946: 6203429019 → 13.0%
Processed 6958/11946: 6203429049 → 13.0%
Processed 6959/11946: 6203429062 → 13.0%
Processed 6960/11946: 6203429069 → 13.0%
Processed 6961/11946: 6203429090 → 13.0%
Processed 6962/11946: 6203431000 → 13.0%
Processed 6963/11946: 6203439015 → 13.0%
Processed 6964/11946: 6203439019 → 13.0%
Processed 6965/11946: 6203439049 → 13.0%
Processed 6966/11946: 6203439061 → 13.0%
Processed 6967/11946: 6203439069 → 13.0%
Processed 6968/11946: 6203439082 → 13.0%
Processed 6969/11946: 6203439089 → 13.0%
Processed 6970/11946: 6203439090 