## HS Codes Scraping By Metal ##
A script to find all relevant HS codes for a given keyword (metal)
The results are written to data/<metal_name> in the form of raw HS codes and trade data

In [6]:
import pandas as pd
import os
import re

base_dir = "../data_baci"
reference_file = os.path.join(base_dir, "product_codes_HS12_V202501.csv")
baci_files = [f for f in os.listdir(base_dir) if f.startswith("BACI_HS12_Y") and f.endswith(".csv")]
os.makedirs(os.path.join(base_dir, "outputs"), exist_ok=True)

'/Users/pranavgunhal/Downloads/e-waste research /BACI_HS12_V202501/data_baci'

#get files
reference_df = pd.read_csv(reference_file)
metal_keywords = ["copper", "gold", "silver", "aluminium", "tin", "tungsten", "nickel", "lithium", "cobalt", "lead", "zinc", "rare earth", "graphite", "antimony", "gallium", "germanium", "indium", "tantalum"]
metal_keywords = ["Earth-metals, rare"]



#process each metal
for metal in metal_keywords:
    print(f"\n=== Processing keyword: {metal} ===")
    pattern = re.compile(rf"\b{re.escape(metal.lower())}\b", re.IGNORECASE)

    matched_rows = []
    for _, row in reference_df.iterrows():
        if pattern.search(str(row["description"]).lower()):
            matched_rows.append({
                "metal": metal,
                "hs_code": int(row["code"]),
                "description": row["description"]
            })

    if not matched_rows:
        print(f"No matches for: {metal}")
        continue

    metal_df = pd.DataFrame(matched_rows).drop_duplicates()
    hs_code_set = set(metal_df["hs_code"])

    # Create output directory
    metal_output_dir = os.path.join(base_dir, "data", metal.lower().replace(" ", "_"))
    os.makedirs(metal_output_dir, exist_ok=True)

    # Save matched HS codes
    metal_df.to_csv(os.path.join(metal_output_dir, "hs_codes.csv"), index=False)
    print(f"Saved {len(metal_df)} HS codes for {metal}")

    # extract from each year's file
    trade_rows = []
    for fname in sorted(baci_files):
        fpath = os.path.join(base_dir, fname)
        try:
            df = pd.read_csv(fpath)
            df_filtered = df[df["k"].isin(hs_code_set)].copy()
            year_match = re.search(r"Y(\d{4})", fname)
            if year_match:
                df_filtered["year"] = int(year_match.group(1))
            if not df_filtered.empty:
                trade_rows.append(df_filtered)
                print(f"Included rows from {fname}")
        except Exception as e:
            print(f"Skipped {fname} due to error: {e}")

    if trade_rows:
        combined_df = pd.concat(trade_rows, ignore_index=True)
        combined_df.to_csv(os.path.join(metal_output_dir, "trade_data.csv"), index=False)
        print(f"Saved trade data for {metal}")
    else:
        print(f"No trade data found for {metal}")



=== Processing keyword: Earth-metals, rare ===
Saved 1 HS codes for Earth-metals, rare
Included rows from BACI_HS12_Y2012_V202501.csv
Included rows from BACI_HS12_Y2013_V202501.csv
Included rows from BACI_HS12_Y2014_V202501.csv
Included rows from BACI_HS12_Y2015_V202501.csv
Included rows from BACI_HS12_Y2016_V202501.csv
Included rows from BACI_HS12_Y2017_V202501.csv
Included rows from BACI_HS12_Y2018_V202501.csv


KeyboardInterrupt: 

In [5]:

manufacturing_keywords = {
    "semiconductors": ["integrated circuit", "electronic ic", "semiconductor"],
    "pcbs": ["printed circuit", "pcb"],
    "capacitors": ["fixed capacitor", "capacitor"],
    "resistors": ["fixed resistor", "resistor"],
    "power_supplies": ["ac/dc converter", "static converter", "power supply"],
    "batteries": ["lithium-ion accumulator", "battery", "accumulator"],
    "connectors": ["switching", "protection device", "electrical apparatus"],
    "storage_devices": ["magnetic storage", "optical storage", "hdd", "ssd", "storage unit"],
    "fans": ["fan", "cooling unit"],
    "leds_lasers": ["led", "diode", "laser"]
}


for category, keywords in manufacturing_keywords.items():
    print(f"\n=== Processing category: {category} ===")
    
    pattern = re.compile(rf"\b({'|'.join([re.escape(k.lower()) for k in keywords])})\b", re.IGNORECASE)

    matched_rows = []
    for _, row in reference_df.iterrows():
        if pattern.search(str(row["description"]).lower()):
            try:
                hs_code = int(row["code"])
                matched_rows.append({
                    "category": category,
                    "hs_code": hs_code,
                    "description": row["description"]
                })
            except (ValueError, TypeError):
                continue

    if not matched_rows:
        print(f"No matches for: {category}")
        continue

    category_df = pd.DataFrame(matched_rows).drop_duplicates()
    hs_code_set = set(category_df["hs_code"])

    # Create output folder under 'manufacturing'
    category_output_dir = os.path.join(base_dir, "outputs", "manufacturing", category.lower().replace(" ", "_"))
    os.makedirs(category_output_dir, exist_ok=True)

    # Save HS code matches
    category_df.to_csv(os.path.join(category_output_dir, "hs_codes.csv"), index=False)
    print(f"Saved {len(category_df)} HS codes for {category}")

    # Extract BACI trade rows
    trade_rows = []
    for fname in sorted(baci_files):
        fpath = os.path.join(base_dir, fname)
        try:
            df = pd.read_csv(fpath)
            df_filtered = df[df["k"].isin(hs_code_set)].copy()
            year_match = re.search(r"Y(\d{4})", fname)
            if year_match:
                df_filtered["year"] = int(year_match.group(1))
            if not df_filtered.empty:
                trade_rows.append(df_filtered)
                print(f"Included rows from {fname}")
        except Exception as e:
            print(f"Skipped {fname} due to error: {e}")

    if trade_rows:
        combined_df = pd.concat(trade_rows, ignore_index=True)
        combined_df.to_csv(os.path.join(category_output_dir, "trade_data.csv"), index=False)
        print(f"Saved trade data for {category}")
    else:
        print(f"No trade data found for {category}")



=== Processing category: semiconductors ===
Saved 11 HS codes for semiconductors
Included rows from BACI_HS12_Y2012_V202501.csv
Included rows from BACI_HS12_Y2013_V202501.csv
Included rows from BACI_HS12_Y2014_V202501.csv
Included rows from BACI_HS12_Y2015_V202501.csv
Included rows from BACI_HS12_Y2016_V202501.csv
Included rows from BACI_HS12_Y2017_V202501.csv
Included rows from BACI_HS12_Y2018_V202501.csv
Included rows from BACI_HS12_Y2019_V202501.csv
Included rows from BACI_HS12_Y2020_V202501.csv
Included rows from BACI_HS12_Y2021_V202501.csv
Included rows from BACI_HS12_Y2022_V202501.csv
Included rows from BACI_HS12_Y2023_V202501.csv
Saved trade data for semiconductors

=== Processing category: pcbs ===
No matches for: pcbs

=== Processing category: capacitors ===
No matches for: capacitors

=== Processing category: resistors ===
No matches for: resistors

=== Processing category: power_supplies ===
No matches for: power_supplies

=== Processing category: batteries ===
Saved 1 HS c