# Clean US tariff data from the Federal Register

## Setup

In [1]:
import pandas as pd
from pathlib import Path

## Configs

In [4]:
# Column whose repeated values mark a row as a duplicate in the deduplication step.
DEDUPLICATION_KEY = "document_number"

# Input: raw parquet file read by the load step.
RAW_DATA_PATH = Path("../data/raw/fedregister/fedregister_raw.parquet")

# Output: parquet file written by the save step.
CLEAN_DATA_PATH = Path("../data/tidy/fedregister_clean.parquet")

## Main

### Load

In [5]:
# Read the raw parquet file into memory and report how many rows it holds.
df_raw = pd.read_parquet(RAW_DATA_PATH)
n_raw_rows = df_raw.shape[0]
print(f"Raw documents loaded: {n_raw_rows:,}")

Raw documents loaded: 36,686


### Deduplicate

In [6]:
# Flag every row whose key value has already appeared earlier in the frame,
# then keep only the unflagged rows (i.e. the first occurrence of each key).
is_repeat = df_raw.duplicated(subset=DEDUPLICATION_KEY)
df_clean = df_raw[~is_repeat]
n_unique_rows = df_clean.shape[0]
print(f"Unique documents after deduplication: {n_unique_rows:,}")

Unique documents after deduplication: 32,764


### Save

In [7]:
# Create the destination folder (and any missing ancestors) if it is not there yet;
# an already-existing folder is not an error.
output_dir = CLEAN_DATA_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)

# Write the deduplicated frame without the pandas index.
df_clean.to_parquet(CLEAN_DATA_PATH, index=False)
print(f"Cleaned data saved to: {CLEAN_DATA_PATH}")

Cleaned data saved to: ../data/tidy/fedregister_clean.parquet
