# 01 — Cleaning FPDS Procurement Records

In [1]:

import sys, subprocess, importlib
def ensure(pkg, pip_name=None):
    """Import ``pkg``; if it is missing, pip-install it into this kernel's interpreter.

    Args:
        pkg: Importable module name (e.g. ``"pandas"``).
        pip_name: Distribution name to pass to pip when it differs from the
            module name. Defaults to ``pkg``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    try:
        importlib.import_module(pkg)
    except ImportError:
        # Only a missing module should trigger an install; any other import-time
        # failure is a real error and is left to surface to the caller.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or pkg])
        # Make the freshly installed package importable without restarting the kernel.
        importlib.invalidate_caches()
# Core numeric stack.
for mod in ("pandas", "numpy"):
    ensure(mod)

# Fuzzy matching: prefer rapidfuzz. ``ensure`` already tries the import before
# installing, so no separate import probe is needed. Any failure (e.g. pip
# cannot install it) deliberately falls back to fuzzywuzzy as a best effort.
try:
    ensure("rapidfuzz")
except Exception:
    ensure("fuzzywuzzy")
    # The distribution is "python-Levenshtein" but the importable module is
    # "Levenshtein"; probing the distribution name can never succeed and would
    # re-run pip on every execution.
    ensure("Levenshtein", "python-Levenshtein")
print("[INFO] Dependencies are ready.")

[INFO] Dependencies are ready.


In [2]:

import sys, subprocess, importlib
from pathlib import Path

# NOTE(review): this repeats the ``ensure`` definition from the first cell; the
# later definition silently shadows the earlier one.
def ensure(pkg, pip_name=None):
    """Make ``pkg`` importable, installing it with pip when the import fails.

    Args:
        pkg: Module name to import.
        pip_name: Name to install with pip if it differs from ``pkg``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    try:
        importlib.import_module(pkg)
    except ImportError:
        # A missing module is the only case an install can fix.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or pkg])
        # Refresh the import system's caches so the new package is visible immediately.
        importlib.invalidate_caches()


ensure("pandas")
ensure("numpy")

# Prefer rapidfuzz for fuzzy matching. ``ensure`` performs the import check
# itself, so a separate import probe is redundant. If rapidfuzz cannot be made
# available for any reason, fall back to fuzzywuzzy as a best effort.
try:
    ensure("rapidfuzz")
except Exception:
    ensure("fuzzywuzzy")
    # Import name is "Levenshtein"; "python-Levenshtein" is only the pip
    # distribution name and is not importable.
    ensure("Levenshtein", "python-Levenshtein")


# Resolve the project root: the nearest directory, starting at the working
# directory and walking upwards, that contains a ``src`` directory. This also
# covers launching the kernel from a ``notebooks/`` subfolder.
start_dir = Path.cwd()
ROOT = next(
    (candidate for candidate in (start_dir, *start_dir.parents) if (candidate / "src").exists()),
    # Nothing found: stay in the working directory rather than drifting up to
    # the filesystem root and creating ``src`` there.
    start_dir,
)
if not (ROOT / "src").exists():
    print(f"[WARN] No 'src' directory found at or above {start_dir}; creating an empty one there.")

# Put the root first on sys.path so ``import src...`` resolves to this project.
# Existing occurrences are dropped first so re-running the cell does not pile
# up duplicate entries.
root_entry = str(ROOT)
sys.path[:] = [root_entry] + [entry for entry in sys.path if entry != root_entry]

# Make sure ``src`` is an importable package.
(src_dir := (ROOT / "src")).mkdir(parents=True, exist_ok=True)
(src_dir / "__init__.py").touch(exist_ok=True)

print("[INFO] Project root:", ROOT)
print("[INFO] Python:", sys.version)
print("[INFO] Executable:", sys.executable)


[INFO] Project root: c:\Users\taylo\OneDrive\Desktop\DoD-SupplyChain-Disruption-Complete - Copy
[INFO] Python: 3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
[INFO] Executable: c:\Users\taylo\AppData\Local\Programs\Python\Python313\python.exe


In [3]:
import pandas as pd, numpy as np, re
from pathlib import Path
from src.utils import find_project_root, simple_vendor_normalize, clean_naics, ensure_dir


# Prefer rapidfuzz; fall back to fuzzywuzzy, which is imported under the same
# ``fuzz`` / ``process`` names. Only a failed import triggers the fallback.
try:
    from rapidfuzz import fuzz, process
except ImportError:
    from fuzzywuzzy import fuzz, process

print("Pandas:", pd.__version__)

Pandas: 2.3.1


In [4]:
# Locate or create raw CSV
ROOT = find_project_root(Path.cwd())
raw_dir = ROOT / "data" / "raw"
proc_dir = ROOT / "data" / "processed"
ensure_dir(proc_dir)

csv_path = raw_dir / "fpds_contracts.csv"
if csv_path.exists():
    print(f"[INFO] Using CSV at {csv_path}")
else:
    # No extract on disk: write a three-row sample containing messy values
    # (mixed date formats, a quoted/hyphenated NAICS code, an invalid day,
    # a blank city, a spelled-out state) so the cleaning steps can be exercised.
    raw_dir.mkdir(parents=True, exist_ok=True)
    csv_path.write_text(
        "contract_award_id,vendor_name,naics_code,product_or_service_code,action_date,dollar_obligated,place_of_performance_city,place_of_performance_state,place_of_performance_country\n"
        "1234567,Lockheed Martin Corp,336413,1510,2021-07-15,12000000,Fort Worth,TX,USA\n"
        "1234568,LOCKHEED MARTIN CORPORATION,336413,1510,07/15/2021,12000000.00,FORT WORTH,Texas,USA\n"
        "1234569,Boeing Co,'3364-13',1510,2021-07-XX,8000000,,CA,USA\n",
        encoding="utf-8"
    )
    print(f"[INFO] Created sample at {csv_path}")

# Read every column as text. Award IDs, NAICS and product/service codes are
# identifiers, not quantities; letting pandas infer numeric dtypes drops leading
# zeros and turns a code column with any missing value into floats ("336413.0").
# Dates and dollar amounts are converted explicitly in the cleaning step.
fpds = pd.read_csv(csv_path, dtype=str)
fpds.head()

[INFO] Using CSV at c:\Users\taylo\OneDrive\Desktop\DoD-SupplyChain-Disruption-Complete - Copy\data\raw\fpds_contracts.csv


Unnamed: 0,contract_award_id,vendor_name,naics_code,product_or_service_code,action_date,dollar_obligated,place_of_performance_city,place_of_performance_state,place_of_performance_country
0,9000001,Lockheed Martin Corp,336413,1510,2021-01-10,9500000,Fort Worth,TX,USA
1,9000002,Boeing Co,336413,1510,2021-07-20,8400000,Long Beach,CA,USA
2,9000003,Raytheon Technologies,334111,1510,2022-02-05,2500000,Tucson,AZ,USA
3,9000004,LOCKHEED MARTIN CORPORATION,336413,1510,2022-11-30,12300000,Fort Worth,TX,USA
4,9000005,BOEING COMPANY,336413,1510,2023-03-12,7800000,San Diego,CA,USA


In [5]:
# Standardize columns
fpds.columns = fpds.columns.str.strip().str.lower().str.replace(" ", "_")

# Stand-ins for absent columns. ``DataFrame.get`` returns its default as-is, so
# the default must be a Series for the ``.apply`` / ``.dt`` calls below to work.
blank_text = pd.Series("", index=fpds.index, dtype="object")
missing_values = pd.Series(np.nan, index=fpds.index, dtype="object")

# Parse dates robustly. ``format="mixed"`` infers the format per value; without
# it pandas 2.x locks onto the first row's format and coerces every value in a
# different format (e.g. 07/15/2021 after 2021-07-15) to NaT.
fpds["action_date"] = pd.to_datetime(
    fpds.get("action_date", missing_values), errors="coerce", format="mixed"
)

# Numeric conversion for dollar_obligated
if "dollar_obligated" in fpds.columns:
    fpds["dollar_obligated"] = (
        fpds["dollar_obligated"].astype(str).str.replace(",", "", regex=False).str.strip().replace({"": np.nan})
    )
    fpds["dollar_obligated"] = pd.to_numeric(fpds["dollar_obligated"], errors="coerce")

# Vendor normalization
# NOTE(review): the displayed output keeps "BOEING" and "BOEING COMPANY" as
# separate groups - confirm whether simple_vendor_normalize should merge them.
fpds["vendor_group"] = fpds.get("vendor_name", blank_text).apply(simple_vendor_normalize)

# Clean NAICS and derive sector
fpds["naics_code_clean"] = fpds.get("naics_code", blank_text).apply(clean_naics)
fpds["naics_sector2"] = fpds["naics_code_clean"].str[:2]

# State formatting
if "place_of_performance_state" in fpds.columns:
    state_raw = fpds["place_of_performance_state"]
    # ``astype(str)`` turns missing values into the text "nan" (upper-cased to
    # "NAN"); mask them back to missing so they are not mistaken for a state.
    # NOTE(review): spelled-out names such as "Texas" are only upper-cased here,
    # not mapped to their two-letter abbreviation.
    fpds["place_of_performance_state"] = (
        state_raw.astype(str).str.upper().str.strip().where(state_raw.notna())
    )

# Date parts
fpds["year"] = fpds["action_date"].dt.year
fpds["month"] = fpds["action_date"].dt.month
# strftime leaves unparseable dates missing instead of emitting the string "NaT".
fpds["year_month"] = fpds["action_date"].dt.strftime("%Y-%m")

fpds.head()

Unnamed: 0,contract_award_id,vendor_name,naics_code,product_or_service_code,action_date,dollar_obligated,place_of_performance_city,place_of_performance_state,place_of_performance_country,vendor_group,naics_code_clean,naics_sector2,year,month,year_month
0,9000001,Lockheed Martin Corp,336413,1510,2021-01-10,9500000,Fort Worth,TX,USA,LOCKHEED MARTIN,336413,33,2021,1,2021-01
1,9000002,Boeing Co,336413,1510,2021-07-20,8400000,Long Beach,CA,USA,BOEING,336413,33,2021,7,2021-07
2,9000003,Raytheon Technologies,334111,1510,2022-02-05,2500000,Tucson,AZ,USA,RAYTHEON,334111,33,2022,2,2022-02
3,9000004,LOCKHEED MARTIN CORPORATION,336413,1510,2022-11-30,12300000,Fort Worth,TX,USA,LOCKHEED MARTIN,336413,33,2022,11,2022-11
4,9000005,BOEING COMPANY,336413,1510,2023-03-12,7800000,San Diego,CA,USA,BOEING COMPANY,336413,33,2023,3,2023-03


In [6]:

# Persist the cleaned frame, without the positional index, to the processed-data folder.
out_path = proc_dir.joinpath("fpds_cleaned.csv")
fpds.to_csv(path_or_buf=out_path, index=False)
print(f"[OK] Wrote {out_path}")

[OK] Wrote c:\Users\taylo\OneDrive\Desktop\DoD-SupplyChain-Disruption-Complete - Copy\data\processed\fpds_cleaned.csv
