1. Set Up Paths & Folders

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Project directories. INT holds the interim inputs; the processed and report
# folders are created on demand so the later cells can write into them.
INT = Path("data/interim")
PROC = Path("data/processed")
TBL = Path("reports/tables")
FIG = Path("reports/figures")
for out_dir in (PROC, TBL, FIG):
    out_dir.mkdir(parents=True, exist_ok=True)

# Interim artifacts this notebook loads, keyed by a short label.
FILES_INT = dict(
    gold=INT / "gold_monthly_clean_2020_2025.csv",
    btc=INT / "btc_monthly_close_2020_2025.csv",
    merged=INT / "merged_gold_btc_monthly_2020_2025.csv",
)

# Presence check: label -> whether the file is on disk.
print({label: path.exists() for label, path in FILES_INT.items()})

{'gold': True, 'btc': True, 'merged': True}


2. Load Interim Artifacts & Index Normalization (EOM)

In [2]:
def load_eom(path: Path) -> pd.DataFrame:
    """Read a CSV that has a ``Date`` column and index it by month-end dates.

    The ``Date`` column is parsed as datetimes, made the index, and sorted
    ascending. Every timestamp is then replaced by the last calendar day of
    its month, and the index is named ``"Date"``.

    Parameters
    ----------
    path : Path
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The file's remaining columns, indexed by month-end timestamps.
    """
    raw = pd.read_csv(path, parse_dates=["Date"])
    frame = raw.set_index("Date").sort_index()
    # Round-tripping through monthly periods snaps each date to its month end.
    month_end = frame.index.to_period("M").to_timestamp("M")
    frame.index = month_end.rename("Date")
    return frame

# Load the three interim tables, each re-indexed to month-end dates.
gold, btc, merged = (load_eom(FILES_INT[key]) for key in ("gold", "btc", "merged"))

# Preview the first rows of each table (shown as a tuple of frames).
tuple(frame.head() for frame in (gold, btc, merged))


(            Gold_USD
 Date                
 2020-01-31   1560.67
 2020-02-29   1597.10
 2020-03-31   1591.93
 2020-04-30   1683.17
 2020-05-31   1715.91,
             BTC_USD
 Date               
 2020-01-31  9342.23
 2020-02-29  8545.45
 2020-03-31  6423.61
 2020-04-30  8637.56
 2020-05-31  9450.47,
             Gold_USD  BTC_USD
 Date                         
 2020-01-31   1560.67  9342.23
 2020-02-29   1597.10  8545.45
 2020-03-31   1591.93  6423.61
 2020-04-30   1683.17  8637.56
 2020-05-31   1715.91  9450.47)

3. Freeze to data/processed/ + Make Sure Returns Exist

In [3]:
# Freeze: write the month-end-indexed tables into data/processed/ under the
# same file names they have in data/interim/.
gold.to_csv(PROC/"gold_monthly_clean_2020_2025.csv",   index_label="Date")
btc.to_csv(PROC/"btc_monthly_close_2020_2025.csv",     index_label="Date")
merged.to_csv(PROC/"merged_gold_btc_monthly_2020_2025.csv", index_label="Date")

# Ensure returns: monthly log returns are always derived from the `merged`
# frame that was just frozen. The price files above are rewritten on every
# run, so blindly reusing an existing returns CSV could leave it out of sync
# with them; the file on disk is kept only when it already matches.
ret_path = PROC/"monthly_returns_gold_btc_2020_2025.csv"
ret_cols = ["Gold_ret", "BTC_ret"]
ret = (merged.assign(
        Gold_ret = np.log(merged["Gold_USD"]).diff(),
        BTC_ret  = np.log(merged["BTC_USD"]).diff()
      )
      .replace([np.inf, -np.inf], np.nan)   # non-finite log returns count as missing
      .dropna(subset=ret_cols))


def _returns_file_is_current(path, fresh, cols):
    """Return True when the CSV at ``path`` holds the same returns as ``fresh``.

    The file counts as current only if it exists, parses, contains every
    column in ``cols``, covers exactly the same dates as ``fresh`` and its
    values in ``cols`` agree with ``fresh`` within floating-point tolerance.
    Any read or conversion problem is treated as "not current".
    """
    if not path.exists():
        return False
    try:
        cached = (pd.read_csv(path, parse_dates=["Date"])
                    .set_index("Date").sort_index())
        return bool(
            set(cols) <= set(cached.columns)
            and len(cached) == len(fresh)
            and (cached.index == fresh.index).all()
            and np.allclose(cached[cols].to_numpy(dtype=float),
                            fresh[cols].to_numpy(dtype=float),
                            equal_nan=True)
        )
    except (ValueError, TypeError, KeyError):
        # Unreadable / malformed cache: fall through to regeneration.
        return False


if not _returns_file_is_current(ret_path, ret, ret_cols):
    if ret_path.exists():
        print("Existing returns file does not match merged prices; regenerating.")
    ret.to_csv(ret_path, index_label="Date")
    print("Saved returns →", ret_path.resolve())

# sorted(): glob yields directory entries in no guaranteed order.
print("Processed files:", sorted(p.name for p in PROC.glob("*.csv")))


Processed files: ['btc_monthly_close_2020_2025.csv', 'gold_monthly_clean_2020_2025.csv', 'merged_gold_btc_monthly_2020_2025.csv', 'monthly_returns_gold_btc_2020_2025.csv']


4. Compact QC + Save Summary and Manifest

In [4]:
def qc(df: pd.DataFrame, name: str, start: str = "2020-01", end: str = "2025-12"):
    """Summarise the month-end coverage of ``df`` and list the months it lacks.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index holds month-end timestamps.
    name : str
        Label stored in the summary under ``"name"``.
    start, end : str, optional
        First and last month (``"YYYY-MM"``) of the expected window,
        both inclusive. Defaults cover 2020-01 through 2025-12.

    Returns
    -------
    tuple[dict, list]
        ``(summary, missing)``. ``summary`` has the keys ``name``, ``rows``,
        ``start``, ``end``, ``dupe_index`` (number of repeated index values)
        and ``n_missing_eom``. ``start``/``end`` are the earliest/latest index
        dates, or ``None`` when ``df`` has no rows. ``missing`` lists the
        expected month-end timestamps that are absent from ``df.index``.
    """
    # Month-end timestamp for every month in the expected window.
    expected = pd.period_range(start, end, freq="M").to_timestamp("M")
    missing = expected.difference(df.index)
    # An empty index has no min/max date to report; use None instead of failing.
    has_rows = len(df) > 0
    summary = {
        "name": name,
        "rows": len(df),
        "start": df.index.min().date() if has_rows else None,
        "end": df.index.max().date() if has_rows else None,
        "dupe_index": int(df.index.duplicated().sum()),
        "n_missing_eom": len(missing)
    }
    return summary, list(missing)

# Run the coverage check on every frame, then expose the per-frame results
# under their individual names.
qc_frames = {"gold": gold, "btc": btc, "merged": merged, "returns": ret}
qc_out = {label: qc(frame, label) for label, frame in qc_frames.items()}
qc_gold, miss_gold = qc_out["gold"]
qc_btc,  miss_btc  = qc_out["btc"]
qc_mrg,  miss_mrg  = qc_out["merged"]
qc_ret,  miss_ret  = qc_out["returns"]

# One summary row per frame.
qc_table = pd.DataFrame([pair[0] for pair in qc_out.values()]).round(3)
qc_table.to_csv(TBL/"qc_freeze_summary.csv", index=False)

# One CSV per frame listing the expected month-ends it lacks.
for label, pair in qc_out.items():
    gaps = pd.Series(pair[1], name="missing_eom")
    gaps.to_csv(TBL/f"qc_missing_{label}.csv", index=False)

# Minimal hard checks (fail fast if there are serious problems)
price_frame_qc = (qc_gold, qc_btc, qc_mrg)
assert all(item["dupe_index"] == 0 for item in price_frame_qc), "Duplicate EOM index detected."
prices_positive = (gold["Gold_USD"]>0).all() and (btc["BTC_USD"]>0).all()
assert prices_positive, "Price ≤ 0 detected; clear in step 02/03."

# Manifest (hash + concise)
import hashlib
def sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and hashed in 1 MiB pieces, so it is
    never read into memory in one go.
    """
    digest = hashlib.sha256()
    piece_size = 1 << 20  # 1 MiB per read
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:  # b"" signals end of file
                break
            digest.update(piece)
    return digest.hexdigest()

# Describe each frozen file: row count, first/last date and content hash.
frozen_names = (
    "gold_monthly_clean_2020_2025.csv",
    "btc_monthly_close_2020_2025.csv",
    "merged_gold_btc_monthly_2020_2025.csv",
    "monthly_returns_gold_btc_2020_2025.csv",
)

manifest_rows = []
for fname in frozen_names:
    fpath = PROC / fname
    # Re-read from disk so the manifest reflects what was actually written.
    dates = pd.read_csv(fpath, parse_dates=["Date"]).set_index("Date").index
    manifest_rows.append(dict(
        file=fpath.name,
        rows=len(dates),
        start=dates.min().date(),
        end=dates.max().date(),
        sha256=sha256(fpath),
    ))

manifest_csv = TBL / "manifest_processed.csv"
pd.DataFrame(manifest_rows).to_csv(manifest_csv, index=False)

print("QC →", (TBL/"qc_freeze_summary.csv").resolve())
print("Manifest →", manifest_csv.resolve())


QC → C:\Users\Noveno\OneDrive\CA1-BTC-Gold-Correlation\reports\tables\qc_freeze_summary.csv
Manifest → C:\Users\Noveno\OneDrive\CA1-BTC-Gold-Correlation\reports\tables\manifest_processed.csv
