Setup & file raw

In [27]:
from pathlib import Path 
import pandas as pd 

Audit GOLD (monthly)

In [33]:
# Folder
RAW = Path("data/raw")
OUT = Path("data/interim")
OUT.mkdir(parents=True, exist_ok=True)

# File gold
df_gold = pd.read_csv(RAW / "gold_monthly_price.csv")

assert {"Date","Price"}.issubset(df_gold.columns), df_gold.columns.tolist()

# Parse & clean
dates  = pd.to_datetime(df_gold["Date"].astype(str).str.strip(), format="%Y-%m", errors="coerce")
prices = pd.to_numeric(df_gold["Price"], errors="coerce")

gold_m = (
    pd.DataFrame({"Date": dates, "Gold_USD": prices})
      .dropna(subset=["Date","Gold_USD"])
      .sort_values("Date")
      .set_index("Date")
      .resample("M").last()          
)

# Filter range 2020–2025
gold_m = gold_m.loc["2020-01-31":"2025-12-31"]

# Save to CSV
out_gold = OUT / "gold_monthly_clean_2020_2025.csv"
gold_m.to_csv(out_gold, index_label="Date")

print("Saved:", out_gold.resolve())
print("Rows:", len(gold_m), "| Range:", gold_m.index.min(), "→", gold_m.index.max())


Saved: C:\Users\Noveno\OneDrive\CA1-BTC-Gold-Correlation\data\interim\gold_monthly_clean_2020_2025.csv
Rows: 67 | Range: 2020-01-31 00:00:00 → 2025-07-31 00:00:00


  .resample("M").last()


Audit BTC (hourly)

In [34]:
# Folder
RAW = Path("data/raw")
OUT = Path("data/interim")
OUT.mkdir(parents=True, exist_ok=True)

# Load
df = pd.read_csv(RAW / "btc-hourly-price_2015_2025.csv" )

# Parse datetime from TIME_UNIX
col_unix  = "TIME_UNIX"
col_close = "CLOSE_PRICE"

unix_series = pd.to_numeric(df[col_unix], errors="coerce")
unit = "ms" if unix_series.dropna().median() > 1e12 else "s"
dt = pd.to_datetime(unix_series, unit=unit, utc=True, errors="coerce")

# Take the close price (numeric)
close = pd.to_numeric(df[col_close], errors="coerce")

btc = (pd.DataFrame({"dt": dt, "BTC_USD": close})
         .dropna()
         .sort_values("dt"))

# Filter range 2020–2025 (UTC)
start = pd.Timestamp("2020-01-01", tz="UTC")
end   = pd.Timestamp("2025-12-31", tz="UTC")
btc = btc[(btc["dt"] >= start) & (btc["dt"] <= end)]

# Resample to monthly (end-of-month close)
btc_m = btc.set_index("dt")["BTC_USD"].resample("ME").last().to_frame()

# Remove the timezone to make it easier to merge with gold
btc_m.index = btc_m.index.tz_localize(None)

# Save
out_btc = OUT / "btc_monthly_close_2020_2025.csv"
btc_m.to_csv(out_btc, index_label="Date")

# Summary
out_btc, btc_m.shape, btc_m.index.min(), btc_m.index.max(), btc_m.head()


(WindowsPath('data/interim/btc_monthly_close_2020_2025.csv'),
 (70, 1),
 Timestamp('2020-01-31 00:00:00'),
 Timestamp('2025-10-31 00:00:00'),
             BTC_USD
 dt                 
 2020-01-31  9342.23
 2020-02-29  8545.45
 2020-03-31  6423.61
 2020-04-30  8637.56
 2020-05-31  9450.47)

Checklist Audit 

In [30]:
notes = []

g_dt = dates
g_price_num = prices
dup_by_month = g_dt.dt.to_period("M").duplicated().sum()

# GOLD
notes.append(f"GOLD: parsed {dt.notna().mean():.2%} dates; range {dt.min()} → {dt.max()}")
notes.append(f"GOLD: price NA = {g_price_num.isna().sum()} rows")
notes.append(f"GOLD: duplicated months = {dup_by_month}")

# BTC (pakai variabel dari sel BTC: dt & close)
notes.append(f"BTC : parsed {dt.notna().mean():.2%} dates; range {dt.min()} → {dt.max()}")
notes.append(f"BTC : price NA = {close.isna().sum()} rows")

print("\n".join(notes))


GOLD: parsed 100.00% dates; range 2014-11-15 06:00:00+00:00 → 2025-10-29 23:00:00+00:00
GOLD: price NA = 0 rows
GOLD: duplicated months = 0
BTC : parsed 100.00% dates; range 2014-11-15 06:00:00+00:00 → 2025-10-29 23:00:00+00:00
BTC : price NA = 0 rows
