1. Setup

In [None]:
from pathlib import Path 
import pandas as pd 

2. Audit GOLD (monthly)

In [None]:
# Raw folder location
RAW = Path("data/raw")

# Load the monthly gold price file
df_gold = pd.read_csv(RAW / "gold_monthly_price.csv")

# Preview structure & sample rows
display(df_gold.head())
df_gold.info()
# A bare expression is only rendered when it is the LAST line of a cell;
# wrap it in display() so the summary statistics actually show up.
display(df_gold.describe(include="all"))

# Parse the date column, convert price to numeric, and count duplicated months.
# Stray whitespace is stripped first (same as the checklist cell) so that
# padded values such as " 1833-01" are not coerced to NaT.
g_dt = pd.to_datetime(
    df_gold["Date"].astype(str).str.strip(),
    format="%Y-%m",
    errors="coerce",
)
g_price = pd.to_numeric(df_gold["Price"], errors="coerce")
dup_by_month = g_dt.dt.to_period("M").duplicated().sum()

# Data-quality summary to be recorded in the log/audit
print("Gold parse rate   :", g_dt.notna().mean())
print("Gold date range   :", g_dt.min(), "→", g_dt.max())
print("Gold price NA     :", g_price.isna().sum())
print("Duplicated months :", dup_by_month)

Unnamed: 0,Date,Price
0,1833-01,18.93
1,1833-02,18.93
2,1833-03,18.93
3,1833-04,18.93
4,1833-05,18.93


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2311 entries, 0 to 2310
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2311 non-null   object 
 1   Price   2311 non-null   float64
dtypes: float64(1), object(1)
memory usage: 36.2+ KB
Gold parse rate   : 1.0
Gold date range   : 1833-01-01 00:00:00 → 2025-07-01 00:00:00
Gold price NA     : 0
Duplicated months : 0


3. Audit BTC (hourly)

In [None]:
# Folder
RAW = Path("data/raw")
btc_path = RAW / "btc-hourly-price_2015_2025.csv"
print("RAW  :", RAW)
print("File :", btc_path.name)

# Load
df = pd.read_csv(btc_path, low_memory=False)
print("Columns:", df.columns.tolist()[:12], "...")

# Validate the columns this audit depends on
REQUIRED = {"TIME_UNIX", "CLOSE_PRICE"}
missing = REQUIRED - set(df.columns)
if missing:
    raise ValueError(f"Required columns missing: {missing}. Available columns: {df.columns.tolist()}")

# Parse datetime & price.
# The epoch unit is guessed from magnitude: values above 1e12 are treated as
# milliseconds, anything else as seconds.
unix = pd.to_numeric(df["TIME_UNIX"], errors="coerce")
unit = "ms" if unix.dropna().median() > 1e12 else "s"
dt_utc = pd.to_datetime(unix, unit=unit, utc=True, errors="coerce")
close  = pd.to_numeric(df["CLOSE_PRICE"], errors="coerce")

# Global quality summary
print("\n== Global quality ==")
print(f"Parse rate datetime : {dt_utc.notna().mean():.2%}")
print("Datetime range       :", dt_utc.min(), "→", dt_utc.max())
print("Price NaN            :", int(close.isna().sum()))
print("Price ≤ 0            :", int((close<=0).sum()))

# Focus range audit 2020–2025.
# `end` is the last calendar DAY of the window. Comparing `dt <= end` would stop
# at 2025-12-31 00:00 and silently drop the remaining 23 hours of that day, so
# the filter uses a half-open interval [start, end + 1 day) instead.
start = pd.Timestamp("2020-01-01", tz="UTC")
end   = pd.Timestamp("2025-12-31", tz="UTC")
end_excl = end + pd.Timedelta(days=1)
btc = (pd.DataFrame({"dt": dt_utc, "close": close})
         .dropna()
         .sort_values("dt"))
sub = btc[(btc["dt"] >= start) & (btc["dt"] < end_excl)].copy()
if sub.empty:
    # Fail with a clear message instead of an opaque int(NaN) error further down.
    raise ValueError(f"No BTC rows found between {start} and {end}")

print("\n== Within 2020–2025 ==")
print("Rows                :", len(sub))
print("Range               :", sub["dt"].min(), "→", sub["dt"].max())

# Check duplicated timestamps and gaps between consecutive rows
dup_ts = int(sub["dt"].duplicated().sum())
sub["step_h"] = sub["dt"].diff().dt.total_seconds().div(3600)
gaps_over_1h = int((sub["step_h"] > 1.0).sum())

print("Duplicated ts       :", dup_ts)
print("Gaps > 1 hour       :", gaps_over_1h)
print(sub["step_h"].describe().rename("Δt (hours)").to_string())

# Data density per month.
# Periods carry no timezone; drop it explicitly (timestamps are already UTC) so
# pandas does not emit "will drop timezone information" on every run.
sub["month"] = sub["dt"].dt.tz_localize(None).dt.to_period("M")
counts = sub.groupby("month").size().rename("rows_per_month")
print("\n== Rows per month (preview) ==")
print("Min / Mean / Max    :", int(counts.min()), int(counts.mean()), int(counts.max()))
print("First months:\n", counts.head().to_string())
print("Last  months:\n", counts.tail().to_string())

# Min/Mean/Max cannot tell a short February from a truncated month, so compare
# each month against the number of hours it should contain.
expected_rows = counts.index.days_in_month.to_numpy() * 24
incomplete = counts[counts.to_numpy() < expected_rows]
print("Incomplete months   :", len(incomplete))
if len(incomplete):
    print(incomplete.to_string())

# Monthly aggregation preview (last available close in each month).
# NOTE: for an incomplete month the value is the last observed close, not a
# true month-end close, even though the row is labelled with the month-end date.
btc_m_preview = (sub.set_index("dt")["close"]
                   .resample("ME").last()
                   .to_frame("BTC_USD"))
print("\n== Monthly close preview ==")
print("Shape               :", btc_m_preview.shape)
print(btc_m_preview.head().to_string())
print(btc_m_preview.tail().to_string())

RAW  : data\raw
File : btc-hourly-price_2015_2025.csv
Columns: ['TIME_UNIX', 'DATE_STR', 'HOUR_STR', 'OPEN_PRICE', 'HIGH_PRICE', 'CLOSE_PRICE', 'LOW_PRICE', 'VOLUME_FROM', 'VOLUME_TO'] ...

== Global quality ==
Parse rate datetime : 100.00%
Datetime range       : 2014-11-15 06:00:00+00:00 → 2025-10-29 23:00:00+00:00
Price NaN            : 0
Price ≤ 0            : 0

== Within 2020–2025 ==
Rows                : 51096
Range               : 2020-01-01 00:00:00+00:00 → 2025-10-29 23:00:00+00:00
Duplicated ts       : 0
Gaps > 1 hour       : 0
count    51095.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0

== Rows per month (preview) ==
Min / Mean / Max    : 672 729 744
First months:
 month
2020-01    744
2020-02    696
2020-03    744
2020-04    720
2020-05    744
Freq: M
Last  months:
 month
2025-06    720
2025-07    744
2025-08    744
2025-09    720
2025-10    696
Freq: M

== Monthly close preview ==
Shape            

  sub["month"] = sub["dt"].dt.to_period("M")


4. Checklist Audit 

In [None]:
# Condensed audit checklist: re-reads both raw files so this cell can be run on
# its own and prints one summary line per check.

# GOLD
g = pd.read_csv(RAW / "gold_monthly_price.csv")
g_dt = pd.to_datetime(g["Date"].astype(str).str.strip(), format="%Y-%m", errors="coerce")
g_price = pd.to_numeric(g["Price"], errors="coerce")
dup_by_month = g_dt.dt.to_period("M").duplicated().sum()

# BTC (low_memory=False matches the BTC audit cell's read settings)
b = pd.read_csv(RAW / "btc-hourly-price_2015_2025.csv", low_memory=False)
unix = pd.to_numeric(b["TIME_UNIX"], errors="coerce")
# Epoch unit is guessed from magnitude: above 1e12 means milliseconds, else seconds.
unit = "ms" if unix.dropna().median() > 1e12 else "s"
b_dt = pd.to_datetime(unix, unit=unit, utc=True, errors="coerce")
b_price = pd.to_numeric(b["CLOSE_PRICE"], errors="coerce")

# Focus range 2020–2025.
# `end` is the last calendar DAY of the window; a `<= end` comparison would cut
# the window at 2025-12-31 00:00 and drop the rest of that day, so the mask is
# built on the half-open interval [start, end + 1 day).
start = pd.Timestamp("2020-01-01", tz="UTC")
end = pd.Timestamp("2025-12-31", tz="UTC")
end_excl = end + pd.Timedelta(days=1)
mask = (b_dt >= start) & (b_dt < end_excl)
dup_ts_2020_2025 = b_dt[mask].duplicated().sum()

lines = [
    f"GOLD : parsed {g_dt.notna().mean():.2%}; range {g_dt.min()} → {g_dt.max()}",
    f"GOLD : price NA = {int(g_price.isna().sum())}; duplicated months = {int(dup_by_month)}",
    f"BTC  : parsed {b_dt.notna().mean():.2%}; range {b_dt.min()} → {b_dt.max()}",
    f"BTC  : (2020–2025) price NA = {int(b_price[mask].isna().sum())}; duplicated timestamps = {int(dup_ts_2020_2025)}",
]
print("\n".join(lines))

GOLD : parsed 100.00%; range 1833-01-01 00:00:00 → 2025-07-01 00:00:00
GOLD : price NA = 0; duplicated months = 0
BTC  : parsed 100.00%; range 2014-11-15 06:00:00+00:00 → 2025-10-29 23:00:00+00:00
BTC  : (2020–2025) price NA = 0; duplicated timestamps = 0
