1. Imports & Paths

In [1]:
from pathlib import Path
import pandas as pd


# Project-relative folders: raw inputs are read from RAW, cleaned outputs go to OUT.
RAW = Path("data/raw")
OUT = Path("data/interim")

# Create the output folder (and any missing parents) up front; a no-op if it already exists.
OUT.mkdir(parents=True, exist_ok=True)

2. Load, Clean & Filter

In [2]:
# Load the raw monthly gold price file and check that the expected columns are present.
df = pd.read_csv(RAW / "gold_monthly_price.csv")
display(df.head(3))
df.info()
assert {"Date", "Price"}.issubset(df.columns), df.columns.tolist()

# Analysis window as "YYYY-MM" labels. The filter bounds and the QC row target
# below are all derived from these two values, so they cannot drift apart.
START_MONTH, END_MONTH = "2020-01", "2025-12"

# Parse data types. errors="coerce" turns unparseable values into NaT/NaN,
# which are then removed by the dropna() below instead of raising.
dates = pd.to_datetime(df["Date"].astype(str).str.strip(), format="%Y-%m", errors="coerce")
prices = pd.to_numeric(df["Price"], errors="coerce")

# The sort must be stable: rows of the same month parse to the same timestamp,
# and drop_duplicates(keep="last") below relies on their original file order.
# The default quicksort gives no such guarantee.
gold = (
    pd.DataFrame({"Date": dates, "Gold_USD": prices})
    .dropna(subset=["Date", "Gold_USD"])
    .sort_values("Date", kind="mergesort")
)

# Make sure there is only one observation per month (the last one in file order wins).
gold["Month"] = gold["Date"].dt.to_period("M")
gold_m = (
    gold.drop(columns="Date")
    .drop_duplicates(subset=["Month"], keep="last")
    .set_index("Month")
    .sort_index()
)

# Re-index on month-end timestamps (EOM).
gold_m.index = gold_m.index.to_timestamp("M")
gold_m.index.name = "Date"
# A bare expression in the middle of a cell is discarded, so display it explicitly.
display(gold_m.head())

# Month-end timestamps expected inside the analysis window.
expected = pd.period_range(START_MONTH, END_MONTH, freq="M").to_timestamp("M")

# Filter to the analysis window (label slicing is inclusive on both ends).
gold_m = gold_m.loc[expected[0]:expected[-1]]

# Quality control (QC): report coverage and list months with no observation.
missing = expected.difference(gold_m.index)

print("Rows          :", len(gold_m), f"(target {len(expected)})")
print("Range         :", gold_m.index.min(), "->", gold_m.index.max())
print("Missing month :", len(missing), list(missing)[:8])

# Save to interim
out_file = OUT / "gold_monthly_clean_2020_2025.csv"
gold_m.to_csv(out_file, index_label="Date")
print("Saved ->", out_file.resolve())

Unnamed: 0,Date,Price
0,1833-01,18.93
1,1833-02,18.93
2,1833-03,18.93


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2311 entries, 0 to 2310
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2311 non-null   object 
 1   Price   2311 non-null   float64
dtypes: float64(1), object(1)
memory usage: 36.2+ KB
Rows          : 67 (target 72)
Range         : 2020-01-31 00:00:00 -> 2025-07-31 00:00:00
Missing month : 5 [Timestamp('2025-08-31 00:00:00'), Timestamp('2025-09-30 00:00:00'), Timestamp('2025-10-31 00:00:00'), Timestamp('2025-11-30 00:00:00'), Timestamp('2025-12-31 00:00:00')]
Saved -> C:\Users\Noveno\OneDrive\CA1-BTC-Gold-Correlation\data\interim\gold_monthly_clean_2020_2025.csv
