1. Import & Path

In [1]:
from pathlib import Path
import pandas as pd 

# Directory holding the cleaned, per-asset monthly CSVs produced upstream.
INT = Path("data/interim")

# Fail fast, with an asset-specific message, when an interim input is absent.
for required_name, missing_msg in (
    ("gold_monthly_clean_2020_2025.csv", "There is no interim Gold yet"),
    ("btc_monthly_close_2020_2025.csv", "There is no interim BTC yet"),
):
    assert INT.joinpath(required_name).exists(), missing_msg

2. Load & align index to EOM (end-of-month)

In [2]:
def _load_monthly_eom(csv_path: Path) -> pd.DataFrame:
    """Read one interim CSV and return it indexed by month-end timestamps.

    Steps:
      1. Parse the ``Date`` column and use it as the index.
      2. Sort chronologically *before* the day-of-month information is
         discarded, so that ``keep="last"`` below really keeps the latest
         observation of each month rather than whichever row happens to
         come last in the file.
      3. Collapse every date to its month-end timestamp.
      4. Keep one row per month (the chronologically last one), drop rows
         containing any NA, and return the frame sorted by index.

    Parameters
    ----------
    csv_path : Path
        CSV file that contains a ``Date`` column.

    Returns
    -------
    pd.DataFrame
        Frame with a unique, ascending, month-end ``Date`` index.
    """
    frame = pd.read_csv(csv_path, parse_dates=["Date"]).set_index("Date")

    # Stable sort: rows sharing the exact same date keep their file order.
    frame = frame.sort_index(kind="mergesort")

    # Month period -> month-end timestamp (e.g. 2020-02-14 -> 2020-02-29).
    frame.index = frame.index.to_period("M").to_timestamp("M")

    # Deduplicate the index if needed, drop NA rows, sort.
    frame = frame[~frame.index.duplicated(keep="last")]
    return frame.dropna().sort_index()


gold = _load_monthly_eom(INT/"gold_monthly_clean_2020_2025.csv")
btc  = _load_monthly_eom(INT/"btc_monthly_close_2020_2025.csv")

3. QC Missing Months

In [3]:
# Full month-end calendar the study window is supposed to cover.
expected = pd.period_range(start="2020-01", end="2025-12", freq="M").to_timestamp("M")

# Months of the calendar that are absent from each series.
miss_gold = expected.difference(gold.index)
miss_btc = expected.difference(btc.index)

# Report the count plus (at most) the first six missing months per asset.
for qc_label, qc_missing in (
    ("Gold missing months  :", miss_gold),
    ("BTC missing months   :", miss_btc),
):
    print(qc_label, len(qc_missing), list(qc_missing)[:6])

Gold missing months  : 5 [Timestamp('2025-08-31 00:00:00'), Timestamp('2025-09-30 00:00:00'), Timestamp('2025-10-31 00:00:00'), Timestamp('2025-11-30 00:00:00'), Timestamp('2025-12-31 00:00:00')]
BTC missing months   : 2 [Timestamp('2025-11-30 00:00:00'), Timestamp('2025-12-31 00:00:00')]


4. Merge (inner join) + safe range filter

In [4]:
# Keep only months present in BOTH series, then clamp to the study window.
study_window = slice("2020-01-31", "2025-12-31")
merged = gold.join(btc, how="inner").sort_index().loc[study_window]

# Quick sanity report: size, covered date range, and remaining NA per column.
first_month = merged.index.min()
last_month = merged.index.max()
print(f"Merged shape  : {merged.shape}")
print(f"Range         : {first_month} -> {last_month}")
print("NA counts     :\n", merged.isna().sum())

# Show both ends of the merged table.
display(merged.head(3), merged.tail(3))

Merged shape  : (67, 2)
Range         : 2020-01-31 00:00:00 -> 2025-07-31 00:00:00
NA counts     :
 Gold_USD    0
BTC_USD     0
dtype: int64


Unnamed: 0_level_0,Gold_USD,BTC_USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-31,1560.67,9342.23
2020-02-29,1597.1,8545.45
2020-03-31,1591.93,6423.61


Unnamed: 0_level_0,Gold_USD,BTC_USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-05-31,3309.49,104696.46
2025-06-30,3352.66,107147.75
2025-07-31,3340.15,116009.4


5. Save merge result

In [5]:
# Persist the merged table next to the other interim files; the index is
# written out as an explicit "Date" column.
out_merged = INT.joinpath("merged_gold_btc_monthly_2020_2025.csv")
merged.to_csv(out_merged, index_label="Date")
print(f"Saved -> {out_merged.resolve()}")

Saved -> C:\Users\Noveno\OneDrive\CA1-BTC-Gold-Correlation\data\interim\merged_gold_btc_monthly_2020_2025.csv
