1. Import & Path

In [1]:
from pathlib import Path
import pandas as pd

RAW = Path("data/raw")
OUT = Path("data/interim"); OUT.mkdir(parents=True, exist_ok=True)

2. Load & Parse

In [2]:
# File location
btc_path = RAW / "btc-hourly-price_2015_2025.csv"
df = pd.read_csv(btc_path, usecols=["TIME_UNIX","CLOSE_PRICE"], low_memory=False)

assert {"TIME_UNIX","CLOSE_PRICE"}.issubset(df.columns), df.columns.tolist()
display(df.head(3)); df.info()

# Detect epoch unit (seconds vs milliseconds)
unix = pd.to_numeric(df["TIME_UNIX"], errors="coerce")
unit = "ms" if unix.dropna().median() > 1e12 else "s"

# Datetime UTC & numeric price (bersihkan $, , jika ada)
dt_utc = pd.to_datetime(unix, unit=unit, utc=True, errors="coerce")
close = pd.to_numeric(
    df["CLOSE_PRICE"].astype(str).str.replace(r"[,$]", "", regex=True),
    errors="coerce")

btc = (
    pd.DataFrame({"dt": dt_utc, "BTC_USD": close})
      .dropna()
      .sort_values("dt"))

print("Parse rate:", f"{dt_utc.notna().mean():.2%}",
      "| Range:", btc["dt"].min(), "->", btc["dt"].max())
print("Non-positive price:", int((btc["BTC_USD"] <= 0).sum()))

Unnamed: 0,TIME_UNIX,CLOSE_PRICE
0,1416031200,396.15
1,1416034800,397.15
2,1416038400,399.9


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96042 entries, 0 to 96041
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   TIME_UNIX    96042 non-null  int64  
 1   CLOSE_PRICE  96042 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB
Parse rate: 100.00% | Range: 2014-11-15 06:00:00+00:00 -> 2025-10-29 23:00:00+00:00
Non-positive price: 0


3. Deduplicate, resample to monthly, & save 

In [3]:
# Deduplicate timestamp, filter 2020–2025
btc = btc[~btc["dt"].duplicated(keep="last")]

start = pd.Timestamp("2020-01-01", tz="UTC")
end   = pd.Timestamp("2025-12-31", tz="UTC")
btc   = btc[(btc["dt"] >= start) & (btc["dt"] <= end)]
print("Rows (2020–2025):", len(btc))

# Resample to end-of-month close
btc_m = btc.set_index("dt")["BTC_USD"].resample("M").last().to_frame()

# Remove timezone from index
btc_m.index = btc_m.index.tz_localize(None)
btc_m.index.name = "Date"

# Quality control (QC)
expected = pd.period_range("2020-01", "2025-12", freq="M").to_timestamp("M")
missing = expected.difference(btc_m.index)

print("Monthly rows:", len(btc_m),
      "| Range:", btc_m.index.min(), "->", btc_m.index.max())
print("Missing months:", len(missing), list(missing)[:8])  # preview 8 saja

# Save
out_file = OUT / "btc_monthly_close_2020_2025.csv"
btc_m.to_csv(out_file, index_label="Date")
print("Saved ->", out_file.resolve())

Rows (2020–2025): 51096
Monthly rows: 70 | Range: 2020-01-31 00:00:00 -> 2025-10-31 00:00:00
Missing months: 2 [Timestamp('2025-11-30 00:00:00'), Timestamp('2025-12-31 00:00:00')]
Saved -> C:\Users\Noveno\OneDrive\CA1-BTC-Gold-Correlation\data\interim\btc_monthly_close_2020_2025.csv


  btc_m = btc.set_index("dt")["BTC_USD"].resample("M").last().to_frame()
