# Summary

The code to acquire and pre-process all of the price data for use in future analysis.

In [1]:
# Notebook bootstrap.
# When the kernel is not already sitting in the "mtg-modeling" directory, run
# the shared header script inside this notebook's namespace (`%run -i`).
# NOTE(review): judging by the cell output, the header changes the working
# directory to the project root — confirm against scripts/notebook_header.py.
import os

cwd_name = os.path.split(os.getcwd())[1]
if cwd_name != "mtg-modeling":
    get_ipython().run_line_magic("run", '-i "../../scripts/notebook_header.py"')  # type: ignore

Changed working directory to: d:\mtg-modeling


In [2]:
from pathlib import Path

import pandas as pd
import polars as pl

from src.data.mtgjson_wrangler import MtgPricesJsonWrangler

In [3]:
# --- Configuration -----------------------------------------------------------
# PROCESS_DATA: when True the price data is rebuilt from the raw JSON dump;
# when False the wrangler is pointed at the already-flattened file `filename`.
PROCESS_DATA = True

# Flattened price file handed to the wrangler only when PROCESS_DATA is False.
filename = "flat_prices_2024-05-25_2024-08-24.parquet"

# Locations used throughout the notebook (relative to the working directory).
# `Path` must be imported explicitly in the imports cell: relying on the header
# script to inject it fails when the header run is skipped.
paths = {
    # Raw AllPrices JSON dump.
    "raw_file": Path("data/raw/mtgjson/AllPrices/AllPrices.json"),
    # Destination of the joined "wide_prices_*" parquet written below.
    "interim": Path("data/interim/mtgjson/AllPrices"),
    # Directory holding "wide_cards.parquet" (card attributes).
    "interim_cards": Path("data/interim/mtgjson/AllPrintings"),
    # Destination of the per-set "prices_OTJ_*" parquet written below.
    "processed": Path("data/processed/mtgjson/AllPrices"),
}

In [4]:
# Obtain a wrangler: either reuse the previously flattened file, or rebuild the
# data from the raw JSON (convert to parquet, then unstack).
if not PROCESS_DATA:
    wrangler = MtgPricesJsonWrangler(paths, filename=filename)
else:
    wrangler = MtgPricesJsonWrangler(paths)
    wrangler.raw_json_to_parquet()
    wrangler.unstack_data()

# Load the resulting price table and show its dimensions.
df = wrangler.load_data()
df.shape

Reading JSON
Writing data...
Metadata written!
Interim data written!
Unstacking data...
Data unstacked!
Saving data...
Data saved!
Final data loaded!


(46959192, 8)

In [5]:
# Card attributes to attach to each price row; "uuid" is the join key.
card_columns = ["uuid", "name", "setCode", "number", "rarity", "type", "colors"]
cards_file = paths["interim_cards"] / "wide_cards.parquet"

# Lazy scan so only the selected columns are materialised.
df_cards = pl.scan_parquet(cards_file).select(card_columns).collect()
df_cards.shape

(96216, 7)

In [6]:
# Attach card attributes to every price row. The inner join keeps only prices
# whose uuid has a matching card, so the row count can shrink.
df_full = df.join(df_cards, on="uuid", how="inner")

size_bytes = df_full.estimated_size()
print(f"Shape: {df_full.shape}")
print(f"Mem Size in GB: {size_bytes / (1024**3):.2f}")
df_full.head(1)

Shape: (46602076, 14)
Mem Size in GB: 5.41


uuid,medium,providers,currency,list,finish,date,price,name,setCode,number,rarity,type,colors
str,str,str,str,str,str,date,f64,str,str,str,str,str,str
"""00010d56-fe38-5e35-8aed-518019…","""paper""","""cardkingdom""","""USD""","""buylist""","""foil""",2024-05-23,5.0,"""Sphinx of the Final Word""","""POGW""","""63s""","""mythic""","""Creature — Sphinx""","""U"""


In [7]:
# Persist the joined table, stamping the covered date range into the file name.
# min_date / max_date are reused by the per-set export further down.
date_column = df_full["date"]
min_date, max_date = date_column.min(), date_column.max()

wide_prices_file = paths["interim"] / f"wide_prices_{min_date}_{max_date}.parquet"
df_full.write_parquet(wide_prices_file)

In [8]:
# Extract the rows for set code "OTJ" and store them as a processed dataset.
# The file name reuses the date range computed from the full table.
is_otj = pl.col("setCode") == "OTJ"
df_otj = df_full.filter(is_otj)

otj_file = paths["processed"] / f"prices_OTJ_{min_date}_{max_date}.parquet"
df_otj.write_parquet(otj_file)
df_otj.shape

(268816, 14)