# Summary

This notebook acquires and pre-processes the full AllPrices price data so it can be used in future analysis.

In [2]:
# Setup Notebook
# Run the shared header script only when the kernel's working directory is not
# already the 'mtg-modeling' folder; the recorded output shows the script
# changing the working directory.
import os
if os.path.basename(os.getcwd()) != 'mtg-modeling':
    # `-i` executes the script in this notebook's namespace rather than an
    # empty one, so names it defines are visible to later cells.
    %run -i "../../scripts/notebook_header.py"

Changed working directory to: d:\mtg-modeling


In [None]:
import pandas as pd
import polars as pl
from src.data.mtgjson_data import MtgPricesJsonWrangler

ModuleNotFoundError: No module named 'polars'

In [None]:
# Location of the raw price dump plus the directories that receive the
# intermediate and processed outputs.
paths = dict(
    raw_filepath="data/raw/mtgjson/AllPricesJson/AllPrices.json",
    interim_path="data/interim/mtgjson/prices",
    processed_path="data/processed/mtgjson/prices",
)

# Project helper configured with the locations above.
wrangler = MtgPricesJsonWrangler(paths)

In [None]:
# Convert the raw JSON dump. According to the recorded log this reads the JSON,
# writes metadata to the processed directory and the price data to the interim
# directory.
wrangler.raw_json_to_parquet()

Reading JSON
Writing metadata to processed directory
Metadata written!
Writing data to interim directory
Data written!
Data persisted


In [None]:
# Keep only the first 1000 rows (presumably to keep the exploratory cells fast).
# NOTE: this overwrites wrangler.data, so the full data set is no longer
# reachable through the wrangler once this cell has run.
wrangler.data = wrangler.data[:1000]
wrangler.data.head()

Unnamed: 0,uuid,data
0,00010d56-fe38-5e35-8aed-518019aa36a5,{'paper': {'cardkingdom': {'buylist': {'foil':...
1,0001e0d0-2dcd-5640-aadc-a84765cf5fc9,{'paper': {'cardkingdom': {'buylist': {'normal...
2,0003caab-9ff5-5d1a-bc06-976dd0457f19,{'mtgo': {'cardhoarder': {'retail': {'foil': {...
3,0003d249-25d9-5223-af1e-1130f09622a7,{'mtgo': {'cardhoarder': {'retail': {'foil': {...
4,0004a4fb-92c6-59b2-bdbe-ceb584a9e401,{'mtgo': {'cardhoarder': {'retail': {'normal':...


In [None]:
data_filename = "allCardPrices.parquet"

# paths["interim_path"] is defined above as a plain string, and `str / str`
# raises TypeError. os.path.join accepts both strings and path-like objects, so
# this works whichever type the entry holds.
df = pl.read_parquet(os.path.join(paths["interim_path"], data_filename))
# Work on the first 1000 rows while exploring the nested structure.
df = df[:1000]
print(df.head(5))

shape: (5, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ uuid                            ┆ data                            │
│ ---                             ┆ ---                             │
│ str                             ┆ struct[2]                       │
╞═════════════════════════════════╪═════════════════════════════════╡
│ 00010d56-fe38-5e35-8aed-518019… ┆ {null,{{{null,{5.5,5.5,5.5,5.5… │
│ 0001e0d0-2dcd-5640-aadc-a84765… ┆ {null,{{{null,null,{null,1.85,… │
│ 0003caab-9ff5-5d1a-bc06-976dd0… ┆ {{{"USD",{{0.01,0.01,0.01,0.01… │
│ 0003d249-25d9-5223-af1e-1130f0… ┆ {{{"USD",{{0.01,0.01,0.01,0.01… │
│ 0004a4fb-92c6-59b2-bdbe-ceb584… ┆ {{{"USD",{null,{0.03,0.03,0.03… │
└─────────────────────────────────┴─────────────────────────────────┘


In [None]:
# Expand the `data` struct into one column per field (`mtgo` and `paper` in the
# recorded output) and preview the result.
print(df.unnest('data').head())

shape: (5, 3)
┌────────────────────────────────┬────────────────────────────────┬────────────────────────────────┐
│ uuid                           ┆ mtgo                           ┆ paper                          │
│ ---                            ┆ ---                            ┆ ---                            │
│ str                            ┆ struct[1]                      ┆ struct[4]                      │
╞════════════════════════════════╪════════════════════════════════╪════════════════════════════════╡
│ 00010d56-fe38-5e35-8aed-518019 ┆ null                           ┆ {{{null,{5.5,5.5,5.5,5.5,5.5,5 │
│ …                              ┆                                ┆ …                              │
│ 0001e0d0-2dcd-5640-aadc-a84765 ┆ null                           ┆ {{{null,null,{null,1.85,1.1,1. │
│ …                              ┆                                ┆ …                              │
│ 0003caab-9ff5-5d1a-bc06-976dd0 ┆ {{"USD",{{0.01,0.01,0.01,0.01, ┆ {{{null,{

In [None]:
# Reshape to long form: one row per (uuid, top-level key) with the nested
# struct in `value`; keys a card has no data for are dropped.
df1 = (df
    .unnest("data")  # struct fields become columns
    .unpivot(index='uuid')  # remaining columns -> `variable` / `value` rows
    .drop_nulls('value')  # discard rows whose `value` is null
    )
print(df1.head())

shape: (5, 3)
┌─────────────────────────────────┬──────────┬─────────────────────────────────┐
│ uuid                            ┆ variable ┆ value                           │
│ ---                             ┆ ---      ┆ ---                             │
│ str                             ┆ str      ┆ struct[5]                       │
╞═════════════════════════════════╪══════════╪═════════════════════════════════╡
│ 0003caab-9ff5-5d1a-bc06-976dd0… ┆ mtgo     ┆ {null,null,null,null,{"USD",{{… │
│ 0003d249-25d9-5223-af1e-1130f0… ┆ mtgo     ┆ {null,null,null,null,{"USD",{{… │
│ 0004a4fb-92c6-59b2-bdbe-ceb584… ┆ mtgo     ┆ {null,null,null,null,{"USD",{n… │
│ 0005d268-3fd0-5424-bc6b-573ecd… ┆ mtgo     ┆ {null,null,null,null,{"USD",{{… │
│ 0005f481-f2d4-53fa-ba37-cfcf5a… ┆ mtgo     ┆ {null,null,null,null,{"USD",{{… │
└─────────────────────────────────┴──────────┴─────────────────────────────────┘


In [None]:
# pandas counterpart of the polars unnest: turn each dict in `data` into one
# column per top-level key (`paper` and `mtgo` in the recorded output).
wrangler.data["data"].apply(pd.Series)

Unnamed: 0,paper,mtgo
0,{'cardkingdom': {'buylist': {'foil': {'2024-05...,
1,{'cardkingdom': {'buylist': {'normal': {'2024-...,
2,{'cardkingdom': {'buylist': {'foil': {'2024-05...,{'cardhoarder': {'retail': {'foil': {'2024-05-...
3,{'cardkingdom': {'buylist': {'foil': {'2024-05...,{'cardhoarder': {'retail': {'foil': {'2024-05-...
4,{'cardkingdom': {'buylist': {'foil': {'2024-05...,{'cardhoarder': {'retail': {'normal': {'2024-0...
...,...,...
995,{'cardkingdom': {'retail': {'foil': {'2024-05-...,{'cardhoarder': {'retail': {'foil': {'2024-05-...
996,{'cardkingdom': {'buylist': {'foil': {'2024-05...,{'cardhoarder': {'retail': {'foil': {'2024-05-...
997,{'cardkingdom': {'retail': {'normal': {'2024-0...,
998,{'cardkingdom': {'buylist': {'foil': {'2024-06...,{'cardhoarder': {'retail': {'foil': {'2024-05-...


In [None]:
def extract_dict(df, var_keys=['paper', 'mtgo'], var_name='format', id_vars=['uuid'], index_vars=None):
    """Unpack one level of the nested dicts held in ``df['data']`` into long form.

    Each dict in the ``data`` column is expanded into one column per key, and
    those columns are then melted so that every (identifier, key) pair becomes
    its own row. The key goes into ``var_name`` and the value stored under it
    goes back into ``data``. Rows whose value is missing are dropped.

    Unlike the previous version, the caller's frame is not modified.

    Args:
        df: pandas DataFrame with a ``data`` column of dicts plus the columns
            named in ``index_vars``.
        var_keys: Dict keys to melt. ``None`` melts every expanded column that
            is not in ``id_vars``. A key that no row contains is treated as
            all-missing instead of raising ``KeyError``.
        var_name: Name of the output column that receives the dict key.
        id_vars: Identifier columns kept on every output row. May include
            columns that only exist after expansion (e.g. a key of the dicts).
        index_vars: Columns of ``df`` to carry through the expansion. Defaults
            to ``id_vars``.

    Returns:
        DataFrame with columns ``id_vars + [var_name, 'data']``. The index is
        whatever ``melt`` produced minus the dropped rows (it is not reset).
    """
    # Normalise to a list without touching the (shared) default argument.
    id_vars = [id_vars] if isinstance(id_vars, str) else list(id_vars)
    if index_vars is None:
        index_vars = id_vars

    # Nothing to expand: return an empty frame with the expected columns.
    if df.empty:
        return pd.DataFrame(columns=[*id_vars, var_name, 'data'])

    # Non-inplace set_index so the caller's frame keeps its columns.
    expanded = df.set_index(index_vars)['data'].apply(pd.Series).reset_index()

    if var_keys is not None:
        var_keys = [var_keys] if isinstance(var_keys, str) else list(var_keys)
        # A key that is absent from every dict in this frame would make melt
        # raise KeyError; add it as an all-missing column so it is simply
        # dropped by the dropna below.
        for key in var_keys:
            if key not in expanded.columns:
                expanded[key] = None

    melted = expanded.melt(id_vars=id_vars, value_vars=var_keys, value_name='data', var_name=var_name)
    return melted.dropna(subset=['data'])

In [None]:
def process_price_data(df):
    """Flatten a chunk of nested price dicts into one row per price point.

    The index of ``df`` is moved into a column (an unnamed index becomes
    ``uuid``), then ``extract_dict`` is applied once per nesting level:
    format, provider, list, type and finally date. The shape before each
    stage is printed as a progress log.

    Unlike the previous version, the caller's frame is not modified.

    Args:
        df: pandas DataFrame whose index identifies the card and which has a
            ``data`` column of nested dicts.
            NOTE(review): an unnamed index is assumed, since only a column
            called 'index' is renamed to 'uuid'.

    Returns:
        DataFrame with columns uuid, format, providers, currency, list, type,
        date and price, sorted by all key columns, with a fresh integer index.
    """
    # Non-inplace so the chunk passed in by the caller is left as it was.
    df = df.reset_index().rename(columns={'index': 'uuid'})

    # One entry per nesting level: (label used in the log line, arguments for
    # extract_dict). `currency` appears as an id column from the List stage on
    # because it is one of the keys expanded at that level, which is why that
    # stage needs a separate `index_vars`.
    stages = [
        ('Format', dict(
            var_keys=['paper', 'mtgo'],
            var_name='format',
            id_vars=['uuid'])),
        ('Provider', dict(
            var_keys=['cardhoarder', 'cardkingdom', 'cardmarket', 'cardsphere', 'tcgplayer'],
            var_name='providers',
            id_vars=['uuid', 'format'])),
        ('List', dict(
            var_keys=['retail', 'buylist'],
            var_name='list',
            id_vars=['uuid', 'format', 'providers', 'currency'],
            index_vars=['uuid', 'format', 'providers'])),
        ('Type', dict(
            var_keys=['normal', 'foil'],
            var_name='type',
            id_vars=['uuid', 'format', 'providers', 'currency', 'list'])),
        ('Date & Price', dict(
            var_keys=None,
            var_name='date',
            id_vars=['uuid', 'format', 'providers', 'currency', 'list', 'type'])),
    ]
    for label, kwargs in stages:
        print(f"   Starting {label} Extraction.  Shape: {df.shape}")
        df = extract_dict(df, **kwargs)

    print(f"   Sorting and Saving.  Shape: {df.shape}")
    return (
        df.rename(columns={'data': 'price'})
        .sort_values(['uuid', 'format', 'providers', 'currency', 'list', 'type', 'date'])
        .reset_index(drop=True)
    )

In [None]:
# Flatten the price data in fixed-size chunks and write each chunk to its own
# parquet file; tot_len accumulates the number of flattened rows.
# NOTE(review): this cell uses pandas indexing on `df`, but the last visible
# assignment to `df` is a polars frame, and `interim_root` is not defined in
# any visible cell. Confirm both are set before running from a fresh kernel.
tot_len = 0
chunk_size = 10000
for i, start in enumerate(range(0, df.shape[0], chunk_size)):
    # Report the real row offset; the previous `i*10000` was only right while
    # chunk_size stayed at 10000.
    print(f"Processing {start}/{df.shape[0]}")
    end = min(start + chunk_size, df.shape[0])
    # Positional slice (label lookup over-selects if index labels repeat);
    # copy so processing never touches the parent frame.
    df_chunk = df.iloc[start:end].copy()
    df_chunk = process_price_data(df_chunk)
    df_chunk.to_parquet(interim_root / f'AllPrices_chunk_{i}.parquet')
    tot_len += df_chunk.shape[0]
tot_len

Processing 0/101571
   Starting Format Extraction.  Shape: (10000, 2)
   Starting Provider Extraction.  Shape: (15592, 3)
   Starting List Extraction.  Shape: (43816, 4)
   Starting Type Extraction.  Shape: (48789, 6)
   Starting Date & Price Extraction.  Shape: (71351, 7)
   Sorting and Saving.  Shape: (5855193, 8)
Processing 10000/101571
   Starting Format Extraction.  Shape: (10000, 2)
   Starting Provider Extraction.  Shape: (15631, 3)
   Starting List Extraction.  Shape: (43850, 4)
   Starting Type Extraction.  Shape: (48734, 6)
   Starting Date & Price Extraction.  Shape: (71406, 7)
   Sorting and Saving.  Shape: (5853369, 8)
Processing 20000/101571
   Starting Format Extraction.  Shape: (10000, 2)
   Starting Provider Extraction.  Shape: (15614, 3)
   Starting List Extraction.  Shape: (43762, 4)
   Starting Type Extraction.  Shape: (48726, 6)
   Starting Date & Price Extraction.  Shape: (71217, 7)
   Sorting and Saving.  Shape: (5841349, 8)
Processing 30000/101571
   Starting Fo

48143467

In [None]:
# Read chunk files 0..n back in and stitch them into a single frame.
n = 10
dfs = []
for j in range(n + 1):
    print(f"Loading {j}/{n}")
    dfs.append(pd.read_parquet(interim_root / f'AllPrices_chunk_{j}.parquet'))
df = pd.concat(dfs)
print(df.shape)
# assert df.shape[0] == tot_len, f"{df.shape[0]} != {tot_len}"

Loading 0/10


Loading 1/10
Loading 2/10
Loading 3/10
Loading 4/10
Loading 5/10
Loading 6/10
Loading 7/10
Loading 8/10
Loading 9/10
Loading 10/10
(48143467, 8)


: 

In [None]:
# The date range of the data is embedded in the output name.
min_date = df['date'].min()
max_date = df['date'].max()
# Partition only on the low-cardinality key columns. Including `uuid` and
# `date` as well would give nearly every row its own partition directory, and
# the frame loaded above has about 48 million rows. Filtering on `uuid` or
# `date` at read time does not require them to be partition columns.
df.to_parquet(
    processed_root / f'AllPrices_{min_date}_{max_date}.parquet',
    engine='pyarrow',
    compression='snappy',
    partition_cols=['format', 'providers', 'currency', 'list', 'type'],
)

In [None]:
# Delete the intermediate chunk files 0..n; files that are already gone are
# skipped.
for j in range(n + 1):
    file_path = interim_root / f'AllPrices_chunk_{j}.parquet'
    if not os.path.exists(file_path):
        continue
    os.remove(file_path)

In [None]:
# The date range is hard-coded here instead of reusing the values computed in
# the write cell (presumably so this cell can run without the full frame in
# memory). The filter restricts the read to the rows of a single card.
min_date = '2024-05-16'
max_date = '2024-08-16'
df = pd.read_parquet(processed_root / f'AllPrices_{min_date}_{max_date}.parquet', engine='pyarrow', filters=[('uuid', '==', '00010d56-fe38-5e35-8aed-518019aa36a5')])
df.head()

Unnamed: 0,uuid,format,providers,currency,list,type,date,price
0,00010d56-fe38-5e35-8aed-518019aa36a5,paper,cardkingdom,USD,buylist,foil,2024-05-16,5.5
1,00010d56-fe38-5e35-8aed-518019aa36a5,paper,cardkingdom,USD,buylist,foil,2024-05-17,5.5
2,00010d56-fe38-5e35-8aed-518019aa36a5,paper,cardkingdom,USD,buylist,foil,2024-05-18,5.5
3,00010d56-fe38-5e35-8aed-518019aa36a5,paper,cardkingdom,USD,buylist,foil,2024-05-19,5.5
4,00010d56-fe38-5e35-8aed-518019aa36a5,paper,cardkingdom,USD,buylist,foil,2024-05-20,5.5
