# Summary

This notebook contains the code to wrangle the data: dealing with NAs, joining data frames, etc.

In [1]:
# Setup Notebook
import os

# Run the shared header script unless the kernel's working directory is
# already the "mtg-modeling" directory.
if os.path.split(os.getcwd())[1] != "mtg-modeling":
    get_ipython().run_line_magic("run", '-i "../../scripts/notebook_header.py"')  # type: ignore

Changed working directory to: /root/mtg-modeling


In [2]:
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
import polars as pl

In [3]:
# Relative data directories, keyed by pipeline stage.
paths = dict(
    raw=Path("data/raw/17lands/game_data/PremierDraft"),
    interim=Path("data/interim/17lands/game_data/premier_draft"),
    processed=Path("data/processed/17lands/game_data/premier_draft"),
)


In [4]:
# Raw CSV input, its parquet copy in the interim directory, and the final
# output file in the processed directory.
csv_file = paths["raw"].joinpath("game_data_public.BLB.PremierDraft.csv")
parquet_file = paths["interim"].joinpath("game_data_public.BLB.PremierDraft.parquet")
final_file = paths["processed"].joinpath("BLB_GameData_PremierDraft.parquet")

# Create the output directories (and any missing parents) before anything is
# written into them; already-existing directories are left alone.
paths["interim"].mkdir(parents=True, exist_ok=True)
paths["processed"].mkdir(parents=True, exist_ok=True)

In [5]:
# Convert the raw CSV to parquet once. The conversion is skipped when the
# parquet file already exists, so a full re-run of the notebook stays cheap;
# delete the file to force a rebuild.
df_lazy = pl.scan_csv(csv_file)
if not parquet_file.exists():
    # sink_parquet streams the scan straight to disk instead of collecting the
    # whole (very wide) table into memory before writing it.
    df_lazy.sink_parquet(parquet_file)

: 

In [15]:
df_lazy = pl.scan_parquet(parquet_file)   # 680,000 rows; 1,400 columns
cols = df_lazy.collect_schema().names()

# Columns starting with one of these prefixes are per-card columns; every
# other column is carried along unchanged as an index column.
status_prefixes = ['tutored_', 'deck_', 'opening_', 'drawn_', 'sideboard_']
card_cols = [col for col in cols if col.startswith(tuple(status_prefixes))]
index_cols = [col for col in cols if col not in card_cols]

results = []
chunk_size = 1000
sample_rows = 10000  # only a leading sample of the file is reshaped here
num_rows = df_lazy.select(pl.len()).collect().item()

# Never iterate past the end of the file: an empty chunk would pivot to a
# frame without the status columns and break the final concat.
for i in tqdm(range(0, min(sample_rows, num_rows), chunk_size)):
    df_chunk = (
        df_lazy.slice(i, chunk_size)
        # Wide -> long: one row per (game, card column).
        .unpivot(
            on=card_cols,
            index=index_cols,
            variable_name="card",
            value_name="count",
        )
        # Split the original column name at its last underscore into the
        # part before it ("status") and the part after it ("name").
        .with_columns(
            pl.col("card").str.extract(r"(.*)_.*").alias("status"),
            pl.col("card").str.extract(r".*_(.*)").alias("name"),
        )
        .drop("card")
        .collect()
        # Long -> one column per status, holding the counts.
        .pivot(
            on="status",
            values="count",
        )
    )

    results.append(df_chunk)

df_final = pl.concat(results)

print(df_final.shape)

  num_rows = df_lazy.select(pl.count()).collect().item()


  0%|          | 0/10 [00:00<?, ?it/s]

(2760000, 26)


In [7]:
df_lazy = pl.scan_parquet(parquet_file)  # 680,000 rows; 1,400 columns
cols = df_lazy.collect_schema().names()

# Column-name prefixes that identify per-card columns; any column without one
# of these prefixes is treated as an index column.
status_prefixes = ["tutored_", "deck_", "opening_", "drawn_", "sideboard_"]
card_cols = [name for name in cols if name.startswith(tuple(status_prefixes))]
deck_cols = [name for name in cols if name.startswith("deck_")]
index_cols = [name for name in cols if name not in card_cols]
index_cols

chunk_size = 25000

def unpivot_cols(df, prefix="deck_"):
    """Unpivot the columns of ``df`` that start with ``prefix`` into long format.

    Each ``<prefix><card name>`` column becomes rows with a ``card_name``
    column (the column name with ``prefix`` removed) and a ``<prefix>count``
    column holding the original cell value. Columns that do not start with
    any entry of the module-level ``status_prefixes`` are kept as index
    columns.

    Args:
        df: A polars frame exposing ``collect_schema`` and ``unpivot`` (the
            notebook passes a LazyFrame).
        prefix: Column-name prefix selecting which card columns to unpivot.

    Returns:
        A frame with columns ``card_name``, ``<prefix>count`` and then the
        index columns, in that order.
    """
    cols = df.collect_schema().names()
    # Card columns of every status are excluded from the index, not only the
    # ones being unpivoted, so the other statuses' wide columns are dropped.
    index_cols = [col for col in cols if not col.startswith(tuple(status_prefixes))]
    prefixed_cols = [col for col in cols if col.startswith(prefix)]
    count_col = f"{prefix}count"

    return (
        df.unpivot(
            on=prefixed_cols,
            index=index_cols,
            variable_name="card",
            value_name=count_col,
        )
        .with_columns(
            # Strip exactly the status prefix. A greedy regex on the last
            # underscore would truncate any card name containing "_".
            pl.col("card").str.strip_prefix(prefix).alias("card_name"),
        )
        .drop("card")
        .select(["card_name", count_col, *index_cols])
    )


def unpivot_and_join(df):
    """Build one long frame holding a count column for every status prefix.

    ``df`` is unpivoted once per entry of the module-level
    ``status_prefixes``. The result for the first prefix keeps all index
    columns; each later result contributes only its ``<prefix>count`` column,
    inner-joined on the card name plus the game-identifying columns.

    The whole of ``df`` is processed: chunking is the caller's job, so this
    function no longer re-slices by the global ``chunk_size`` (which silently
    dropped rows for any larger input).

    Args:
        df: Wide polars frame (the notebook passes a sliced LazyFrame).

    Returns:
        The joined long frame.

    Raises:
        ValueError: If ``status_prefixes`` is empty.
    """
    index_cols = ["card_name", "draft_id", "build_index", "match_number", "game_number"]

    first_prefix, *other_prefixes = status_prefixes
    df_final = unpivot_cols(df, prefix=first_prefix)

    for status_prefix in other_prefixes:
        df_temp = unpivot_cols(df, prefix=status_prefix)
        df_final = df_final.join(
            df_temp.select(index_cols + [f"{status_prefix}count"]), on=index_cols, how="inner"
        )
    return df_final


def sort_columns(df):
    """Reorder ``df`` so the card columns come first.

    The six card columns are selected in a fixed order, followed by every
    remaining column in its existing schema order.
    """
    leading = [
        "card_name",
        "deck_count",
        "opening_count",
        "drawn_count",
        "tutored_count",
        "sideboard_count",
    ]
    trailing = [name for name in df.collect_schema().names() if name not in leading]
    return df.select(leading + trailing)

# perf_counter is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments during this long-running loop.
start_time = time.perf_counter()

chunk_size = 25000  # max efficiency ~10k
num_rows = df_lazy.select(pl.len()).collect().item()

# Reshape the file chunk by chunk and write each chunk to its own parquet
# file, named after the chunk's starting row offset.
for i in tqdm(range(0, num_rows, chunk_size)):
    df_chunk = df_lazy.slice(i, chunk_size)
    df_chunk = unpivot_and_join(df_chunk)
    df_chunk = sort_columns(df_chunk).collect()
    df_chunk.write_parquet(
        paths["interim"] / f"{i}_game_data_public.BLB.PremierDraft.parquet"
    )

    # The final chunk can be shorter than chunk_size, so cap the running
    # total at num_rows instead of reporting rows that do not exist.
    rows_done = min(i + chunk_size, num_rows)
    elapsed = time.perf_counter() - start_time
    rows_per_s = rows_done / elapsed
    print(
        f"rows: {rows_done}, time: {elapsed:.2f}, rows/s: {rows_per_s:.2f}"
    )

  0%|          | 0/28 [00:00<?, ?it/s]

rows: 25000, time: 21.19, rows/s: 1179.74
rows: 50000, time: 42.97, rows/s: 1163.47
rows: 75000, time: 64.79, rows/s: 1157.58
rows: 100000, time: 86.39, rows/s: 1157.56
rows: 125000, time: 108.09, rows/s: 1156.40
rows: 150000, time: 129.73, rows/s: 1156.27
rows: 175000, time: 151.29, rows/s: 1156.72
rows: 200000, time: 172.82, rows/s: 1157.29
rows: 225000, time: 194.43, rows/s: 1157.21
rows: 250000, time: 215.95, rows/s: 1157.65
rows: 275000, time: 237.51, rows/s: 1157.85
rows: 300000, time: 259.21, rows/s: 1157.37
rows: 325000, time: 280.78, rows/s: 1157.50
rows: 350000, time: 302.86, rows/s: 1155.65
rows: 375000, time: 324.70, rows/s: 1154.90
rows: 400000, time: 346.51, rows/s: 1154.37
rows: 425000, time: 368.50, rows/s: 1153.31
rows: 450000, time: 390.43, rows/s: 1152.56
rows: 475000, time: 412.35, rows/s: 1151.92
rows: 500000, time: 434.41, rows/s: 1151.00
rows: 525000, time: 456.61, rows/s: 1149.79
rows: 550000, time: 478.72, rows/s: 1148.90
rows: 575000, time: 500.86, rows/s: 114

In [5]:
results = []
tot_len = 0
chunk_size = 25000  # max efficiency ~10k
files_per_batch = 4  # number of chunk files merged into one output file
df_lazy = pl.scan_parquet(parquet_file)
num_rows = df_lazy.select(pl.len()).collect().item()
k = 0  # index of the next merged output file

# Row offsets used as file-name prefixes by the chunked reshape loop.
offsets = list(range(0, num_rows, chunk_size))

for n, i in enumerate(tqdm(offsets), start=1):
    df_temp = pl.read_parquet(
        paths["interim"] / f"{i}_game_data_public.BLB.PremierDraft.parquet",
        use_pyarrow=True,
        memory_map=False,
    )
    tot_len += len(df_temp)
    results.append(df_temp)
    print(len(results) - 1, tot_len, df_temp.shape)

    # Flush a full batch, and also whatever is left after the last chunk so a
    # trailing partial batch is not silently dropped.
    if len(results) == files_per_batch or n == len(offsets):
        df_final = pl.concat(results)
        print(k, df_final.shape)
        # Merged files go to the processed directory, which is where the next
        # cell reads them from. Writing them to the interim directory would
        # overwrite the chunk file for row offset 0, because merged file 0
        # has exactly the same name.
        df_final.write_parquet(
            paths["processed"] / f"{k}_game_data_public.BLB.PremierDraft.parquet"
        )
        k += 1
        results = []

  0%|          | 0/28 [00:00<?, ?it/s]

6900000 (6900000, 26)
13800000 (6900000, 26)
20700000 (6900000, 26)
27600000 (6900000, 26)
(27600000, 26)
34500000 (6900000, 26)
41400000 (6900000, 26)
48300000 (6900000, 26)
55200000 (6900000, 26)
(27600000, 26)
62100000 (6900000, 26)
69000000 (6900000, 26)
75900000 (6900000, 26)
82800000 (6900000, 26)
(27600000, 26)
89700000 (6900000, 26)
96600000 (6900000, 26)
103500000 (6900000, 26)
110400000 (6900000, 26)
(27600000, 26)
117300000 (6900000, 26)
124200000 (6900000, 26)
131100000 (6900000, 26)
138000000 (6900000, 26)
(27600000, 26)
144900000 (6900000, 26)
151800000 (6900000, 26)
158700000 (6900000, 26)
165600000 (6900000, 26)
(27600000, 26)
172500000 (6900000, 26)
179400000 (6900000, 26)
186300000 (6900000, 26)
187670616 (1370616, 26)
(22070616, 26)


In [6]:
k = 7  # presumably the number of merged files produced by the previous cell — confirm
results = []
tot_len = 0
chunk_size = 25000  # max efficiency ~10k
files_per_batch = 2  # number of merged files combined into one larger file
df_lazy = pl.scan_parquet(parquet_file)
k2 = 0  # index of the next second-level output file

for i in tqdm(range(k)):
    df_temp = pl.read_parquet(
        paths["processed"] / f"{i}_game_data_public.BLB.PremierDraft.parquet",
        use_pyarrow=True,
        memory_map=False,
    )
    tot_len += len(df_temp)
    results.append(df_temp)
    print(len(results) - 1, tot_len, df_temp.shape)

    # Flush a full batch, and also the leftover after the last file: with an
    # odd number of inputs the final file would otherwise never be written.
    if len(results) == files_per_batch or i == k - 1:
        df_final = pl.concat(results)
        # Log the index of the file being written, not the constant input count.
        print(k2, df_final.shape)
        df_final.write_parquet(
            paths["processed"] / f"{k2}_2_game_data_public.BLB.PremierDraft.parquet"
        )
        k2 += 1
        results = []

  0%|          | 0/7 [00:00<?, ?it/s]

0 27600000 (27600000, 26)
1 55200000 (27600000, 26)
7 (55200000, 26)


: 

In [9]:
# df_final is produced by pl.concat over frames read eagerly with
# pl.read_parquet, i.e. a DataFrame, which has no .collect(). Only collect
# when it really is a LazyFrame.
df = df_final.collect() if isinstance(df_final, pl.LazyFrame) else df_final

: 

In [None]:
# Write df_final to the final parquet file in the processed directory.
# NOTE(review): the merge loops above rebind df_final every time they write a
# batch, so at this point it appears to hold only the most recently
# concatenated batch — confirm it contains the full dataset before relying on
# final_file.
df_final.write_parquet(final_file)

In [None]:
# Ten cards with the largest deck_count value, one row per card.
(
    df_final.select(["card_name", "deck_count"])
    .group_by("card_name")
    # The alias belongs on the aggregation expression; a DataFrame has no
    # .alias(), and without it the column would still be named "deck_count".
    .agg(pl.max("deck_count").alias("max_deck_count"))
    .sort("max_deck_count", descending=True)
    .head(10)
)