# Summary

The code to wrangle the data.  Deal with NAs, join data frames, etc.

# Introduction

This data set is very large, in both memory and in size.  Each row is a game, with identifiers for the draft event and the player.  The columns have information on the given draft game, with the majority are for card counts per state, such as the number of a given card in the deck or the number of a given card in the opening hand.

Ideally I want to melt the data into a tall layout, with rows for each unique combination of card and game and columns for the card count states, such as deck, hand, etc.  This will allow me to easily filter and group the data for analysis.  Unfortunately, after attempting to wrangle in this format, I found that the data set is too large to handle in memory.  I could play with making spares arrays, but I think that would be too much work for this data set.

For this notebook, I will keep the data in the wide layout and aggregate down into useful tables, such as per-game and per-player summaries.

## Terms

The following terms are used in the data set:
- __game:__ a single game of MTG
- __match:__ a series of games between two players.  Typically best of 1 or best of 3.
- __draft:__ an initial draft event, followed by a series of matches between a group of players.
- __build:__ a instance of a deck in a draft.  Can be adjusted between games.

# Initial Setup

In [40]:
# Setup Notebook
import os

if os.path.basename(os.getcwd()) != "mtg-modeling":
    get_ipython().run_line_magic("run", '-i "../../scripts/notebook_header.py"')  # type: ignore

In [41]:
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
import polars as pl

Define Paths

In [42]:
SET_CODE = "MH3"

paths = {
    "raw": Path("data/raw/17lands/game_data/PremierDraft"),
    "interim": Path("data/interim/17lands/game_data/premier_draft"),
    "processed": Path("data/processed/17lands/game_data/premier_draft"),
}

csv_file = paths["raw"] / f"game_data_public.{SET_CODE}.PremierDraft.csv"
parquet_file = paths["interim"] / f"game_data_public.{SET_CODE}.PremierDraft.parquet"

summary_file = paths["processed"] / f"{SET_CODE}_Game_PD_Summary.parquet"
game_file = paths["processed"] / f"{SET_CODE}_Game_PD_Games.parquet"
draft_file = paths["processed"] / f"{SET_CODE}_Game_PD_Drafts.parquet"
card_file = paths["processed"] / f"{SET_CODE}_Game_PD_Cards.parquet"

os.makedirs(paths["interim"], exist_ok=True)
os.makedirs(paths["processed"], exist_ok=True)

Convert the csv file to parquet if needed.

In [43]:
if not os.path.exists(parquet_file):
    # df = pd.read_csv(csv_file)  # pandas works faster than polars, unless >2GB then it breaks
    # df.to_parquet(parquet_file)
    df = pl.scan_csv(csv_file)
    df.sink_parquet(parquet_file)
    # print(f"Converted {csv_file} to parquet")

Scan file into a lazy frame.

In [44]:
df_lazy = pl.scan_parquet(parquet_file)

Parse column names, such as the card name and the card state.

In [45]:
cols = df_lazy.collect_schema().names()
state_prefixes = ["tutored_", "deck_", "opening_hand_", "drawn_", "sideboard_"]
card_cols = [col for col in cols if any([col.startswith(s) for s in state_prefixes])]
index_cols = [col for col in cols if col not in card_cols]

card_names = [col.split("_")[-1] for col in card_cols if col.startswith("deck_")]
land_cards = ["Plains", "Island", "Swamp", "Mountain", "Forest"]
non_land_cards = [card for card in card_names if card not in land_cards]
non_land_card_cols = [
    col for col in card_cols if all([land not in col for land in land_cards])
]
land_card_cols = [col for col in card_cols if any([land in col for land in land_cards])]
len(card_cols), len(non_land_card_cols), len(land_card_cols)

(1630, 1605, 25)

Check for extra underscores in card names.

In [46]:
counts = [col.count("_") for col in card_cols if col.startswith("deck_")]
assert max(counts) == 1, "There are some cards with underscores in their names"

Show first few rows of the data.

In [47]:
df_lazy.filter(pl.col("game_number") > 1).select(index_cols).head(30).collect()

expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,main_colors,splash_colors,on_play,num_mulligans,opp_num_mulligans,opp_colors,num_turns,won,user_n_games_bucket,user_game_win_rate_bucket
str,str,str,str,str,i64,i64,i64,str,str,str,str,bool,i64,i64,str,i64,bool,i64,f64
"""MH3""","""PremierDraft""","""1eb51dd54ef54385bfaeb5e38935a3…","""2024-06-24 12:18:07""","""2024-06-26 12:56:58""",0,6,2,"""mythic""","""None""","""WB""","""U""",False,0,0,"""WUB""",12,False,100,0.54
"""MH3""","""PremierDraft""","""974497b3df524cf383afe967f69398…","""2024-06-12 09:09:24""","""2024-06-13 17:33:07""",0,9,2,"""gold""","""None""","""WG""","""B""",True,1,1,"""WG""",7,False,100,0.52
"""MH3""","""PremierDraft""","""2b455807b08e4c528df0f6f6c874c1…","""2024-06-16 10:52:05""","""2024-06-17 10:32:42""",1,9,2,"""platinum""","""None""","""WG""","""B""",False,0,0,"""WR""",8,True,100,0.56
"""MH3""","""PremierDraft""","""5b0cce69d76042a196d6663dfc565a…","""2024-07-05 18:44:34""","""2024-07-07 09:28:02""",2,3,2,"""silver""","""None""","""WB""",,True,0,0,"""WURG""",12,False,10,0.38


Convert timestamps from strings to datetimes

In [48]:
df_lazy = df_lazy.with_columns(
    pl.col("draft_time").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S"),
    pl.col("game_time").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S"),
)

# Games Table

A table of each game, excludes card data.

In [49]:
df_games = df_lazy.select(index_cols)
df_games.collect().write_parquet(game_file)

df_games.head(2).collect()

expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,main_colors,splash_colors,on_play,num_mulligans,opp_num_mulligans,opp_colors,num_turns,won,user_n_games_bucket,user_game_win_rate_bucket
str,str,str,datetime[μs],datetime[μs],i64,i64,i64,str,str,str,str,bool,i64,i64,str,i64,bool,i64,f64
"""MH3""","""PremierDraft""","""5bf33eab9e7745a68d0a3cf59c2ca5…",2024-06-11 15:32:57,2024-06-11 16:06:01,0,1,1,"""gold""","""None""","""WUR""",,False,0,0,"""RG""",10,True,10,0.62
"""MH3""","""PremierDraft""","""5bf33eab9e7745a68d0a3cf59c2ca5…",2024-06-11 15:32:57,2024-06-11 16:50:17,0,2,1,"""gold""","""None""","""WUR""",,False,0,0,"""UB""",19,True,10,0.62


# Draft Table

A table of the aggregated draft data, excluding card data.

In [50]:
df_draft = (
    df_games.group_by("draft_id")
    .agg(
        pl.col("build_index").max().alias("n_builds"),
        pl.col("match_number").max().alias("n_matches"),
        pl.col("game_number").count().alias("n_games"),
        pl.col("won").sum().alias("n_wins"),
        pl.col("won").mean().alias("win_rate"),
        pl.col("rank").last(),
        pl.col("opp_rank").last(),
        pl.col("main_colors").last(),
        pl.col("opp_colors").last(),
        pl.col("splash_colors").last(),
        pl.col("on_play").sum().alias("n_starts"),
        pl.col("num_mulligans").sum().alias("n_muls"),
        pl.col("opp_num_mulligans").sum().alias("n_opp_muls"),
        pl.col("num_turns").mean().alias("mean_turns"),
        pl.col("draft_time").mean().alias("draft_time"),
        pl.col("game_time").first().alias("first_game_time"),
        pl.col("game_time").last().alias("last_game_time"),
    )
    .with_columns(
        (pl.col("n_games") - pl.col("n_wins")).alias("n_losses"),
    )
)

df_draft.collect().write_parquet(draft_file)

df_draft.head().collect()

draft_id,n_builds,n_matches,n_games,n_wins,win_rate,rank,opp_rank,main_colors,opp_colors,splash_colors,n_starts,n_muls,n_opp_muls,mean_turns,draft_time,first_game_time,last_game_time,n_losses
str,i64,i64,u32,u32,f64,str,str,str,str,str,u32,i64,i64,f64,datetime[μs],datetime[μs],datetime[μs],u32
"""79a08bea4574490ca34883687276b3…",0,8,8,5,0.625,"""platinum""","""None""","""URG""","""UB""",,4,1,0,7.5,2024-07-09 14:33:41,2024-07-09 14:58:21,2024-07-09 17:54:55,3
"""94de267b709d45ab80cbbd56b51f6a…",1,6,6,3,0.5,"""platinum""","""None""","""UR""","""WUR""","""B""",3,0,2,8.333333,2024-06-15 22:22:41,2024-06-15 22:45:35,2024-06-15 23:11:52,3
"""b59ff369db0949fba55d25f33a9a61…",1,7,7,4,0.571429,"""platinum""","""None""","""WB""","""WUR""",,4,0,0,7.857143,2024-06-14 06:17:55,2024-06-14 06:49:53,2024-06-14 22:03:44,3
"""f075826d79a6418ea86b7d55de101a…",0,5,5,3,0.6,"""diamond""","""None""","""UB""","""BG""","""WR""",4,2,0,10.0,2024-06-13 00:26:12,2024-06-13 00:54:09,2024-06-13 11:26:32,2
"""bb3a2f5028344240b90064fade9506…",1,7,7,4,0.571429,"""platinum""","""None""","""WR""","""WUR""",,3,3,1,7.285714,2024-06-22 18:14:24,2024-06-22 18:22:11,2024-06-22 20:19:47,3


# Card Data

In [51]:
id_cols = ["draft_id", "match_number", "game_number", "build_index"]
df_card = df_lazy.select(*id_cols, *land_card_cols, *non_land_card_cols)

df_card.collect().write_parquet(card_file)

df_card.head().collect()

draft_id,match_number,game_number,build_index,opening_hand_Forest,drawn_Forest,tutored_Forest,deck_Forest,sideboard_Forest,opening_hand_Island,drawn_Island,tutored_Island,deck_Island,sideboard_Island,opening_hand_Mountain,drawn_Mountain,tutored_Mountain,deck_Mountain,sideboard_Mountain,opening_hand_Plains,drawn_Plains,tutored_Plains,deck_Plains,sideboard_Plains,opening_hand_Swamp,drawn_Swamp,tutored_Swamp,deck_Swamp,sideboard_Swamp,opening_hand_Abstruse Appropriation,drawn_Abstruse Appropriation,tutored_Abstruse Appropriation,deck_Abstruse Appropriation,sideboard_Abstruse Appropriation,opening_hand_Accursed Marauder,drawn_Accursed Marauder,tutored_Accursed Marauder,…,deck_Witch Enchanter,sideboard_Witch Enchanter,opening_hand_Wither and Bloom,drawn_Wither and Bloom,tutored_Wither and Bloom,deck_Wither and Bloom,sideboard_Wither and Bloom,opening_hand_Wooded Foothills,drawn_Wooded Foothills,tutored_Wooded Foothills,deck_Wooded Foothills,sideboard_Wooded Foothills,opening_hand_Worn Powerstone,drawn_Worn Powerstone,tutored_Worn Powerstone,deck_Worn Powerstone,sideboard_Worn Powerstone,opening_hand_Wrath of the Skies,drawn_Wrath of the Skies,tutored_Wrath of the Skies,deck_Wrath of the Skies,sideboard_Wrath of the Skies,opening_hand_Writhing Chrysalis,drawn_Writhing Chrysalis,tutored_Writhing Chrysalis,deck_Writhing Chrysalis,sideboard_Writhing Chrysalis,opening_hand_Wumpus Aberration,drawn_Wumpus Aberration,tutored_Wumpus Aberration,deck_Wumpus Aberration,sideboard_Wumpus Aberration,opening_hand_Wurmcoil Larva,drawn_Wurmcoil Larva,tutored_Wurmcoil Larva,deck_Wurmcoil Larva,sideboard_Wurmcoil Larva
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""5bf33eab9e7745a68d0a3cf59c2ca5…",1,1,0,0,0,0,0,0,1,0,0,3,0,0,1,0,4,0,1,2,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""5bf33eab9e7745a68d0a3cf59c2ca5…",2,1,0,0,0,0,0,0,0,1,0,3,0,1,2,0,4,0,1,3,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""5bf33eab9e7745a68d0a3cf59c2ca5…",3,1,0,0,0,0,0,0,0,1,0,3,0,1,2,0,4,0,2,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""5bf33eab9e7745a68d0a3cf59c2ca5…",4,1,0,0,0,0,0,0,0,1,0,3,0,1,2,0,4,0,1,2,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""5bf33eab9e7745a68d0a3cf59c2ca5…",5,1,0,0,0,0,0,0,1,2,0,3,0,1,2,0,4,0,1,1,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Summary Table

In [52]:
df_select = df_lazy.select(
    pl.min("game_time").alias("first_game"),
    pl.max("game_time").alias("last_game"),
    pl.max("game_number").alias("max_games"),
    pl.mean("won").alias("win_rate"),
    pl.mean("on_play").alias("start_rate"),
    pl.count("draft_id").alias("n_games"),
    pl.sum("num_mulligans").alias("n_mul"),
    pl.sum("opp_num_mulligans").alias("n_opp_mul"),
    pl.mean("num_turns").alias("mean_turns"),
    pl.sum("num_turns").alias("total_turns"),
    pl.max("num_turns").alias("max_turns"),
    pl.min("num_turns").alias("min_turns"),
)

df_draft_ct = df_draft.select(
    pl.count("draft_id").alias("n_drafts"),
    pl.sum("n_matches").alias("n_matches"),
    pl.mean("n_matches").alias("mean_matches"),
)

df_max_card = df_lazy.select(non_land_card_cols).max().collect()
df_sum_land = df_lazy.select(land_card_cols).collect().sum_horizontal()

df_summary = pl.concat([df_select, df_draft_ct], how="horizontal")

df_summary = df_summary.with_columns(
    (pl.col("n_games") / pl.col("n_drafts")).alias("n_games_per_draft"),
    pl.lit(len(card_names)).alias("n_cards"),
    df_max_card.max_horizontal().alias("max_card"),
    pl.lit(df_sum_land.mean()).alias("mean_land"),
    pl.lit(df_sum_land.max()).alias("max_land"),
    pl.lit(df_sum_land.min()).alias("min_land"),
)

df_summary = df_summary.collect()
df_summary.write_parquet(summary_file)

df_summary

first_game,last_game,max_games,win_rate,start_rate,n_games,n_mul,n_opp_mul,mean_turns,total_turns,max_turns,min_turns,n_drafts,n_matches,mean_matches,n_games_per_draft,n_cards,max_card,mean_land,max_land,min_land
datetime[μs],datetime[μs],i64,f64,f64,u32,i64,i64,f64,i64,i64,i64,u32,i64,f64,f64,i32,i64,f64,i32,i32
2024-06-11 15:25:09,2024-07-15 23:59:59,2,0.550959,0.49909,767772,92041,96863,8.384565,6437434,32,1,131813,770724,5.847102,5.824706,326,10,18.029247,37,4
