# Summary

This notebook contains the code to wrangle the data: dealing with NAs, joining data frames, etc.

# Introduction

This data set is very large, both in memory footprint and in file size.  Each row is a game, with identifiers for the draft event and the player.  The columns hold information on the given draft game; the majority are card counts per state, such as the number of copies of a given card in the deck or in the opening hand.

Ideally I want to melt the data into a tall layout, with rows for each unique combination of card and game and columns for the card count states, such as deck, hand, etc.  This would allow me to easily filter and group the data for analysis.  Unfortunately, after attempting to wrangle the data in this format, I found that the data set is too large to handle in memory.  I could experiment with sparse arrays, but I think that would be too much work for this data set.

For this notebook, I will keep the data in the wide layout and aggregate down into useful tables, such as per-game and per-player summaries.

## Terms

The following terms are used in the data set:
- __game:__ a single game of MTG
- __match:__ a series of games between two players.  Typically best of 1 or best of 3.
- __draft:__ an initial draft event, followed by a series of matches between a group of players.
- __build:__ an instance of a deck in a draft.  Can be adjusted between games.

# Initial Setup

In [57]:
# | output: false
# Bootstrap cell: when the current working directory is not named
# "mtg-modeling", execute the shared header script through IPython's %run
# magic. The `-i` flag runs the script inside this notebook's namespace, so
# the `path` and `args` variables defined here are visible to it.
# NOTE(review): presumably the header script switches to the project root so
# the relative "data/..." paths used later resolve — confirm against the script.
path = "../../../../scripts/notebook_header.py"
import os

if os.path.basename(os.getcwd()) != "mtg-modeling":
    # The script path is passed twice: once as the file to run and once as
    # the value of the script's own `--path` argument.
    args = f"--path {path}"
    get_ipython().run_line_magic("run", f"-i {path} {args}")  # type: ignore  # type: ignore

In [58]:
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
import polars as pl

Define Paths

In [59]:
# Run configuration: the set to process, and whether cached outputs that
# already exist should be rebuilt.
SET_CODE = "MKM"
OVERWRITE = False

# One directory per pipeline stage, relative to the working directory.
paths = dict(
    raw=Path("data/raw/17lands/game_data/PremierDraft"),
    interim=Path("data/interim/17lands/game_data/premier_draft"),
    processed=Path("data/processed/17lands/game_data/premier_draft"),
)

# Input CSV and its parquet copy.
csv_file = paths["raw"].joinpath(f"game_data_public.{SET_CODE}.PremierDraft.csv")
parquet_file = paths["interim"].joinpath(
    f"game_data_public.{SET_CODE}.PremierDraft.parquet"
)

# Output tables written by the sections below.
processed_dir = paths["processed"]
summary_file = processed_dir / f"{SET_CODE}_Game_PD_Summary.parquet"
game_file = processed_dir / f"{SET_CODE}_Game_PD_Games.parquet"
draft_file = processed_dir / f"{SET_CODE}_Game_PD_Drafts.parquet"
card_file = processed_dir / f"{SET_CODE}_Game_PD_Cards.parquet"

# Only the directories this notebook writes to are created; the raw
# directory is expected to exist already.
for stage in ("interim", "processed"):
    os.makedirs(paths[stage], exist_ok=True)

Convert the csv file to parquet if needed.

In [60]:
# One-time conversion of the raw CSV into parquet. It is skipped when the
# parquet file is already present, unless OVERWRITE forces a rebuild.
# Author's note: pandas read_csv/to_parquet is faster than polars here, but
# it breaks once the file exceeds 2GB, so the polars scan/sink pair is used.
needs_conversion = OVERWRITE or not os.path.exists(parquet_file)
if needs_conversion:
    pl.scan_csv(csv_file).sink_parquet(parquet_file)

Scan file into a lazy frame and set the schema.

In [61]:
def standardize_schema(df: "pl.LazyFrame") -> "pl.LazyFrame":
    """Return ``df`` with the ``opp_rank`` column cast to a string dtype.

    Args:
        df: The lazy frame whose schema should be normalised.

    Returns:
        ``df`` with ``opp_rank`` cast to ``pl.Utf8`` when that column exists;
        otherwise ``df`` itself, unchanged.
    """
    # Inspect the frame that was passed in. The previous version read the
    # notebook-level `df_lazy` here, which fails when that global is not
    # defined yet and checks the wrong schema when it is a different frame.
    cols = df.collect_schema().names()
    if "opp_rank" not in cols:
        return df
    return df.with_columns(pl.col("opp_rank").cast(pl.Utf8))

In [62]:
# Open the interim parquet file as a lazy frame and normalise its schema in
# a single step.
df_lazy = standardize_schema(pl.scan_parquet(parquet_file))

Parse column names, such as the card name and the card state.

In [63]:
# Split the columns into per-card count columns (recognised by their state
# prefix) and the remaining per-game index columns.
cols = df_lazy.collect_schema().names()
state_prefixes = ["tutored_", "deck_", "opening_hand_", "drawn_", "sideboard_"]
prefix_tuple = tuple(state_prefixes)
card_cols = [name for name in cols if name.startswith(prefix_tuple)]
card_col_lookup = set(card_cols)
index_cols = [name for name in cols if name not in card_col_lookup]

# The card name is whatever follows the last underscore of a "deck_" column.
card_names = [
    name.rsplit("_", 1)[-1] for name in card_cols if name.startswith("deck_")
]
land_cards = ["Plains", "Island", "Swamp", "Mountain", "Forest"]
non_land_cards = [name for name in card_names if name not in land_cards]

# Column membership is decided by substring match, so a card whose name merely
# contains a basic land name (e.g. "Mistveil Plains") is grouped with the land
# columns, even though `non_land_cards` above compares names exactly.
land_card_cols = []
non_land_card_cols = []
for name in card_cols:
    is_land_col = any(land in name for land in land_cards)
    bucket = land_card_cols if is_land_col else non_land_card_cols
    bucket.append(name)

len(card_cols), len(non_land_card_cols), len(land_card_cols)

(1630, 1600, 30)

Check for extra underscores in card names.

In [64]:
# Sanity check on the "deck_" columns: the only underscore allowed is the one
# that ends the prefix, because card names are taken from the text after the
# last underscore.
deck_cols = [name for name in card_cols if name.startswith("deck_")]
counts = [name.count("_") for name in deck_cols]
most_underscores = max(counts)
assert most_underscores == 1, "There are some cards with underscores in their names"

Show first few rows of the data.

In [65]:
# Preview the index columns for up to 30 rows whose game_number exceeds 1.
later_games = df_lazy.filter(pl.col("game_number") > 1)
later_games.select(index_cols).head(30).collect()

expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,main_colors,splash_colors,on_play,num_mulligans,opp_num_mulligans,opp_colors,num_turns,won,user_n_games_bucket,user_game_win_rate_bucket
str,str,str,str,str,i64,i64,i64,str,str,str,str,bool,i64,i64,str,i64,bool,i64,f64


Convert timestamps from strings to datetimes

In [66]:
# Parse both timestamp columns into Datetime values using one shared format;
# the columns keep their original names.
timestamp_format = "%Y-%m-%d %H:%M:%S"
df_lazy = df_lazy.with_columns(
    [
        pl.col(name).str.strptime(pl.Datetime, timestamp_format)
        for name in ("draft_time", "game_time")
    ]
)

# Games Table

A table of each game, excludes card data.

Note that the `opp_rank` column does not have a consistent type.

In [67]:
# Per-game table: the index columns only, with every card-count column dropped.
df_games = df_lazy.select(*index_cols)

# Materialise the table once to write it out, then again for a two-row
# preview; the collected frame is deliberately not kept in a variable.
df_games.collect().write_parquet(game_file)

df_games.head(n=2).collect()

expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,main_colors,splash_colors,on_play,num_mulligans,opp_num_mulligans,opp_colors,num_turns,won,user_n_games_bucket,user_game_win_rate_bucket
str,str,str,datetime[μs],datetime[μs],i64,i64,i64,str,str,str,str,bool,i64,i64,str,i64,bool,i64,f64
"""MKM""","""PremierDraft""","""8e1e92694dcd44e380720f2ec3d971…",2024-02-06 17:48:51,2024-02-06 18:41:03,0,1,1,,"""None""","""WB""","""G""",True,0,0,"""WB""",9,True,50,0.62
"""MKM""","""PremierDraft""","""8e1e92694dcd44e380720f2ec3d971…",2024-02-06 17:48:51,2024-02-06 18:49:03,0,2,1,"""silver""","""None""","""WB""","""G""",True,0,0,"""RG""",9,True,50,0.62


# Draft Table

A table of the aggregated draft data, excluding card data.

In [68]:
# One row per draft_id, aggregated from the per-game table. The expressions
# are listed in the order of the output columns.
# NOTE(review): the `first`/`last` aggregations depend on the row order of the
# games within each draft — presumably chronological; confirm against the data.
# NOTE(review): `n_builds` is the largest build_index, and the sample rows show
# build_index starting at 0, so it looks like "highest index" rather than a
# count — confirm before relying on it.
last_value_cols = ("rank", "opp_rank", "main_colors", "opp_colors", "splash_colors")
draft_aggs = [
    pl.first("expansion"),
    pl.max("build_index").alias("n_builds"),
    pl.max("match_number").alias("n_matches"),
    pl.count("game_number").alias("n_games"),
    pl.sum("won").alias("n_wins"),
    pl.mean("won").alias("win_rate"),
    *[pl.col(name).last() for name in last_value_cols],
    pl.sum("on_play").alias("n_starts"),
    pl.sum("num_mulligans").alias("n_muls"),
    pl.sum("opp_num_mulligans").alias("n_opp_muls"),
    pl.mean("num_turns").alias("mean_turns"),
    pl.mean("draft_time").alias("draft_time"),
    pl.first("game_time").alias("first_game_time"),
    pl.col("game_time").last().alias("last_game_time"),
]

# Losses are derived after aggregation as games played minus games won.
n_losses = (pl.col("n_games") - pl.col("n_wins")).alias("n_losses")

df_draft = df_games.group_by("draft_id").agg(draft_aggs).with_columns(n_losses)

df_draft.collect().write_parquet(draft_file)

df_draft.head().collect()

draft_id,expansion,n_builds,n_matches,n_games,n_wins,win_rate,rank,opp_rank,main_colors,opp_colors,splash_colors,n_starts,n_muls,n_opp_muls,mean_turns,draft_time,first_game_time,last_game_time,n_losses
str,str,i64,i64,u32,u32,f64,str,str,str,str,str,u32,i64,i64,f64,datetime[μs],datetime[μs],datetime[μs],u32
"""dd1e9b19a8bf4b6d87d58798796063…","""MKM""",0,7,7,4,0.571429,"""platinum""","""None""","""WU""","""BG""",,5,0,4,8.428571,2024-03-07 22:47:01,2024-03-07 23:11:35,2024-03-08 21:04:31,3
"""38e6baea32d944c59235e03931ff55…","""MKM""",0,5,5,2,0.4,"""diamond""","""None""","""UG""","""WUB""","""B""",2,0,0,9.6,2024-03-11 11:27:09,2024-03-11 11:49:45,2024-03-11 12:27:10,3
"""162e396bf1a448f7b1ef9ab97e7615…","""MKM""",0,3,3,0,0.0,"""platinum""","""None""","""UBRG""","""B""","""W""",1,0,0,5.333333,2024-02-06 20:36:34,2024-02-06 20:58:30,2024-02-06 21:05:37,3
"""1e15f17a131649db8fa64c2991c656…","""MKM""",0,5,5,2,0.4,"""platinum""","""None""","""UG""","""WR""","""WB""",1,0,1,9.0,2024-02-15 20:12:26,2024-02-15 20:37:50,2024-02-16 15:11:45,3
"""9f3f793e47264c7db21c6a43979c2c…","""MKM""",0,7,7,4,0.571429,"""bronze""","""None""","""BR""","""WR""",,4,1,1,7.571429,2024-02-10 18:42:08,2024-02-10 19:02:46,2024-02-10 20:02:53,3


# Card Data

In [69]:
# Per-game card table: the identifying columns, followed by the land columns,
# followed by all remaining card columns.
id_cols = ["expansion", "draft_id", "match_number", "game_number", "build_index"]
card_table_cols = id_cols + land_card_cols + non_land_card_cols
df_card = df_lazy.select(card_table_cols)

# Write the whole table to disk, then show the default head() preview.
df_card.collect().write_parquet(card_file)

df_card.head().collect()

expansion,draft_id,match_number,game_number,build_index,opening_hand_Forest,drawn_Forest,tutored_Forest,deck_Forest,sideboard_Forest,opening_hand_Island,drawn_Island,tutored_Island,deck_Island,sideboard_Island,opening_hand_Mistveil Plains,drawn_Mistveil Plains,tutored_Mistveil Plains,deck_Mistveil Plains,sideboard_Mistveil Plains,opening_hand_Mountain,drawn_Mountain,tutored_Mountain,deck_Mountain,sideboard_Mountain,opening_hand_Plains,drawn_Plains,tutored_Plains,deck_Plains,sideboard_Plains,opening_hand_Swamp,drawn_Swamp,tutored_Swamp,deck_Swamp,sideboard_Swamp,opening_hand_A Killer Among Us,drawn_A Killer Among Us,…,deck_Vitu-Ghazi Inspector,sideboard_Vitu-Ghazi Inspector,opening_hand_Warleader's Call,drawn_Warleader's Call,tutored_Warleader's Call,deck_Warleader's Call,sideboard_Warleader's Call,opening_hand_Wispdrinker Vampire,drawn_Wispdrinker Vampire,tutored_Wispdrinker Vampire,deck_Wispdrinker Vampire,sideboard_Wispdrinker Vampire,opening_hand_Wojek Investigator,drawn_Wojek Investigator,tutored_Wojek Investigator,deck_Wojek Investigator,sideboard_Wojek Investigator,opening_hand_Worldsoul's Rage,drawn_Worldsoul's Rage,tutored_Worldsoul's Rage,deck_Worldsoul's Rage,sideboard_Worldsoul's Rage,opening_hand_Worldspine Wurm,drawn_Worldspine Wurm,tutored_Worldspine Wurm,deck_Worldspine Wurm,sideboard_Worldspine Wurm,opening_hand_Wrench,drawn_Wrench,tutored_Wrench,deck_Wrench,sideboard_Wrench,"opening_hand_Yarus, Roar of the Old Gods","drawn_Yarus, Roar of the Old Gods","tutored_Yarus, Roar of the Old Gods","deck_Yarus, Roar of the Old Gods","sideboard_Yarus, Roar of the Old Gods"
str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""MKM""","""8e1e92694dcd44e380720f2ec3d971…",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,7,0,2,1,0,8,0,0,0,…,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""MKM""","""8e1e92694dcd44e380720f2ec3d971…",2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,7,0,0,1,0,8,0,0,0,…,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""MKM""","""8e1e92694dcd44e380720f2ec3d971…",3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,7,0,1,2,0,8,0,0,0,…,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""MKM""","""8e1e92694dcd44e380720f2ec3d971…",4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,7,0,0,2,0,8,0,0,0,…,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""MKM""","""8e1e92694dcd44e380720f2ec3d971…",5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,7,0,1,1,0,8,0,0,0,…,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Summary Table

In [70]:
# Whole-data-set aggregates computed over every game row; each expression
# reduces a column to a single value, and the result stays lazy.
df_select = df_lazy.select(
    pl.first("expansion").alias("expansion"),
    pl.min("game_time").alias("first_game"),
    pl.max("game_time").alias("last_game"),
    pl.max("game_number").alias("max_games"),
    pl.mean("won").alias("win_rate"),
    pl.mean("on_play").alias("start_rate"),
    pl.count("draft_id").alias("n_games"),
    pl.sum("num_mulligans").alias("n_mul"),
    pl.sum("opp_num_mulligans").alias("n_opp_mul"),
    pl.mean("num_turns").alias("mean_turns"),
    pl.sum("num_turns").alias("total_turns"),
    pl.max("num_turns").alias("max_turns"),
    pl.min("num_turns").alias("min_turns"),
)


# Aggregates over the per-draft table: number of drafts and the total / mean
# of each draft's `n_matches`.
df_draft_ct = df_draft.select(
    pl.count("draft_id").alias("n_drafts"),
    pl.sum("n_matches").alias("n_matches"),
    pl.mean("n_matches").alias("mean_matches"),
)


# Column-wise maximum of every non-land card column, collected eagerly.
df_max_card = df_lazy.select(non_land_card_cols).max().collect()


# Per-row sum across all land columns, collected eagerly.
# NOTE(review): `land_card_cols` contains the land columns for every state
# prefix (deck, opening hand, drawn, tutored, sideboard), so this total is not
# the number of lands in the deck alone — confirm that is the intended metric.
df_sum_land = df_lazy.select(land_card_cols).collect().sum_horizontal()


# Place the game-level and draft-level aggregates side by side.
df_summary = pl.concat([df_select, df_draft_ct], how="horizontal")


# Derived columns: games per draft, the number of card names, the largest
# value among the per-column card maxima, and mean / max / min of the per-row
# land totals (embedded as literals because they were computed eagerly above).
df_summary = df_summary.with_columns(
    (pl.col("n_games") / pl.col("n_drafts")).alias("n_games_per_draft"),
    pl.lit(len(card_names)).alias("n_cards"),
    df_max_card.max_horizontal().alias("max_card"),
    pl.lit(df_sum_land.mean()).alias("mean_land"),
    pl.lit(df_sum_land.max()).alias("max_land"),
    pl.lit(df_sum_land.min()).alias("min_land"),
)


# Materialise the summary frame.
df_summary = df_summary.collect()


# Persist the summary table.
df_summary.write_parquet(summary_file)


# Display the summary as the cell output.
df_summary

expansion,first_game,last_game,max_games,win_rate,start_rate,n_games,n_mul,n_opp_mul,mean_turns,total_turns,max_turns,min_turns,n_drafts,n_matches,mean_matches,n_games_per_draft,n_cards,max_card,mean_land,max_land,min_land
str,datetime[μs],datetime[μs],i64,f64,f64,u32,i64,i64,f64,i64,i64,i64,u32,i64,f64,f64,i32,i64,f64,i32,i32
"""MKM""",2024-02-06 16:24:44,2024-03-18 23:59:48,1,0.549798,0.49982,964377,114014,121101,9.278472,8947945,32,1,165251,965887,5.844969,5.835832,326,8,22.525862,38,9
