# Summary

The code to wrangle the data.  Deal with NAs, join data frames, etc.

# Introduction

This data set is very large, in both memory and in size.  Each row is a game, with identifiers for the draft event and the player.  The columns have information on the given draft game, with the majority are for card counts per state, such as the number of a given card in the deck or the number of a given card in the opening hand.

Ideally I want to melt the data into a tall layout, with rows for each unique combination of card and game and columns for the card count states, such as deck, hand, etc.  This will allow me to easily filter and group the data for analysis.  Unfortunately, after attempting to wrangle in this format, I found that the data set is too large to handle in memory.  I could play with making spares arrays, but I think that would be too much work for this data set.

For this notebook, I will keep the data in the wide layout and aggregate down into useful tables, such as per-game and per-player summaries.

## Terms

The following terms are used in the data set:
- __game:__ a single game of MTG
- __match:__ a series of games between two players.  Typically best of 1 or best of 3.
- __draft:__ an initial draft event, followed by a series of matches between a group of players.
- __build:__ a instance of a deck in a draft.  Can be adjusted between games.

# Initial Setup

In [1]:
# Setup Notebook
import os

if os.path.basename(os.getcwd()) != "mtg-modeling":
    get_ipython().run_line_magic("run", '-i "../../scripts/notebook_header.py"')  # type: ignore

Changed working directory to: d:\mtg-modeling


In [2]:
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
import polars as pl

Define Paths

In [3]:
SET_CODE = "BLB"

paths = {
    "raw": Path("data/raw/17lands/game_data/PremierDraft"),
    "interim": Path("data/interim/17lands/game_data/premier_draft"),
    "processed": Path("data/processed/17lands/game_data/premier_draft"),
}

csv_file = paths["raw"] / f"game_data_public.{SET_CODE}.PremierDraft.csv"
parquet_file = paths["interim"] / f"game_data_public.{SET_CODE}.PremierDraft.parquet"

summary_file = paths["processed"] / f"{SET_CODE}_Game_PD_Summary.parquet"
game_file = paths["processed"] / f"{SET_CODE}_Game_PD_Games.parquet"
draft_file = paths["processed"] / f"{SET_CODE}_Game_PD_Drafts.parquet"
card_file = paths["processed"] / f"{SET_CODE}_Game_PD_Cards.parquet"


os.makedirs(paths["interim"], exist_ok=True)
os.makedirs(paths["processed"], exist_ok=True)

Convert the csv file to parquet if needed.

In [4]:
if not os.path.exists(parquet_file):
    df = pd.read_csv(csv_file)
    df.to_parquet(parquet_file)
    print(f"Converted {csv_file} to parquet")

Scan file into a lazy frame.

In [5]:
df_lazy = pl.scan_parquet(parquet_file)

Parse column names, such as the card name and the card state.

In [6]:
cols = df_lazy.collect_schema().names()
state_prefixes = ["tutored_", "deck_", "opening_hand_", "drawn_", "sideboard_"]
card_cols = [col for col in cols if any([col.startswith(s) for s in state_prefixes])]
index_cols = [col for col in cols if col not in card_cols]

card_names = [col.split("_")[-1] for col in card_cols if col.startswith("deck_")]
land_cards = ["Plains", "Island", "Swamp", "Mountain", "Forest"]
non_land_cards = [card for card in card_names if card not in land_cards]
non_land_card_cols = [
    col for col in card_cols if all([land not in col for land in land_cards])
]
land_card_cols = [col for col in card_cols if any([land in col for land in land_cards])]
len(card_cols), len(non_land_card_cols), len(land_card_cols)

(1380, 1355, 25)

Check for extra underscores in card names.

In [7]:
counts = [col.count("_") for col in card_cols if col.startswith("deck_")]
assert max(counts) == 1, "There are some cards with underscores in their names"

Show first few rows of the data.

In [8]:
df_lazy.filter(pl.col("game_number") > 1).select(index_cols).head(30).collect()

expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,main_colors,splash_colors,on_play,num_mulligans,opp_num_mulligans,opp_colors,num_turns,won,user_n_games_bucket,user_game_win_rate_bucket
str,str,str,str,str,i64,i64,i64,str,str,str,str,bool,i64,i64,str,i64,bool,i64,f64
"""BLB""","""PremierDraft""","""19e2e5ee33d84b01888f4215aeff9c…","""2024-08-01 22:14:29""","""2024-08-02 15:37:05""",0,7,2,"""platinum""","""None""","""BR""",,True,0,0,"""UG""",13,True,100,0.56
"""BLB""","""PremierDraft""","""dd424f45c9fb44f2929aa0e6d900ed…","""2024-08-06 21:03:27""","""2024-08-07 23:03:23""",0,6,2,"""platinum""","""None""","""UG""",,False,0,1,"""UB""",7,False,100,0.56


Convert timestamps from strings to datetimes

In [9]:
df_lazy = df_lazy.with_columns(
    pl.col("draft_time").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S"),
    pl.col("game_time").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S"),
)

# Games Table

A table of each game, excludes card data.

In [10]:
df_games = df_lazy.select(index_cols)
df_games.collect().write_parquet(game_file)

df_games.head(2).collect()

expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,main_colors,splash_colors,on_play,num_mulligans,opp_num_mulligans,opp_colors,num_turns,won,user_n_games_bucket,user_game_win_rate_bucket
str,str,str,datetime[μs],datetime[μs],i64,i64,i64,str,str,str,str,bool,i64,i64,str,i64,bool,i64,f64
"""BLB""","""PremierDraft""","""deaa4cdcd3e84d8e8b5a0ea34a0f9d…",2024-07-30 22:30:39,2024-07-30 22:58:45,0,1,1,"""gold""","""None""","""BG""","""WR""",True,0,1,"""RG""",5,True,10,0.54
"""BLB""","""PremierDraft""","""deaa4cdcd3e84d8e8b5a0ea34a0f9d…",2024-07-30 22:30:39,2024-07-30 23:13:12,0,2,1,"""gold""","""None""","""BG""","""WR""",True,0,0,"""WG""",10,True,10,0.54


# Draft Table

A table of the aggregated draft data, excluding card data.

In [11]:
df_draft = (
    df_games.group_by("draft_id")
    .agg(
        pl.col("build_index").max().alias("n_builds"),
        pl.col("match_number").max().alias("n_matches"),
        pl.col("game_number").count().alias("n_games"),
        pl.col("won").sum().alias("n_wins"),
        pl.col("won").mean().alias("win_rate"),
        pl.col("rank").last(),
        pl.col("opp_rank").last(),
        pl.col("main_colors").last(),
        pl.col("opp_colors").last(),
        pl.col("splash_colors").last(),
        pl.col("on_play").sum().alias("n_starts"),
        pl.col("num_mulligans").sum().alias("n_muls"),
        pl.col("opp_num_mulligans").sum().alias("n_opp_muls"),
        pl.col("num_turns").mean().alias("mean_turns"),
        pl.col("draft_time").mean().alias("draft_time"),
        pl.col("game_time").first().alias("first_game_time"),
        pl.col("game_time").last().alias("last_game_time"),
    )
    .with_columns(
        (pl.col("n_games") - pl.col("n_wins")).alias("n_losses"),
    )
)

df_draft.collect().write_parquet(draft_file)

df_draft.head().collect()

draft_id,n_builds,n_matches,n_games,n_wins,win_rate,rank,opp_rank,main_colors,opp_colors,splash_colors,n_starts,n_muls,n_opp_muls,mean_turns,draft_time,first_game_time,last_game_time,n_losses
str,i64,i64,u32,u32,f64,str,str,str,str,str,u32,i64,i64,f64,datetime[μs],datetime[μs],datetime[μs],u32
"""25f0adf414204fd69cc93a361eca42…",2,4,4,1,0.25,"""gold""","""None""","""BR""","""WB""","""UG""",3,0,1,10.75,2024-08-05 17:25:06,2024-08-05 17:52:05,2024-08-05 18:36:03,3
"""8a90660af4104999bc39382bbc6e3a…",0,9,9,6,0.666667,"""mythic""","""None""","""BG""","""BG""",,4,2,0,9.222222,2024-08-14 20:59:54,2024-08-14 21:30:38,2024-08-14 22:55:21,3
"""1bbbf35893c14dc7bdc1feb1f2c330…",0,5,5,2,0.4,"""platinum""","""None""","""WR""","""UB""","""G""",5,1,1,10.4,2024-08-14 02:43:40,2024-08-14 03:02:41,2024-08-14 03:45:41,3
"""43d91ca15020407db217609ad000d7…",0,6,6,3,0.5,"""diamond""","""None""","""RG""","""WU""",,5,2,0,9.166667,2024-08-07 11:38:29,2024-08-07 12:07:52,2024-08-07 13:02:34,3
"""0e838dcb611e45aa8cb08ed5595223…",0,5,5,4,0.8,"""platinum""","""None""","""BG""","""UR""",,2,0,0,9.2,2024-08-04 20:09:27,2024-08-04 20:39:22,2024-08-04 21:13:01,1


# Card Data

In [12]:
id_cols = ["draft_id", "match_number", "game_number", "build_index"]
df_card = df_lazy.select(*id_cols, *land_card_cols, *non_land_card_cols)

df_card.collect().write_parquet(card_file)

df_card.head().collect()

draft_id,match_number,game_number,build_index,opening_hand_Forest,drawn_Forest,tutored_Forest,deck_Forest,sideboard_Forest,opening_hand_Island,drawn_Island,tutored_Island,deck_Island,sideboard_Island,opening_hand_Mountain,drawn_Mountain,tutored_Mountain,deck_Mountain,sideboard_Mountain,opening_hand_Plains,drawn_Plains,tutored_Plains,deck_Plains,sideboard_Plains,opening_hand_Swamp,drawn_Swamp,tutored_Swamp,deck_Swamp,sideboard_Swamp,opening_hand_Agate Assault,drawn_Agate Assault,tutored_Agate Assault,deck_Agate Assault,sideboard_Agate Assault,opening_hand_Agate-Blade Assassin,drawn_Agate-Blade Assassin,tutored_Agate-Blade Assassin,…,deck_Whiskerquill Scribe,sideboard_Whiskerquill Scribe,opening_hand_Whiskervale Forerunner,drawn_Whiskervale Forerunner,tutored_Whiskervale Forerunner,deck_Whiskervale Forerunner,sideboard_Whiskervale Forerunner,opening_hand_Wick's Patrol,drawn_Wick's Patrol,tutored_Wick's Patrol,deck_Wick's Patrol,sideboard_Wick's Patrol,"opening_hand_Wick, the Whorled Mind","drawn_Wick, the Whorled Mind","tutored_Wick, the Whorled Mind","deck_Wick, the Whorled Mind","sideboard_Wick, the Whorled Mind",opening_hand_Wildfire Howl,drawn_Wildfire Howl,tutored_Wildfire Howl,deck_Wildfire Howl,sideboard_Wildfire Howl,opening_hand_Wishing Well,drawn_Wishing Well,tutored_Wishing Well,deck_Wishing Well,sideboard_Wishing Well,"opening_hand_Ygra, Eater of All","drawn_Ygra, Eater of All","tutored_Ygra, Eater of All","deck_Ygra, Eater of All","sideboard_Ygra, Eater of All","opening_hand_Zoraline, Cosmos Caller","drawn_Zoraline, Cosmos Caller","tutored_Zoraline, Cosmos Caller","deck_Zoraline, Cosmos Caller","sideboard_Zoraline, Cosmos Caller"
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""deaa4cdcd3e84d8e8b5a0ea34a0f9d…",1,1,0,2,1,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,8,0,0,0,0,0,0,0,0,0,…,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
"""deaa4cdcd3e84d8e8b5a0ea34a0f9d…",2,1,0,3,1,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,8,0,0,0,0,0,0,0,0,0,…,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
"""deaa4cdcd3e84d8e8b5a0ea34a0f9d…",3,1,0,2,3,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,8,0,0,0,0,0,0,0,0,0,…,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
"""deaa4cdcd3e84d8e8b5a0ea34a0f9d…",4,1,0,1,3,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,8,0,0,0,0,0,0,0,0,0,…,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
"""deaa4cdcd3e84d8e8b5a0ea34a0f9d…",5,1,0,3,2,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,8,0,0,0,0,0,0,0,0,0,…,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


# Summary Table

In [13]:
df_select = df_lazy.select(
    pl.min("game_time").alias("first_game"),
    pl.max("game_time").alias("last_game"),
    pl.max("game_number").alias("max_games"),
    pl.mean("won").alias("win_rate"),
    pl.mean("on_play").alias("start_rate"),
    pl.count("draft_id").alias("n_games"),
    pl.sum("num_mulligans").alias("n_mul"),
    pl.sum("opp_num_mulligans").alias("n_opp_mul"),
    pl.mean("num_turns").alias("mean_turns"),
    pl.sum("num_turns").alias("total_turns"),
    pl.max("num_turns").alias("max_turns"),
    pl.min("num_turns").alias("min_turns"),
)

df_draft_ct = df_draft.select(
    pl.count("draft_id").alias("n_drafts"),
    pl.sum("n_matches").alias("n_matches"),
    pl.mean("n_matches").alias("mean_matches"),
)

df_max_card = df_lazy.select(non_land_card_cols).max().collect()
df_sum_land = df_lazy.select(land_card_cols).collect().sum_horizontal()

df_summary = pl.concat([df_select, df_draft_ct], how="horizontal")

df_summary = df_summary.with_columns(
    (pl.col("n_games") / pl.col("n_drafts")).alias("n_games_per_draft"),
    pl.lit(len(card_names)).alias("n_cards"),
    df_max_card.max_horizontal().alias("max_card"),
    pl.lit(df_sum_land.mean()).alias("mean_land"),
    pl.lit(df_sum_land.max()).alias("max_land"),
    pl.lit(df_sum_land.min()).alias("min_land"),
)

df_summary = df_summary.collect()
df_summary.write_parquet(summary_file)

df_summary

first_game,last_game,max_games,win_rate,start_rate,n_games,n_mul,n_opp_mul,mean_turns,max_turns,min_turns,n_drafts,n_matches,mean_matches,n_games_per_draft,n_cards,max_card,mean_land,max_land,min_land
datetime[μs],datetime[μs],i64,f64,f64,u32,i64,i64,f64,i64,i64,u32,i64,f64,f64,i32,i64,f64,i32,i32
2024-07-30 15:41:04,2024-08-18 23:59:52,2,0.547108,0.500285,679966,92172,96719,8.88899,33,1,116804,680693,5.827651,5.821427,276,7,22.164273,37,11
