In [25]:
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path

In [26]:
filebase = '/root/mtg-modeling/data/17lands/game_data_public.OTJ.PremierDraft'
csv_filename = Path(f'{filebase}.csv')
feather_filename = Path(f'{filebase}.feather')

In [27]:
# Read the first 5 lines of the CSV file
df = pd.read_csv(csv_filename, nrows=100)
df.to_csv(Path(f'{filebase}_100.csv'))
df.head()

Unnamed: 0,expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,...,tutored_Wrangler of the Damned,deck_Wrangler of the Damned,sideboard_Wrangler of the Damned,"opening_hand_Wylie Duke, Atiin Hero","drawn_Wylie Duke, Atiin Hero","tutored_Wylie Duke, Atiin Hero","deck_Wylie Duke, Atiin Hero","sideboard_Wylie Duke, Atiin Hero",user_n_games_bucket,user_game_win_rate_bucket
0,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 17:21:22,0,1,1,bronze,,...,0,2,0,0,0,0,0,0,50,0.58
1,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 17:37:41,0,2,1,bronze,,...,0,2,0,0,0,0,0,0,50,0.58
2,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 18:18:54,0,3,1,silver,,...,0,2,0,0,0,0,0,0,50,0.58
3,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 18:27:46,0,4,1,silver,,...,0,2,0,0,0,0,0,0,50,0.58
4,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 18:41:40,0,5,1,silver,,...,0,2,0,0,0,0,0,0,50,0.58


In [28]:
cols = df.columns.to_list()
open_cols = cols[:cols.index('won')+1] + cols[-2:]
deck_cols = [col for col in cols if 'deck' in col]
open_cols = open_cols + deck_cols
print(len(open_cols), len(cols))
open_cols[:5]

401 1925


['expansion', 'event_type', 'draft_id', 'draft_time', 'game_time']

In [29]:
read_csv = False

if read_csv:

    file_size = csv_filename.stat().st_size
    file_size_100 = Path(f'{filebase}_100.csv').stat().st_size

    # Estimate the number of chunks
    num_rows = (file_size / file_size_100) * 100 // 1
    print(num_rows)

    mem = df.memory_usage(deep=True).sum() * num_rows / 100 // 1 / 1e9
    print(mem)

    # Define the chunk size
    chunk_size = 10000  # Adjust based on your memory capacity

    # Estimate the number of chunks
    num_chunks = num_rows // chunk_size + 15
    print(num_chunks)

    # Create an empty list to store chunks
    chunks = []

    # Read the CSV file in chunks
    for i, chunk in enumerate(tqdm(pd.read_csv(f'{filebase}.csv', usecols=open_cols, chunksize=chunk_size), total=num_chunks, desc="Processing CSV")):
        chunks.append(chunk)

    df = pd.concat(chunks)
    df.reset_index(drop=True, inplace=True)
    df.to_feather(f'{filebase}.feather')

else:
    df = pd.read_feather(f'{filebase}.feather')

In [30]:
df.columns[:50]

Index(['expansion', 'event_type', 'draft_id', 'draft_time', 'game_time',
       'build_index', 'match_number', 'game_number', 'rank', 'opp_rank',
       'main_colors', 'splash_colors', 'on_play', 'num_mulligans',
       'opp_num_mulligans', 'opp_colors', 'num_turns', 'won',
       'deck_Abraded Bluffs', 'deck_Abrupt Decay', 'deck_Akul the Unrepentant',
       'deck_Aloe Alchemist', 'deck_Ambush Gigapede',
       'deck_Ancient Cornucopia', 'deck_Anguished Unmaking',
       'deck_Ankle Biter', 'deck_Annie Flash, the Veteran',
       'deck_Annie Joins Up', 'deck_Another Round', 'deck_Archangel of Tithes',
       'deck_Archive Trap', 'deck_Archmage's Charm', 'deck_Archmage's Newt',
       'deck_Arid Archway', 'deck_Armored Armadillo',
       'deck_Assimilation Aegis', 'deck_At Knifepoint',
       'deck_Aven Interrupter', 'deck_Back for More', 'deck_Badlands Revival',
       'deck_Bandit's Haul', 'deck_Baron Bertram Graywater',
       'deck_Beastbond Outcaster', 'deck_Bedevil',
       'deck

In [31]:
deck_cols = [col for col in df.columns if 'deck_' in col]
df[deck_cols].max().to_dict()

{'deck_Abraded Bluffs': 3,
 'deck_Abrupt Decay': 2,
 'deck_Akul the Unrepentant': 3,
 'deck_Aloe Alchemist': 4,
 'deck_Ambush Gigapede': 5,
 'deck_Ancient Cornucopia': 2,
 'deck_Anguished Unmaking': 2,
 'deck_Ankle Biter': 5,
 'deck_Annie Flash, the Veteran': 2,
 'deck_Annie Joins Up': 3,
 'deck_Another Round': 3,
 'deck_Archangel of Tithes': 3,
 'deck_Archive Trap': 3,
 "deck_Archmage's Charm": 3,
 "deck_Archmage's Newt": 3,
 'deck_Arid Archway': 3,
 'deck_Armored Armadillo': 4,
 'deck_Assimilation Aegis': 3,
 'deck_At Knifepoint': 4,
 'deck_Aven Interrupter': 1024,
 'deck_Back for More': 3,
 'deck_Badlands Revival': 4,
 "deck_Bandit's Haul": 4,
 'deck_Baron Bertram Graywater': 4,
 'deck_Beastbond Outcaster': 5,
 'deck_Bedevil': 3,
 'deck_Betrayal at the Vault': 3,
 'deck_Binding Negotiation': 4,
 'deck_Blacksnag Buzzard': 5,
 'deck_Blood Hustler': 5,
 'deck_Blooming Marsh': 4,
 'deck_Boneyard Desecrator': 4,
 'deck_Bonny Pall, Clearcutter': 3,
 'deck_Boom Box': 4,
 'deck_Botanical Sa

In [32]:
deck_counts = df[deck_cols].sum(axis=1)
deck_counts.value_counts().sort_index() / len(deck_counts)

40      9.377734e-01
41      5.340104e-02
42      4.830292e-03
43      1.896122e-03
44      7.173363e-04
45      5.510098e-04
46      2.925640e-04
47      1.748560e-04
48      8.870746e-05
49      3.582416e-05
50      1.057666e-04
51      2.473573e-05
52      1.705913e-05
53      2.900051e-05
54      6.823650e-06
55      7.676607e-06
57      3.411825e-06
59      2.558869e-06
60      1.535321e-05
69      4.264782e-06
72      2.558869e-06
296     3.411825e-06
297     8.529563e-07
1064    1.450026e-05
1065    8.529563e-07
Name: count, dtype: float64

In [33]:
deck_mask = deck_counts < 43
df = df[deck_mask]
df[deck_cols].max().max()

1024

In [34]:
col = 'deck_Worldwalker Helm'
mem = df.memory_usage(deep=True).sum() / 1e9
print(mem)
df[deck_cols] = df[deck_cols].astype('int8')
mem = df.memory_usage(deep=True).sum() / 1e9
print(mem)

4.267646502
1.153366599


In [35]:
df.head()

Unnamed: 0,expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,...,deck_Voidslime,deck_Voracious Varmint,deck_Vraska Joins Up,"deck_Vraska, the Silencer",deck_Wanted Griffin,deck_Worldwalker Helm,deck_Wrangler of the Damned,"deck_Wylie Duke, Atiin Hero",user_n_games_bucket,user_game_win_rate_bucket
0,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 17:21:22,0,1,1,bronze,,...,0,0,0,0,0,0,2,0,50,0.58
1,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 17:37:41,0,2,1,bronze,,...,0,0,0,0,0,0,2,0,50,0.58
2,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 18:18:54,0,3,1,silver,,...,0,0,0,0,0,0,2,0,50,0.58
3,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 18:27:46,0,4,1,silver,,...,0,0,0,0,0,0,2,0,50,0.58
4,OTJ,PremierDraft,a067b94d44374400842c97a6e684b53f,2024-04-16 16:55:03,2024-04-16 18:41:40,0,5,1,silver,,...,0,0,0,0,0,0,2,0,50,0.58


In [36]:
ids = pd.DataFrame(df['draft_id'].unique()).reset_index(drop=False)
ids.columns = ['draft_idx', 'draft_id']
mem = df.memory_usage(deep=True).sum() / 1e9
print(mem)
df = df.merge(ids, on='draft_id', how='left')
df.drop(columns=['draft_id', 'draft_time', 'game_time'], inplace=True)
mem = df.memory_usage(deep=True).sum() / 1e9
print(mem)

1.153366599
0.899973878


In [37]:
df.to_feather(f'{filebase}.feather')
mem = df.memory_usage(deep=True).sum() / 1e9
print(mem)
df = pd.read_feather(f'{filebase}.feather')
mem = df.memory_usage(deep=True).sum() / 1e9
print(mem)

0.899973878
0.899973878
