# Serialize Datasets & Dataloaders

This notebook will create datasets and dataloaders for a single 17lands set. 

These datasets can be used for model training. 

In [47]:
import os
import time
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Raise pandas' display limits so frames with many rows/columns are shown untruncated.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

## ETL 

In [48]:
# Input location of the 17lands draft export that the ETL cells below read from `csv_path`.
# NOTE(review): the original note here read "This is better." - presumably about using
# the uncompressed CSV rather than the gzip file on the next line; confirm.
# compressed_csv_path = "../data/BLB/draft_data_public.BLB.PremierDraft.csv.gz"
csv_path = "../data/BLB_new/draft_data_public.BLB.PremierDraft.csv" # 5 weeks of data - presumably too large for a single pd.read_csv call, hence the chunked reads below.


# t0 = time.time()
# raw_draft_chunk = pd.read_csv(csv_path)
# raw_draft_chunk = pd.read_csv(compressed_csv_path, compression="gzip")
# print(f"Time to read compressed: {time.time() - t0}")

In [49]:
def remove_basics(draft_chunk):
    """Return a new frame with basic lands stripped from a chunk of draft data.

    The ``pack_card_<basic>`` and ``pool_<basic>`` columns of the five basic
    lands are dropped, and every row whose ``pick`` is a basic land is removed.
    ``DataFrame.drop`` raises ``KeyError`` if any of those columns is missing.
    """
    basic_names = ["Forest", "Island", "Mountain", "Plains", "Swamp"]
    columns_to_drop = [
        prefix + name
        for prefix in ("pack_card_", "pool_")
        for name in basic_names
    ]
    without_basic_columns = draft_chunk.drop(columns=columns_to_drop)
    picked_a_basic = without_basic_columns["pick"].isin(basic_names)
    return without_basic_columns[~picked_a_basic]

In [50]:
# Build the pack / pool / pick arrays chunk by chunk so the full CSV never has
# to be held in memory at once.
pack_chunks, pool_chunks, pick_chunks = [], [], []
chunk_size = 100000
pack_prefix = "pack_card_"  # Pack columns are named "pack_card_<cardname>".
date_format = "%Y-%m-%d %H:%M:%S"

t0 = time.time()

# One-time computation on the first chunk only. The context manager closes the
# CSV handle as soon as that chunk has been read.
with pd.read_csv(csv_path, chunksize=chunk_size) as reader:
    first_chunk = next(reader, None)
if first_chunk is None:
    # Fail loudly here instead of hitting a NameError on `cardnames` further down.
    raise ValueError(f"No draft rows found in {csv_path}")

# Remove basics.
draft_chunk = remove_basics(first_chunk)

# Get date 1 week after start of draft (assumes drafts sorted by draft time).
first_date_str = draft_chunk["draft_time"].min()
first_date_obj = datetime.strptime(first_date_str, date_format)
min_date_obj = first_date_obj + timedelta(days=7)
min_date_str = min_date_obj.strftime(date_format)

# Get cardnames and ids. Sorting fixes the column order used for every chunk.
pack_cols = [col for col in draft_chunk.columns if col.startswith(pack_prefix)]
sorted_pack_cols = sorted(pack_cols)
cardnames = [col[len(pack_prefix):] for col in sorted_pack_cols]
class_to_index = {cls: idx for idx, cls in enumerate(cardnames)}

print("Completed one-time computation.")

# Second pass: stream every chunk (including the first) through the filters.
with pd.read_csv(csv_path, chunksize=chunk_size) as reader:
    for draft_chunk in reader:

        # Remove basics.
        draft_chunk = remove_basics(draft_chunk)

        # Filtering. Timestamps are compared as strings, which works because
        # min_date_str is produced in the same "%Y-%m-%d %H:%M:%S" layout.
        draft_chunk = draft_chunk[draft_chunk["draft_time"] > min_date_str] # Filter out first week.
        draft_chunk = draft_chunk[draft_chunk["rank"].isin(["diamond", "mythic"])] # Only highly ranked.

        # Extract packs.
        pack_chunk = draft_chunk[sorted_pack_cols].astype(bool)

        # Extract pools.
        pool_cols = [col for col in draft_chunk.columns if col.startswith("pool_")]
        pool_chunk = draft_chunk[sorted(pool_cols)].astype(np.uint8)

        # Extract picks as one-hot rows. All column indices are looked up first
        # (an unknown card name still raises KeyError) and then written with a
        # single fancy-indexed assignment instead of one numpy write per row.
        pick_idx = np.array(
            [class_to_index[name] for name in draft_chunk["pick"]], dtype=np.intp
        )
        pick_chunk = np.zeros((len(draft_chunk), len(cardnames)), dtype=bool)
        pick_chunk[np.arange(len(draft_chunk)), pick_idx] = True

        # Append data (consider multiple files for memory efficiency).
        pick_chunks.append(pick_chunk)
        pack_chunks.append(pack_chunk)
        pool_chunks.append(pool_chunk)

        print("processed a chunk, t=", round(time.time()-t0), "s")

print("Done")

# Concatenate all chunks: picks as one numpy array, packs/pools as DataFrames.
picks = np.vstack(pick_chunks)
packs = pd.concat(pack_chunks, ignore_index=True)
pools = pd.concat(pool_chunks, ignore_index=True)

print(packs.shape, pools.shape, picks.shape)

Completed one-time computation.
processed a chunk, t= 3 s
processed a chunk, t= 5 s
processed a chunk, t= 6 s
processed a chunk, t= 8 s
processed a chunk, t= 9 s
processed a chunk, t= 11 s
processed a chunk, t= 12 s
processed a chunk, t= 14 s
processed a chunk, t= 16 s
processed a chunk, t= 17 s
processed a chunk, t= 19 s
processed a chunk, t= 20 s
processed a chunk, t= 22 s
processed a chunk, t= 23 s
processed a chunk, t= 25 s
processed a chunk, t= 28 s
processed a chunk, t= 30 s
processed a chunk, t= 31 s
processed a chunk, t= 33 s
processed a chunk, t= 35 s
processed a chunk, t= 36 s
processed a chunk, t= 38 s
processed a chunk, t= 39 s
processed a chunk, t= 41 s
processed a chunk, t= 43 s
processed a chunk, t= 44 s
processed a chunk, t= 46 s
processed a chunk, t= 48 s
processed a chunk, t= 49 s
processed a chunk, t= 51 s
processed a chunk, t= 53 s
processed a chunk, t= 54 s
processed a chunk, t= 56 s
processed a chunk, t= 58 s
processed a chunk, t= 60 s
processed a chunk, t= 62 s
p

((978335, 271), (978335, 271), (978335, 271))

In [52]:
# Possible filters on strength: 
    # Rank
    # Event match wins
    # User_n_games, winrate

In [53]:
# # Apply filters. 

# # Platinum+ players. 
# # draft_chunk = raw_draft_chunk[raw_draft_chunk["rank"].isin(["diamond", "mythic", "platinum"])]
# draft_chunk = raw_draft_chunk # All ranks

# # Filter out first week. 
# min_date_str = draft_chunk["draft_time"].min()
# date_obj = datetime.strptime(min_date_str, "%Y-%m-%d %H:%M:%S")
# new_date = date_obj + timedelta(days=7)
# new_date_str = new_date.strftime("%Y-%m-%d %H:%M:%S")
# draft_chunk = draft_chunk[draft_chunk["draft_time"] > new_date_str]

In [54]:
# from sklearn.model_selection import train_test_split

## Pick Data Loader

In [55]:
import torch
from torch.utils.data import DataLoader, Dataset

# Here's the MTG dataset class. 
class PickDataset(Dataset):
    """Map-style dataset yielding (pool, pack, pick) tensors for one draft pick.

    Args:
        pools: numpy array indexed by sample along its first axis.
        packs: numpy array indexed by sample along its first axis.
        pick_vectors: numpy array indexed by sample along its first axis.
        cardnames: card names stored alongside the data; not used by
            ``__getitem__``.

    Raises:
        ValueError: if ``pools``, ``packs`` and ``pick_vectors`` do not all
            contain the same number of samples.
    """

    def __init__(self, pools, packs, pick_vectors, cardnames):
        # __len__ reports len(packs) while __getitem__ indexes all three arrays,
        # so a length mismatch would otherwise surface late as an IndexError or
        # silently ignore trailing samples.
        if not (len(pools) == len(packs) == len(pick_vectors)):
            raise ValueError(
                "pools, packs and pick_vectors must have the same length; got "
                f"{len(pools)}, {len(packs)} and {len(pick_vectors)}"
            )
        # Input is numpy arrays
        self.pools = pools
        self.packs = packs
        self.pick_vectors = pick_vectors
        self.cardnames = cardnames

    def __len__(self):
        """Return the number of samples."""
        return len(self.packs)

    def __getitem__(self, index):
        """Return the ``(pool, pack, pick_vector)`` tensors stored at ``index``."""
        return (
            torch.from_numpy(self.pools[index]),
            torch.from_numpy(self.packs[index]),
            torch.from_numpy(self.pick_vectors[index]),
        )

In [56]:
# Train/validation split: the leading 80% of rows (in the order they were
# concatenated) go to training, the remainder to validation.
train_fraction = 0.8
tsize = round(len(pools) * train_fraction)

train_rows = slice(None, tsize)
val_rows = slice(tsize, None)

pick_train_dataset = PickDataset(
    pools.iloc[train_rows].to_numpy(),
    packs.iloc[train_rows].to_numpy(),
    picks[train_rows],
    cardnames,
)
pick_train_dataloader = DataLoader(dataset=pick_train_dataset, batch_size=1000, shuffle=False)

pick_val_dataset = PickDataset(
    pools.iloc[val_rows].to_numpy(),
    packs.iloc[val_rows].to_numpy(),
    picks[val_rows],
    cardnames,
)
pick_val_dataloader = DataLoader(dataset=pick_val_dataset, batch_size=1, shuffle=False)

In [57]:
# This is how we retrieve cardnames
# pick_val_dataset.cardnames
# pick_val_dataloader.dataset.cardnames

In [58]:
# Time one full pass over the validation dataset, unpacking each sample into
# its (pool, pack, pick-vector) tensors.
t0 = time.time()
for po, pa, pv in pick_val_dataset:
    pass
elapsed = round(time.time() - t0, 2)
print(f"Time to load pick data: {elapsed}s")

Time to load pick data: 0.53s


In [59]:
# Serialize updated datasets.
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable base directory so the notebook runs on other machines.
dataset_folder = "/Users/danielbrooks/Desktop/Code/statistical-drafting/datasets/BLB/"
# Create the folder (and any missing parents) from Python instead of `!mkdir -p`,
# so the cell does not depend on IPython shell magics or a POSIX shell.
os.makedirs(dataset_folder, exist_ok=True)

# Runs slowly... hmmm.
# Each call writes the whole PickDataset object (arrays plus cardnames) to a
# single .pth file inside dataset_folder.
torch.save(pick_train_dataset, dataset_folder + "pick_train_dataset_test.pth")
torch.save(pick_val_dataset, dataset_folder + "pick_val_dataset_test.pth")

In [60]:
# This serialization strategy is only recommended for files that I completely control. 
# ds = torch.load(dataset_folder + "pick_val_dataset.pth")

In [61]:
# Report the on-disk size of every file in the dataset folder.
# May run into memory issues on larger datasets. 
!du -sh $dataset_folder/*

1.7G	/Users/danielbrooks/Desktop/Code/statistical-drafting/datasets/BLB//pick_train_dataset_3days.pth
608M	/Users/danielbrooks/Desktop/Code/statistical-drafting/datasets/BLB//pick_train_dataset_test.pth
931M	/Users/danielbrooks/Desktop/Code/statistical-drafting/datasets/BLB//pick_val_dataset.pth
435M	/Users/danielbrooks/Desktop/Code/statistical-drafting/datasets/BLB//pick_val_dataset_3days.pth
160M	/Users/danielbrooks/Desktop/Code/statistical-drafting/datasets/BLB//pick_val_dataset_test.pth
 20G	/Users/danielbrooks/Desktop/Code/statistical-drafting/datasets/BLB//siamese_train_dataset.pth
4.2G	/Users/danielbrooks/Desktop/Code/statistical-drafting/datasets/BLB//siamese_train_dataset_3days.pth


In [None]:
# Neural net architecture: https://github.com/khakhalin/MTG/blob/master/bots/draftsimtools/nnet_architecture.py
# MLP training script: https://github.com/khakhalin/MTG/blob/master/bots/train_nnet.py