## Build a table of the relevant fields from the raw Scryfall data. This table is used downstream to create the actual input for RoBERTa fine-tuning.

In [2]:
import os, sys, re
import json
import numpy as np
import torch 
import polars as pl
from polars import selectors as cs
import pyarrow.feather as feather
import pandas as pd

In [3]:
from torch.utils.data import Dataset, DataLoader

In [4]:
from sqlalchemy import (MetaData, Table, Column, Integer, create_engine)

In [5]:
import json

In [6]:
# Display whether PyTorch reports a usable CUDA device in this environment.
torch.cuda.is_available()

False

In [7]:
from transformers import AutoTokenizer, AutoModel

In [8]:
# Scryfall bulk export to load. The "oracle-cards" dump is the one in use; the
# older "default-cards" path is kept here for reference.
# scryfall_path = r"C:\Users\breuh\OneDrive\proggy\python\MTG\roberta\scryfall_json\default-cards-20230731210608.json"
scryfall_path = os.path.normpath(
    r"C:\Users\breuh\OneDrive\proggy\python\MTG\roberta\scryfall_json\oracle-cards-20250623212748.json"
)

# Companion path "scryfall_temp.json" in the same directory as the export.
scryfall_temp = os.path.join(os.path.dirname(scryfall_path), "scryfall_temp.json")

# Folder that is scanned for the 17Lands CSV files.
lands_folder = os.path.normpath(
    r"C:\Users\breuh\OneDrive\proggy\python\MTG\roberta\17Lands"
)

# Every directory entry whose name contains ".csv", except those whose name
# contains "combo".
lands_csv_fnames = [
    fname
    for fname in os.listdir(lands_folder)
    if '.csv' in fname and "combo" not in fname
]

In [9]:
# Display the first matching file name as a quick sanity check of the listing.
lands_csv_fnames[0]

'game_data_public.BRO.Sealed.csv'

In [10]:
import unicodedata
# remove weird text accents (ex: Ã -> A)
def remove_accents(input_str):
    """Return *input_str* with combining accent marks removed.

    The text is decomposed with Unicode NFKD normalisation and every
    combining character is then dropped, so an accented letter is reduced
    to its base letter.

    Args:
        input_str: The text to clean.

    Returns:
        The cleaned string, or the sentinel ``"<unk>"`` when *input_str* is
        not a string (for example ``None``).
    """
    try:
        nfkd_form = unicodedata.normalize('NFKD', input_str)
    except TypeError:
        # normalize() rejects non-str input with TypeError. Catching only that
        # keeps the "<unk>" fallback without hiding unrelated failures or
        # swallowing KeyboardInterrupt the way a bare `except:` would.
        return "<unk>"
    return ''.join(c for c in nfkd_form if not unicodedata.combining(c))

In [11]:
# Parse the Scryfall JSON export (read as UTF-8) into Python objects.
with open(scryfall_path, mode='r', encoding="utf-8") as scryfall_file:
    scryfall_txt = json.load(scryfall_file)

In [12]:
# Formats consulted by the legality filter: a card is kept when it is legal in
# at least one of them.
scryfall_legalities = [
    'standard',
    'pioneer',
    'modern',
    'explorer',
    'historic',
    'vintage',
    'legacy',
]

In [13]:
# Card fields that end up in the exported table.
scryfall_feature_columns = [
    'mana_cost',
    'type_line',
    'power',
    'toughness',
    'oracle_text',
    'colors',
    'name',
    'cmc',
    'color_identity',
    'keywords',
    'set',
]

# Fields needed only to filter rows; they are not part of the feature list.
filter_columns = [
    'lang',
    'legalities',
]

In [14]:
# Replace each card dict with a copy whose 'name' and 'oracle_text' values have
# been passed through remove_accents(); every other field is copied unchanged.
for position, raw_card in enumerate(scryfall_txt):
    scryfall_txt[position] = {
        field: remove_accents(value) if field in ('name', 'oracle_text') else value
        for field, value in raw_card.items()
    }


In [15]:
# Keep only the feature and filter fields of every card.
# The set of wanted keys is built once up front; concatenating the two lists
# inside the comprehension would rebuild (and linearly search) them for every
# key of every card.
keep_columns = set(scryfall_feature_columns + filter_columns)
scryfall_txt_subset = [
    {k: v for k, v in card.items() if k in keep_columns}
    for card in scryfall_txt
]

In [16]:
# Build a polars DataFrame from the trimmed card dicts.
# NOTE(review): from_dicts infers the schema from a leading sample of rows by
# default (see `infer_schema_length`) — confirm that fields missing from the
# early cards are still picked up with the expected dtype.
df_utf8 = pl.from_dicts(scryfall_txt_subset)

In [17]:
# One boolean expression per format of interest: true where the card's
# `legalities` struct marks that format as 'legal'.
legalities_col = pl.col('legalities')
legality_filter = [
    legalities_col.struct.field(fmt) == 'legal'
    for fmt in scryfall_legalities
]

# Keep rows whose language is 'en' and that are legal in at least one format.
is_english = pl.col('lang') == 'en'
df_utf8 = df_utf8.filter(pl.any_horizontal(legality_filter) & is_english)

In [18]:
# Display (rows, columns) after the legality and language filter.
df_utf8.shape

(29849, 13)

In [74]:
# Display the feature column list that the next cell selects.
scryfall_feature_columns

['mana_cost',
 'type_line',
 'power',
 'toughness',
 'oracle_text',
 'colors',
 'name',
 'cmc',
 'color_identity',
 'keywords',
 'set']

In [19]:
# Reduce the frame to the feature columns, drop cards whose name starts with
# "A-" (presumably Arena rebalanced variants — TODO confirm), then order by name.
has_a_prefix = pl.col("name").str.starts_with("A-")
df_utf8 = df_utf8.select(scryfall_feature_columns)
df_utf8 = df_utf8.filter(~has_a_prefix)
df_utf8 = df_utf8.sort("name")

In [20]:
# Collapse duplicate card names to one row each, printing the shape before and
# after so any dropped rows are visible.
# keep='first' with maintain_order=True makes the surviving row deterministic
# and preserves the name ordering from the preceding sort; the defaults
# (keep='any', maintain_order=False) guarantee neither.
print(df_utf8.shape)
df_utf8 = df_utf8.unique(subset='name', keep='first', maintain_order=True)
print(df_utf8.shape)

(29632, 11)
(29632, 11)


In [21]:
# Turn every list-of-strings column into a single string whose items are
# separated by ", ".
list_of_str_columns = cs.by_dtype(pl.List(str))
df_utf8 = df_utf8.with_columns(list_of_str_columns.list.join(", "))

In [22]:
# Display the column names after the list columns were converted to strings.
df_utf8.columns

['mana_cost',
 'type_line',
 'power',
 'toughness',
 'oracle_text',
 'colors',
 'name',
 'cmc',
 'color_identity',
 'keywords',
 'set']

In [23]:
# Show the names of any rows where remove_accents() fell back to its sentinel.
# remove_accents() returns "<unk>" in lowercase and this comparison is
# case-sensitive, so the sentinel must be spelled the same way here; comparing
# against "<UNK>" could never match.
df_utf8.filter(pl.any_horizontal([pl.col(c) == "<unk>" for c in ['name', 'oracle_text']])).select(pl.col("name")).sort(by='name')

name
str


In [24]:
# Drop rows whose name or oracle_text is the remove_accents() fallback sentinel.
# The sentinel is the lowercase "<unk>"; the previous "<UNK>" spelling never
# matched, so this filter removed nothing on that basis.
# NOTE(review): comparing a null cell yields null, and filter() keeps only rows
# whose predicate is true, so rows with a null name or oracle_text appear to be
# dropped here as well — confirm that this is intended.
df_for_export = df_utf8.filter(~pl.any_horizontal([pl.col(c) == "<unk>" for c in ['name', 'oracle_text']]))
df_for_export.head()

mana_cost,type_line,power,toughness,oracle_text,colors,name,cmc,color_identity,keywords,set
str,str,str,str,str,str,str,f64,str,str,str
"""{1}{W}""","""Artifact — Equipment""",,,"""Equipped creature gets +2/+2. …","""W""","""+2 Mace""",2.0,"""W""","""Equip""","""afr"""
"""{2}{G}""","""Enchantment — Saga""",,,"""(As this Saga enters and after…","""G""","""A Golden Opportunity""",3.0,"""G""","""Conjure""","""ywoe"""
"""{1}{W}""","""Instant""",,,"""Tap up to two target creatures…","""W""","""A Good Day to Pie""",2.0,"""W""","""""","""unf"""
"""{4}{G}""","""Enchantment""",,,"""When this enchantment enters, …","""G""","""A Killer Among Us""",5.0,"""G""","""""","""mkm"""
"""{1}{U}""","""Instant""",,,"""Casualty 1 (As you cast this s…","""U""","""A Little Chat""",2.0,"""U""","""Casualty""","""snc"""


In [25]:
from pathlib import Path

In [26]:
# SQLite database file that receives the exported tables.
export_path = r"C:\Users\breuh\OneDrive\proggy\python\MTG\roberta\data_setups\training_database.db"
export_path = os.path.normpath(export_path)

In [27]:
# SQLAlchemy URL for the SQLite file, and the engine bound to it.
engine_path = f"sqlite:///{export_path}"
engine = create_engine(engine_path)

In [96]:
# Write the pruned card table to SQLite and index it by card name.
with engine.connect() as conn:
    # Replace any existing `scryfall_pruned` table with the current frame
    # (converted to pandas for to_sql); the DataFrame index is not written.
    df_for_export.to_pandas().to_sql('scryfall_pruned', conn, if_exists='replace', index=False)

    # Index on `name` for exact-match lookups.
    # NOTE(review): with SQLite's default settings LIKE is case-insensitive and
    # its prefix optimisation needs a NOCASE index (or PRAGMA
    # case_sensitive_like=ON), so this index is probably not used for
    # LIKE 'foo%' queries — confirm before relying on it for prefix search.
    conn.exec_driver_sql("""
        CREATE INDEX IF NOT EXISTS ix_scryfall_name
        ON scryfall_pruned(name);
    """)

    
    # Gather stats & let SQLite tune plans
    conn.exec_driver_sql("ANALYZE;")
    conn.exec_driver_sql("PRAGMA optimize;")
    # Commit the statements issued on this connection before it is closed.
    conn.commit()

In [109]:
# Read the exported table back into a polars frame to check the round trip.
with engine.connect() as db_conn:
    df = pl.read_database("SELECT * FROM scryfall_pruned", db_conn)

In [111]:
# Display the column names of the table as read back from SQLite.
df.columns

['mana_cost',
 'type_line',
 'power',
 'toughness',
 'oracle_text',
 'colors',
 'name',
 'cmc',
 'color_identity',
 'keywords',
 'set']

In [104]:
# Load the definitions of the tables that exist in the database behind
# `engine` into a fresh MetaData object.
metadata = MetaData()
metadata.reflect(bind=engine)

In [105]:
# Display the names of the reflected tables.
metadata.tables.keys()

dict_keys(['BRO_Sealed', 'BRO_TradSealed', 'DFT_Sealed', 'DFT_TradSealed', 'KHM_PremierDraft', 'KTK_Sealed', 'KTK_TradSealed', 'LCI_Sealed', 'LCI_TradSealed', 'LTR_PremierDraft', 'LTR_Sealed', 'LTR_TradSealed', 'MKM_Sealed', 'MKM_TradSealed', 'MOM_Sealed', 'MOM_TradSealed', 'NEO_Sealed', 'ONE_Sealed', 'ONE_TradSealed', 'PIO_Sealed', 'SIR_Sealed', 'SIR_TradSealed', 'SNC_TradSealed', 'WOE_PremierDraft', 'WOE_Sealed', 'WOE_TradSealed', 'scryfall_pruned'])

# Rulebook for RoBERTa Input

In [66]:
# Text file with the pruned Magic comprehensive rules.
magic_ruleset_path = os.path.normpath(
    r"C:\Users\breuh\OneDrive\proggy\python\MTG\roberta\mtg_comprehensive_rules\pruned_MagicCompRules20230616.txt"
)

In [67]:
# Upper bound on the number of space-separated words in one chunk.
chunk_size = 475


def _chunk_rules(rule_lines, max_words):
    """Build one overlapping text chunk per rule line.

    The chunk for line *i* starts with that line and keeps appending the
    following lines, joined by single spaces, for as long as the
    space-separated word count stays at or below *max_words*. A starting
    line that already exceeds the limit is kept whole.

    Args:
        rule_lines: Cleaned lines of the rules text, in file order.
        max_words: Largest word count a chunk may reach by appending lines.

    Returns:
        A list with exactly one chunk string per entry of *rule_lines*.
    """
    # len((a + " " + b).split(" ")) == len(a.split(" ")) + len(b.split(" ")),
    # so per-line counts can be summed instead of re-splitting the growing
    # chunk on every step.
    word_counts = [len(text.split(" ")) for text in rule_lines]
    total_lines = len(rule_lines)

    chunks = []
    for start, first_line in enumerate(rule_lines):
        parts = [first_line]
        words = word_counts[start]
        for following in range(start + 1, total_lines):
            words += word_counts[following]
            if words > max_words:
                break
            parts.append(rule_lines[following])
        chunks.append(" ".join(parts))
    return chunks


# The code strips a byte-order mark, so the file is expected to be UTF-8 with a
# BOM. Decode it explicitly: without `encoding` the platform default is used,
# which would neither yield "\ufeff" nor decode non-ASCII text correctly on a
# non-UTF-8 locale. "utf-8-sig" also reads BOM-less UTF-8.
with open(magic_ruleset_path, 'r', encoding="utf-8-sig") as f:
    # Skip bare newline lines; trim the newline, any BOM and outer whitespace.
    lines = [raw.strip("\n").strip("\ufeff").strip() for raw in f if raw != '\n']

ruleset_chunked = _chunk_rules(lines, chunk_size)

In [69]:
# Single-column frame named "text" holding one rules chunk per row.
ruleset_df = pl.DataFrame({"text": ruleset_chunked})

In [72]:
# Store the chunks in the "ruleset" table of the same SQLite database,
# replacing the table if it already exists.
# NOTE(review): newer polars releases appear to name this parameter
# `if_table_exists` rather than `if_exists` — confirm against the installed
# version before upgrading.
ruleset_df.write_database("ruleset", engine_path, engine="sqlalchemy", if_exists='replace')