# Load the set from JSON

In [1]:
import json
import shutil

import numpy as np
from pyspark.sql import SparkSession

In [2]:
# Relative path of the raw set JSON that the loading cell opens.
json_file = "dataset/THB.json"

In [3]:
# Read the raw JSON document and keep only its 'cards' entry.
# The encoding is pinned to UTF-8 so non-ASCII characters are decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(json_file, 'r', encoding='utf-8') as fp:
    raw_data = json.load(fp)

cards = raw_data['cards']

# Number of card entries as loaded.
n = len(cards)

# Dump the cards to a JSON file

In [4]:
# Persist just the card list to its own file so it can be read back separately.
with open('dataset/THB_cards.json', mode='w') as out_file:
    json.dump(cards, out_file)

# Clean the data 

In [5]:
import pyspark.sql.functions as fn
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType

In [6]:
# Phrase -> token substitutions applied to card text.
# Insertion order matters: each keyword's long form (with reminder text) is listed
# before its short form so the long form is substituted first.
rules = {
    # '{CARDNAME}' stands for the card's own name; the text filter has to use the same
    # placeholder form for this rule to match.
    "When {CARDNAME} enters the battlefield": "ETB_EFFECT",

    "Flash (You may cast this spell any time you could cast an instant.)": "FLASH",
    "Flash": "FLASH",

    "Reach (This creature can block creatures with flying.)": "REACH",
    "Reach": "REACH",

    "Flying (This creature can't be blocked except by creatures with flying or reach.)": "FLYING",
    "Flying": "FLYING",

    "Haste (This creature can attack and {T} as soon as it comes under your control.)": "HASTE",
    "Haste": "HASTE",

    "Trample (This creature can deal excess combat damage to the player or planeswalker it's attacking.)": "TRAMPLE",
    "Trample": "TRAMPLE",

    "Vigilance (Attacking doesn't cause this creature to tap.)": "VIGILANCE",
    # Previously " VIGILANCE" (stray leading space), which injected an extra blank and
    # disagreed with the long form above.
    "Vigilance": "VIGILANCE",

    "Double strike (This creature deals both first-strike and regular combat damage.)": "DOUBLE_STRIKE",
    "Double strike": "DOUBLE_STRIKE",

    "Deathtouch (Any amount of damage this deals to a creature is enough to destroy it.)": "DEATHTOUCH",
    "Deathtouch": "DEATHTOUCH",

    "Protection from green (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything green.)": "PROTECTION_FROM_GREEN",
    "Protection from red (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything red.)": "PROTECTION_FROM_RED",
    "Protection from black (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything black.)": "PROTECTION_FROM_BLACK",
    "Protection from blue (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything blue.)": "PROTECTION_FROM_BLUE",
    "Protection from white (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything white.)": "PROTECTION_FROM_WHITE"
}

In [7]:
@udf
def udf_filter_text(name, text):
    """Rewrite a card's text using placeholder tokens.

    Occurrences of the card's own name are replaced by ``CARDNAME`` and every phrase
    listed in the module-level ``rules`` mapping is replaced by its token.

    Args:
        name: The card name; skipped when it is empty or not a string.
        text: The card text to rewrite.

    Returns:
        The rewritten text, or ``None`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None

    new_text = text
    # Guard the name: str.replace raises on None and, given an empty pattern, inserts
    # the replacement between every character.
    if isinstance(name, str) and name:
        new_text = new_text.replace(name, 'CARDNAME')

    # A single pass over the rules is enough: str.replace substitutes every occurrence,
    # so the former outer loop (one iteration per character of the text) only repeated
    # the same substitutions.
    for rule, replace in rules.items():
        new_text = new_text.replace(rule, replace)
        # The name was substituted with the bare 'CARDNAME' token above, while a rule
        # may spell the placeholder as '{CARDNAME}'; also try the bare spelling so
        # such a rule can actually match.
        bare_rule = rule.replace('{CARDNAME}', 'CARDNAME')
        if bare_rule != rule:
            new_text = new_text.replace(bare_rule, replace)

    return new_text

In [8]:
# `spark` is not created anywhere in this notebook; obtain the session explicitly so the
# cell also runs on a kernel that does not predefine it. getOrCreate() hands back the
# existing session when there is one.
spark = SparkSession.builder.getOrCreate()

# Load the dumped card list into a Spark DataFrame.
df = spark.read.json('dataset/THB_cards.json')

## Filter out duplicate cards

In [9]:
# Bring only the two identifying columns over to pandas.
pd_names = df.select(['number', 'name']).toPandas()

# For each distinct name, np.unique also reports the position of its first
# occurrence (indices) and how many times it appears (counts).
unique_names, indices, counts = np.unique(
    pd_names['name'],
    return_index=True,
    return_counts=True,
)

In [10]:
# `indices` comes from np.unique(..., return_index=True), i.e. positions into the name
# column, so rows are selected by position. Label-based .loc only matches positions
# while the frame carries the default 0..n-1 index.
pd_unique_names = pd_names.iloc[indices]

In [11]:
# Turn the pandas frame of unique (number, name) rows back into a Spark DataFrame.
df_filter = spark.createDataFrame(data=pd_unique_names)

In [12]:
# Left-join the full card data onto the unique (number, name) rows, then drop the
# 'name' column that came from df_filter so that only df's 'name' remains.
df_filtered = (
    df_filter
    .join(df, how='left', on='number')
    .drop(df_filter.name)
)

In [13]:
# Number of rows left after the duplicate filter.
num_cards = df_filtered.count()

In [14]:
# Report the size of the filtered set.
print('Final number of cards {}'.format(num_cards))

Final number of cards 273


## Filter the text

In [15]:
# Add a 'filtered_text' column computed by the UDF from each row's name and text.
df_filtered = df_filtered.withColumn(
    'filtered_text',
    udf_filter_text('name', 'text'),
)

## Join the color identity array into a single string, separated by ","

In [16]:
# Per card number, concatenate the colorIdentity array into one comma-separated string.
df_colorIdentities = (
    df_filtered
    .selectExpr(["number", "colorIdentity"])
    .select(
        'number',
        fn.expr("concat_ws(',', colorIdentity)").alias("str_colorIdentity"),
    )
)

In [17]:
# Attach the string form of the color identity to every card, matching on 'number'.
df_filtered = df_filtered.join(
    df_colorIdentities,
    on='number',
    how='inner',
)

## Encode the color identity to discrete ints 

### scikit-learn + pandas 

In [18]:
# import pandas as pd
# import sklearn
# from sklearn import preprocessing

In [19]:
# df_colorId = df_filtered.select("str_colorIdentity").distinct()

In [20]:
# pd_allColorIds = df_filtered.select("str_colorIdentity").toPandas()
# pd_colorIds = df_colorId.toPandas()

In [21]:
# le = preprocessing.LabelEncoder().fit(pd_colorIds)
# encoded_colorIds = le.transform(pd_allColorIds)

In [22]:
# https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer

In [23]:
# pd_colorIds = pd.DataFrame(encoded_colorIds, columns=['encoded_colorIdentity'])
# pd_colorIds['number'] = df_filtered.select("number").toPandas().astype('int32')

In [24]:
# df_encoded_colorIds = spark.createDataFrame(pd_colorIds)

In [25]:
# df_filtered = df_filtered.join(df_encoded_colorIds, on='number')

### spark + ml

In [32]:
from pyspark.ml.feature import StringIndexer

In [33]:
# Map each distinct color-identity string to a numeric index, with labels ordered
# alphabetically (ascending).
indexer = StringIndexer(
    inputCol="str_colorIdentity",
    outputCol="encoded_colorIdentity",
    stringOrderType='alphabetAsc',
)

# Fit on the card frame and append the index column to that same frame.
df_filtered = (
    indexer
    .fit(df_filtered)
    .transform(df_filtered)
)

In [34]:
# Peek at the first five string/index pairs.
(
    df_filtered
    .select("str_colorIdentity", "encoded_colorIdentity")
    .limit(5)
    .show()
)

+-----------------+---------------------+
|str_colorIdentity|encoded_colorIdentity|
+-----------------+---------------------+
|                G|                  6.0|
|                R|                 10.0|
|                R|                 10.0|
|                U|                 13.0|
|                W|                 15.0|
+-----------------+---------------------+



In [35]:
# List every distinct string/index pair produced by the indexer.
(
    df_filtered
    .select("str_colorIdentity", "encoded_colorIdentity")
    .distinct()
    .show()
)

+-----------------+---------------------+
|str_colorIdentity|encoded_colorIdentity|
+-----------------+---------------------+
|              B,U|                  4.0|
|                 |                  0.0|
|              G,W|                  9.0|
|              G,R|                  7.0|
|                U|                 13.0|
|                W|                 15.0|
|              U,W|                 14.0|
|                G|                  6.0|
|              G,U|                  8.0|
|              B,W|                  5.0|
|              B,G|                  2.0|
|              R,W|                 12.0|
|                B|                  1.0|
|              B,R|                  3.0|
|              R,U|                 11.0|
|                R|                 10.0|
+-----------------+---------------------+



# Create an SQL temporary view

In [36]:
# Expose the card frame to Spark SQL under the view name "cards".
df_filtered.createOrReplaceTempView(name="cards")

In [37]:
# Build the final projection from the "cards" temp view:
#   - number, convertedManaCost, encoded_colorIdentity, power and toughness are cast
#     to Integer;
#   - str_colorIdentity, text and filtered_text are renamed to colorIdentity,
#     originalText and filteredText;
#   - rarity and name are passed through unchanged.
# NOTE(review): the CAST(...) expressions carry no explicit alias, so their output
# column names are whatever Spark assigns — confirm they are the intended names.
tbl = spark.sql("""
    SELECT
        CAST(number as Integer), 
        rarity,
        name,
        CAST(convertedManaCost as Integer),
        CAST(encoded_colorIdentity as Integer),
        str_colorIdentity as colorIdentity,
        text as originalText,
        filtered_text as filteredText,
        CAST(power as Integer),
        CAST(toughness as Integer)
    FROM
        cards
""")

# Save to Parquet

In [38]:
# Write the table as Parquet under /tmp, replacing any previous output there.
tbl.write.parquet('/tmp/THB_cards.parquet', mode="overwrite")

In [39]:
## this fails
# tbl.write.mode("overwrite").parquet('dataset/THB_cards.parquet')

In [40]:
# Remove any previous copy of the output directory (the equivalent of `rm -rf`).
# Done in Python rather than as a bare shell command so the cell does not depend on
# IPython automagic or on POSIX tools; ignore_errors=True keeps it silent when the
# directory does not exist yet.
shutil.rmtree("dataset/THB_cards.parquet/", ignore_errors=True)

In [41]:
# Move the freshly written Parquet directory from /tmp into dataset/ (the equivalent of
# `mv`). Done in Python rather than as a bare shell command so the cell does not depend
# on IPython automagic or on POSIX tools. Because the destination is an existing
# directory, the source is placed inside it.
shutil.move("/tmp/THB_cards.parquet", "dataset/")