# Load the set from JSON

In [5]:
import json
import shutil

import numpy as np

In [6]:
# Path (relative to the notebook's working directory) of the raw set JSON that is loaded below.
json_file = 'dataset/THB.json'

In [7]:
# Read the raw set file. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open(json_file, 'r', encoding='utf-8') as fp:
    raw_data = json.load(fp)

# Keep only the value stored under the top-level "cards" key.
cards = raw_data['cards']

# Number of card records that were loaded.
n = len(cards)

# Dump a JSON containing only the cards 

In [8]:
# Serialize only the card records into their own file, which the Spark
# reader further down loads.
with open('dataset/THB_cards.json', 'w') as fp:
    fp.write(json.dumps(cards))

# Clean the data 

In [9]:
import pyspark.sql.functions as fn
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType

In [None]:
# customSchema = StructType([
#     StructField("text", StringType(), True)
# ])

# df = spark.read.json('dataset/THB_cards.json', schema=customSchema)

In [None]:
# Load the cards-only JSON into a Spark DataFrame, letting Spark infer the schema.
# NOTE(review): `spark` is not created in any visible cell — presumably the
# kernel provides a SparkSession under that name; confirm.
df = spark.read.json('dataset/THB_cards.json')

In [None]:
# Sanity check that the inferred schema exposes a `colorIdentity` column.
# This evaluates to a Column reference, not to the column's values.
df.colorIdentity

In [None]:
# Phrase -> token substitutions applied to card text by `udf_filter_text`.
#
# Order matters: the rules are applied in insertion order, so every keyword's
# reminder-text variant must come BEFORE the bare keyword. Otherwise the bare
# keyword would be replaced first and the longer phrase would never match.
#
# The card's own name has already been replaced by the bare token CARDNAME
# (no braces) when these rules run, so the enters-the-battlefield rule is
# keyed on that token.
rules = {
    "When CARDNAME enters the battlefield": "ETB_EFFECT",

    "Flash (You may cast this spell any time you could cast an instant.)": "FLASH",
    "Flash": "FLASH",

    "Reach (This creature can block creatures with flying.)": "REACH",
    "Reach": "REACH",

    "Flying (This creature can't be blocked except by creatures with flying or reach.)": "FLYING",
    "Flying": "FLYING",

    "Haste (This creature can attack and {T} as soon as it comes under your control.)": "HASTE",
    "Haste": "HASTE",

    "Trample (This creature can deal excess combat damage to the player or planeswalker it's attacking.)": "TRAMPLE",
    "Trample": "TRAMPLE",

    "Vigilance (Attacking doesn't cause this creature to tap.)": "VIGILANCE",
    # Same token as the reminder-text variant (previously had a stray leading space).
    "Vigilance": "VIGILANCE",

    "Double strike (This creature deals both first-strike and regular combat damage.)": "DOUBLE_STRIKE",
    "Double strike": "DOUBLE_STRIKE",

    "Deathtouch (Any amount of damage this deals to a creature is enough to destroy it.)": "DEATHTOUCH",
    "Deathtouch": "DEATHTOUCH",

    "Protection from green (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything green.)": "PROTECTION_FROM_GREEN",
    "Protection from red (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything red.)": "PROTECTION_FROM_RED",
    "Protection from black (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything black.)": "PROTECTION_FROM_BLACK",
    "Protection from blue (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything blue.)": "PROTECTION_FROM_BLUE",
    "Protection from white (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything white.)": "PROTECTION_FROM_WHITE"
}

In [None]:
@udf
def udf_filter_text(name, text):
    """Normalize a card's rules text into placeholder tokens.

    Every occurrence of the card's own name is replaced with ``CARDNAME``;
    then each phrase in the module-level ``rules`` mapping is replaced with
    its token, in the mapping's insertion order.

    Args:
        name: The card's name. Ignored when it is not a non-empty string.
        text: The card's rules text.

    Returns:
        The normalized text, or ``None`` when ``text`` is not a string
        (for example a null column value).
    """
    if not isinstance(text, str):
        return None

    new_text = text

    # Guard the name: str.replace(None, ...) raises, and replacing the empty
    # string would insert the token between every character.
    if isinstance(name, str) and name:
        new_text = new_text.replace(name, "CARDNAME")

    # A single pass over the rules is sufficient. Iterating over the text
    # itself walks it character by character and only repeats the same
    # replacements once per character.
    for rule, replacement in rules.items():
        new_text = new_text.replace(rule, replacement)

    return new_text

In [None]:
# Add a `filtered_text` column by running the UDF over each row's `name` and `text`.
df = df.withColumn('filtered_text', udf_filter_text('name', 'text'))

In [None]:
# Lookup frame: each card's number next to its `colorIdentity` values joined
# into one comma-separated string.
df_colorIdentities = df.select(
    'number',
    fn.concat_ws(',', 'colorIdentity').alias('str_colorIdentity'),
)

In [None]:
# Attach the comma-joined colour identity as a column computed on `df` itself.
# Joining `df` to a projection of itself on `number` is redundant: it would
# multiply rows whenever a number repeats and drop rows whose number is
# null, and re-running the cell would add a duplicate column. `withColumn`
# yields the same `str_colorIdentity` values without a join and is idempotent.
df = df.withColumn('str_colorIdentity', fn.concat_ws(',', 'colorIdentity'))

## Encode the color identity to discrete ints 

In [None]:
# Distinct colour-identity strings present in the data; these become the
# label vocabulary for the encoder fitted below.
df_colorId = df.select("str_colorIdentity").distinct()
# df_colorId.show()

In [None]:
import pandas as pd
import sklearn
from sklearn import preprocessing

In [None]:
# Bring the labels to the driver as pandas DataFrames:
# one label per card row, and the distinct labels.
pd_allColorIds = df.select("str_colorIdentity").toPandas()
pd_colorIds = df_colorId.toPandas()

In [None]:
# LabelEncoder expects 1-D label arrays, so pass the single label column as a
# Series instead of the whole one-column DataFrame.
le = preprocessing.LabelEncoder()
le.fit(pd_colorIds["str_colorIdentity"])

# Integer code for every card row, in the row order of `pd_allColorIds`.
encoded_colorIds = le.transform(pd_allColorIds["str_colorIdentity"])

In [None]:
# TODO: keep track of the label -> integer mapping (e.g. via `le.classes_`) so the encoding can be reversed later.

In [None]:
# Collect each card's number and label in ONE Spark action, then encode.
# Pairing labels and numbers from two separate toPandas() calls by position
# assumes both actions return rows in the same order, which Spark does not
# guarantee; a mismatch would silently attach codes to the wrong cards.
pd_cards = df.select("number", "str_colorIdentity").toPandas()
pd_colorIds = pd.DataFrame({
    'encoded_colorIdentity': le.transform(pd_cards["str_colorIdentity"]),
    'number': pd_cards["number"].astype('int32'),
})

In [None]:
# Turn the pandas lookup (encoded colour identity + card number) back into a Spark DataFrame.
df_encoded_colorIds = spark.createDataFrame(pd_colorIds)

In [None]:
# Peek at one row of the encoded lookup frame.
df_encoded_colorIds.first()

In [None]:
# Bring the integer colour-identity code onto the card rows by matching on `number`.
# NOTE(review): this assumes `number` identifies a card uniquely; repeated
# numbers would multiply rows — confirm against the data.
df = df.join(df_encoded_colorIds, on='number')

In [None]:
# df.first()

#  Create an SQL table

In [None]:
# Register the DataFrame under the name "cards" so it can be queried through spark.sql.
df.createOrReplaceTempView("cards")

In [None]:
# Select the columns to export, casting the numeric ones to integers.
# Every CAST carries an explicit alias so the output column names are fixed
# here rather than left to Spark's naming of un-aliased expressions.
# NOTE(review): non-numeric power/toughness values presumably become NULL
# under the cast — confirm this is the intended handling.
tbl = spark.sql("""
    SELECT
        CAST(number AS INT)                AS number,
        rarity,
        name,
        CAST(convertedManaCost AS INT)     AS convertedManaCost,
        CAST(encoded_colorIdentity AS INT) AS colorIdentity,
        filtered_text,
        CAST(power AS INT)                 AS power,
        CAST(toughness AS INT)             AS toughness
    FROM
        cards
""")

In [None]:
# Peek at one row to check the selected columns and casts.
tbl.first()

# Save to Parquet

In [None]:
# Write the selected columns as Parquet under /tmp, replacing any previous
# output there; the cells below move the result into dataset/.
tbl.write.mode("overwrite").parquet('/tmp/THB_cards.parquet')

In [None]:
## this fails
# tbl.write.mode("overwrite").parquet('dataset/THB_cards.parquet')

In [None]:
# Remove any previous Parquet output in dataset/ before moving the new one in.
# Pure Python instead of a bare shell command, so the cell does not depend on
# IPython automagic or on a POSIX `rm`. ignore_errors=True mirrors `rm -f`:
# a missing directory is not an error.
shutil.rmtree("dataset/THB_cards.parquet", ignore_errors=True)

In [None]:
# Move the freshly written Parquet directory from /tmp into dataset/.
# Because the destination is an existing directory, the source is placed
# inside it as dataset/THB_cards.parquet, matching what `mv` does.
shutil.move("/tmp/THB_cards.parquet", "dataset/")