# Load the set from JSON

In [None]:
import json
import numpy as np

In [None]:
json_file = 'dataset/THB.json'

In [None]:
# Read the full set export. JSON interchange files are UTF-8, so decode
# explicitly instead of relying on the platform's default encoding.
with open(json_file, 'r', encoding='utf-8') as fp:
    raw_data = json.load(fp)

# The card list is stored under the top-level 'cards' key.
cards = raw_data['cards']

# Number of card entries that were loaded.
n = len(cards)

# Dump the cards to a JSON file

In [None]:
# Serialise the card list on its own and write it out in one go.
with open('dataset/THB_cards.json', 'w') as cards_file:
    cards_file.write(json.dumps(cards))

# Clean the data 

In [None]:
import pyspark.sql.functions as fn
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType

In [None]:
# Phrase -> token substitutions applied to card rules text.
#
# Order matters: the replacements are applied in insertion order, so each
# keyword's long form (with reminder text) is listed before the bare keyword;
# otherwise the bare keyword would be replaced first and leave the reminder
# text behind.
rules = {
    # The text filter substitutes the card's own name with the bare token
    # CARDNAME (no braces) before these rules run, so the key uses the same
    # spelling.
    "When CARDNAME enters the battlefield": "ETB_EFFECT",

    "Flash (You may cast this spell any time you could cast an instant.)": "FLASH",
    "Flash": "FLASH",

    "Reach (This creature can block creatures with flying.)": "REACH",
    "Reach": "REACH",

    "Flying (This creature can't be blocked except by creatures with flying or reach.)": "FLYING",
    "Flying": "FLYING",

    "Haste (This creature can attack and {T} as soon as it comes under your control.)": "HASTE",
    "Haste": "HASTE",

    "Trample (This creature can deal excess combat damage to the player or planeswalker it's attacking.)": "TRAMPLE",
    "Trample": "TRAMPLE",

    "Vigilance (Attacking doesn't cause this creature to tap.)": "VIGILANCE",
    "Vigilance": "VIGILANCE",

    "Double strike (This creature deals both first-strike and regular combat damage.)": "DOUBLE_STRIKE",
    "Double strike": "DOUBLE_STRIKE",

    "Deathtouch (Any amount of damage this deals to a creature is enough to destroy it.)": "DEATHTOUCH",
    "Deathtouch": "DEATHTOUCH",

    "Protection from green (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything green.)": "PROTECTION_FROM_GREEN",
    "Protection from red (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything red.)": "PROTECTION_FROM_RED",
    "Protection from black (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything black.)": "PROTECTION_FROM_BLACK",
    "Protection from blue (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything blue.)": "PROTECTION_FROM_BLUE",
    "Protection from white (This creature can't be blocked, targeted, dealt damage, enchanted, or equipped by anything white.)": "PROTECTION_FROM_WHITE",
}

In [None]:
@udf
def udf_filter_text(name, text):
    """Normalise a card's rules text into keyword tokens.

    Occurrences of the card's own name are replaced by the literal
    ``CARDNAME``; afterwards every phrase in the module-level ``rules``
    mapping is replaced by its token, in the mapping's insertion order.

    Args:
        name: Card name to mask; skipped when empty or not a string.
        text: Rules text of the card.

    Returns:
        The rewritten text, or ``None`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None

    new_text = text
    # An empty needle would make str.replace insert the token between every
    # character, and a non-string needle raises TypeError, so mask the name
    # only when there is one.
    if isinstance(name, str) and name:
        new_text = new_text.replace(name, 'CARDNAME')

    # A single pass suffices: str.replace rewrites every occurrence, so there
    # is no need to repeat the whole rule set once per character of the text.
    for rule, replace in rules.items():
        new_text = new_text.replace(rule, replace)

    return new_text

In [None]:
df = spark.read.json('dataset/THB_cards.json')

In [None]:
df.count()

## Filter out duplicate cards

In [None]:
# Collect only the (number, name) pairs into a pandas frame.
pd_names = df.select(['number', 'name']).toPandas()
# For every distinct name, np.unique also reports the position of its first
# occurrence (indices) and how many rows carry that name (counts).
unique_names, indices, counts = np.unique(pd_names['name'], return_index=True, return_counts=True)

In [None]:
pd_unique_names = pd_names.loc[indices]

In [None]:
df_filter = spark.createDataFrame(pd_unique_names)

In [None]:
# Left-join the full card rows from df onto df_filter by 'number', then drop
# the name column that came from df_filter so that only df's name is kept.
df_filtered = df_filter.join(df, on='number', how='left').drop(df_filter.name)

In [None]:
num_cards  = df_filtered.count()

In [None]:
print(f'Final number of cards {num_cards}')

## Drop columns 

In [None]:
keep_cols = ['colorIdentity','convertedManaCost','colors','manaCost','name','number','text','power','rarity','subtypes','supertypes','toughness', 'types']

In [None]:
remove_cols = list(set(df.columns) - set(keep_cols))

In [None]:
df_filtered = df_filtered.drop(*remove_cols)

## Filter the text

In [None]:
df_filtered = df_filtered.withColumn('filtered_text', udf_filter_text('name', 'text'))

## Join the selected array columns into strings, separated by ","

In [None]:
def explode_to_strs(df, cols):
    """Add a comma-joined string column for each listed array column.

    For every name in ``cols`` a column ``str_<name>`` is appended whose
    value is ``concat_ws(',', <name>)``.

    Args:
        df: DataFrame that contains the columns named in ``cols``.
        cols: Iterable of column names to join into strings.

    Returns:
        The DataFrame with one additional ``str_<name>`` column per entry
        of ``cols``.
    """
    for col_name in cols:
        # Derive the column in place rather than projecting it separately and
        # joining back on 'number': no shuffle per column, and rows are
        # neither multiplied by repeated numbers nor lost on null numbers.
        df = df.withColumn(f"str_{col_name}", fn.expr(f"concat_ws(',', {col_name})"))
    return df

In [None]:
df_filtered = explode_to_strs(df_filtered, ["colorIdentity", "types", "subtypes", "supertypes"])

## Encode newly created strings

### scikit-learn + pandas 

In [None]:
# import pandas as pd
# import sklearn
# from sklearn import preprocessing

In [None]:
# df_colorId = df_filtered.select("str_colorIdentity").distinct()

In [None]:
# pd_allColorIds = df_filtered.select("str_colorIdentity").toPandas()
# pd_colorIds = df_colorId.toPandas()

In [None]:
# le = preprocessing.LabelEncoder().fit(pd_colorIds)
# encoded_colorIds = le.transform(pd_allColorIds)

In [None]:
# https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer

In [None]:
# pd_colorIds = pd.DataFrame(encoded_colorIds, columns=['encoded_colorIdentity'])
# pd_colorIds['number'] = df_filtered.select("number").toPandas().astype('int32')

In [None]:
# df_encoded_colorIds = spark.createDataFrame(pd_colorIds)

In [None]:
# df_filtered = df_filtered.join(df_encoded_colorIds, on='number')

### spark + ml

In [None]:
from pyspark.ml.feature import StringIndexer, IndexToString

In [None]:
def encode_strings(df, cols, save_dir="/tmp/pyspark"):
    """Index string columns and persist the indexers used to do so.

    For every name in ``cols`` a ``StringIndexer`` (alphabetical ascending
    order) is fitted on ``df`` and applied, adding an ``encoded_<name>``
    column. Both the unfitted indexer and the fitted model are saved under
    ``save_dir``.

    Args:
        df: DataFrame that contains the columns named in ``cols``.
        cols: Iterable of string column names to encode.
        save_dir: Directory that receives ``stringindexer_<name>`` and
            ``stringindexer_model_<name>``. Defaults to ``/tmp/pyspark``.

    Returns:
        The DataFrame with one additional ``encoded_<name>`` column per
        entry of ``cols``.
    """
    for col in cols:
        indexer = StringIndexer(inputCol=col, outputCol=f"encoded_{col}", stringOrderType='alphabetAsc')
        model = indexer.fit(df)
        df = model.transform(df)

        # Plain save() raises when the target path already exists, which makes
        # re-running the cell fail; overwrite any previous artefacts instead.
        indexer.write().overwrite().save(f"{save_dir}/stringindexer_{col}")
        model.write().overwrite().save(f"{save_dir}/stringindexer_model_{col}")
    return df

In [None]:
df_filtered = encode_strings(df_filtered, ["rarity", "str_colorIdentity", "str_types", "str_subtypes", "str_supertypes"])

In [None]:
df_filtered.select(["types", "str_types", "encoded_str_types"]).distinct().show()

In [None]:
rm -rf "models/pyspark/"

In [None]:
mv "/tmp/pyspark/" "models/"

In [None]:
# df_filtered.select(["str_colorIdentity", "encoded_colorIdentity"]).limit(5).show()

In [None]:
# df_filtered.select(["str_colorIdentity", "encoded_colorIdentity"]).distinct().show()

## Count the number of colors

In [None]:
df_filtered = df_filtered.withColumn("num_colors", fn.size("colors"))

## Clean up the dataframe / drop columns

#  Create an SQL table

In [None]:
df_filtered.createOrReplaceTempView("cards")

In [None]:
# Project the final table from the "cards" view: numeric fields are cast to
# Integer, the str_*/encoded_* helper columns are renamed to camelCase, and
# both the original and the filtered rules text are kept.
tbl = spark.sql("""
    SELECT
        CAST(number as Integer), 
        rarity,
        name,
        CAST(convertedManaCost as Integer),
        CAST(num_colors as Integer) as numColors,
        str_colorIdentity as colorIdentity,
        CAST(encoded_str_colorIdentity as Integer) as encodedColorIdentity,
        str_types as types,
        CAST(encoded_str_types as Integer) as encodedTypes,
        str_subtypes as subTypes,
        CAST(encoded_str_subtypes as Integer) as encodedSubTypes,
        str_supertypes as superTypes,
        CAST(encoded_str_supertypes as Integer) as encodedSuperTypes,
        text as originalText,
        filtered_text as filteredText,
        CAST(power as Integer),
        CAST(toughness as Integer)
    FROM
        cards
""")

# Save to Parquet

In [None]:
tbl.write.mode("overwrite").parquet('/tmp/THB_cards.parquet')

In [None]:
## this fails
# tbl.write.mode("overwrite").parquet('dataset/THB_cards.parquet')

In [None]:
rm -rf "dataset/THB_cards.parquet/"

In [None]:
mv "/tmp/THB_cards.parquet" "dataset/"