# Imports

## Application-specific imports 

In [1]:
import sys

In [2]:
sys.path.append("../config/")
import config

In [3]:
sys.path.append("../metaflow/")
import preprocess_fn

## General 

In [4]:
import json
import numpy as np

In [5]:
from pyspark.sql import functions as fn

# Load the set from JSON

In [6]:
df = spark.read.json('../dataset/M20_cards.json')

In [7]:
df.count()

345

# Preprocess 

## Filter out duplicate cards

In [8]:
df_filtered = preprocess_fn.remove_duplicate_cards(df)

In [9]:
num_cards = df_filtered.count()
print(f"Final number of cards {num_cards}")

Final number of cards 329


## Drop columns 

In [10]:
df_filtered = preprocess_fn.drop_columns(df_filtered)

## Filter the text

In [11]:
df_filtered = df_filtered.withColumn('filtered_text',
                                     preprocess_fn.udf_filter_text('name', 'text')
)

## Explode the selected arrays in a string, separated by "," 

In [12]:
df_filtered = preprocess_fn.explode_to_strs(df_filtered,
                                            ["colorIdentity", "types", "subtypes", "supertypes"]
)

## Encode newly created strings

In [13]:
df_filtered = preprocess_fn.encode_strings(df_filtered, ["rarity", "str_colorIdentity", "str_types", "str_subtypes", "str_supertypes"])

In [14]:
mv /tmp/pyspark/stringindexer* /mtgp/artifacts/spark_models/

mv: inter-device move failed: '/tmp/pyspark/stringindexer_model_rarity' to '/mtgp/artifacts/spark_models/stringindexer_model_rarity'; unable to remove target: Directory not empty
mv: inter-device move failed: '/tmp/pyspark/stringindexer_model_str_colorIdentity' to '/mtgp/artifacts/spark_models/stringindexer_model_str_colorIdentity'; unable to remove target: Directory not empty
mv: inter-device move failed: '/tmp/pyspark/stringindexer_model_str_subtypes' to '/mtgp/artifacts/spark_models/stringindexer_model_str_subtypes'; unable to remove target: Directory not empty
mv: inter-device move failed: '/tmp/pyspark/stringindexer_model_str_supertypes' to '/mtgp/artifacts/spark_models/stringindexer_model_str_supertypes'; unable to remove target: Directory not empty
mv: inter-device move failed: '/tmp/pyspark/stringindexer_model_str_types' to '/mtgp/artifacts/spark_models/stringindexer_model_str_types'; unable to remove target: Directory not empty
mv: inter-device move failed: '/tmp/pyspark/strin

## Count the number of colors

In [15]:
df_filtered = df_filtered.withColumn("num_colors", fn.size("colors"))

#  Create an SQL table

In [16]:
df_filtered.createOrReplaceTempView("cards")

In [17]:
tbl = spark.sql("""
    SELECT
        CAST(number as Integer), 
        rarity,
        name,
        CAST(convertedManaCost as Integer),
        CAST(num_colors as Integer) as numColors,
        str_colorIdentity as colorIdentity,
        CAST(encoded_str_colorIdentity as Integer) as encodedColorIdentity,
        str_types as types,
        CAST(encoded_str_types as Integer) as encodedTypes,
        str_subtypes as subTypes,
        CAST(encoded_str_subtypes as Integer) as encodedSubTypes,
        str_supertypes as superTypes,
        CAST(encoded_str_supertypes as Integer) as encodedSuperTypes,
        text as originalText,
        filtered_text as filteredText,
        CAST(power as Integer),
        CAST(toughness as Integer)
    FROM
        cards
""")

# Save to Parquet

In [18]:
tbl.write.mode("overwrite").parquet(f"{config.TEMP}/M20_cards.parquet")

In [19]:
## this fails
# tbl.write.mode("overwrite").parquet('dataset/THB_cards.parquet')

In [20]:
cp -R "/tmp/M20_cards.parquet" "../artifacts/dataset/"