# Imports 

## Application-specific imports 

In [1]:
import sys

In [2]:
sys.path.append("../config/")
import config

In [3]:
sys.path.append("../metaflow/")
import preprocess_fn
import preprocess_fn_text_rules

## General 

In [4]:
import pickle
import itertools
import pandas as pd
import sklearn
from sklearn import preprocessing

In [5]:
import pyspark
import pyspark.sql.functions as fn
import pyspark.sql.types as t

# Load data from parquet

In [6]:
# Code identifying the dataset to process; it is interpolated into both the
# input and the output parquet file names.
keyruneCode = "M20"

In [7]:
# Load the cards dataset for the selected code from the artifacts directory.
# NOTE(review): `spark` is not created in the visible cells - presumably the
# session is provided by the notebook kernel; confirm.
df = spark.read.parquet(f'{config.ARTIFACTS}/dataset/{keyruneCode}_cards.parquet')

# Preprocess 

In [8]:
# Working handle for the preprocessing steps; no rows are filtered out here.
df_filtered = df

## Replace text with keywords based on a dictionary

In [9]:
# Remove a previously computed `text_features1` column, if there is one.
df_filtered = (
    df_filtered.drop("text_features1")
    if "text_features1" in df_filtered.columns
    else df_filtered
)

In [10]:
# Add `text_features1`, computed by the project UDF from the `name` and
# `originalText` columns.
# NOTE(review): presumably an array of keyword labels (it is later passed to
# array_union) - confirm against preprocess_fn.udf_text_to_keywords.
df_filtered = df_filtered.withColumn('text_features1', preprocess_fn.udf_text_to_keywords('name', 'originalText'))

In [11]:
# Remove a previously computed `text_features2` column, if there is one.
df_filtered = (
    df_filtered.drop("text_features2")
    if "text_features2" in df_filtered.columns
    else df_filtered
)

In [12]:
# One column expression per text rule: it yields the rule's keyword when the
# rule's regex finds a non-empty match in `originalText`, and '' otherwise.
from_patterns = [
    fn.when(
        fn.regexp_extract('originalText', r"{0}".format(regex), 0) != '',
        keyword,
    ).otherwise('')
    for regex, keyword in preprocess_fn_text_rules.text_patterns.items()
]

In [13]:
# Pack the per-rule expressions into one array column; rules that did not
# match contribute an empty string to the array.
df_filtered = df_filtered.withColumn('text_features2', fn.array(*from_patterns))

In [14]:
# Merge both feature arrays into `text_features`; array_union keeps each
# distinct element once, so repeated '' entries collapse into a single one.
df_filtered = df_filtered.withColumn('text_features', fn.array_union('text_features1', 'text_features2'))

In [15]:
# Inspect up to 100 distinct feature arrays, printed without truncation.
df_filtered.select("text_features").distinct().show(100, truncate=False)

+-----------------------------------------+
|text_features                            |
+-----------------------------------------+
|[ENTER_TAPPED, ETB_EFFECT, TAP, ]        |
|[YOUR_TURN, ]                            |
|[FLASH, FLYING, ]                        |
|[FLYING, VIGILANCE, ]                    |
|[TRAMPLE, ETB_EFFECT, ]                  |
|[FLYING, PAY_2, ]                        |
|[DEATHTOUCH, LIFELINK, ]                 |
|[ETB_EFFECT, PAY_5, ]                    |
|[VIGILANCE, ETB_EFFECT, ]                |
|[PAY_2, ]                                |
|[PAY_1, ETB_EFFECT, ]                    |
|[TAP, PAY_1, ]                           |
|[FLASH, FLYING, PAY_3, ]                 |
|[VIGILANCE, ]                            |
|[FLYING, ETB_EFFECT, PAY_3, ]            |
|[FLYING, PAY_5, ]                        |
|[PAY_5, ]                                |
|[PROTECTION_FROM_BLACK, PAY_2, ]         |
|[PAY_2, PAY_4, ]                         |
|[PROTECTION_FROM_GREEN, ]                |

# Fetch all the text features from all the cards into one list 

In [16]:
# Collect the `text_features` value of every row to the driver. Each selected
# row has exactly one field, so taking field 0 yields one entry per row.
all_text_feats = (
    df_filtered.select("text_features")
    .rdd
    .map(lambda row: row[0])
    .collect()
)

In [17]:
# Keep only the rows that carry at least one text feature. The truthiness test
# also skips a null array (collected as None), on which len() would raise.
filtered_text_feats = [feats for feats in all_text_feats if feats]

In [18]:
# Flatten the per-row lists into one flat list of feature labels.
filtered_text_feats = [feat for feats in filtered_text_feats for feat in feats]

## Encode the text features into ints

In [19]:
# Fit a label encoder on every collected feature label; it supplies the
# label -> integer mapping used when the features are vectorised.
label_encoder = preprocessing.LabelEncoder().fit(filtered_text_feats)

In [20]:
import pickle

In [21]:
# Persist the fitted encoder as a pickle file under config.TEMP.
with open(f"{config.TEMP}/labelencoder_text_feats.pkl", "wb") as fp:
    fp.write(pickle.dumps(label_encoder))

In [22]:
@fn.udf(returnType=t.ArrayType(t.IntegerType()))
def text_to_vector(text_features):
    """Encode one row's text-feature labels as integer codes.

    The codes come from the module-level ``label_encoder``, which must already
    be fitted when the UDF runs.

    Args:
        text_features: Sequence of feature labels for a single row. May be
            empty, or ``None`` when the array column is null.

    Returns:
        A list of ``int`` codes in the same order as the input labels, or an
        empty list when there are no labels.

    Raises:
        ValueError: If a label was not seen when ``label_encoder`` was fitted.
    """
    # A null array reaches the UDF as None; treat it like an empty array
    # instead of failing on len(None).
    if not text_features:
        return []

    labels = [str(item) for item in text_features]
    # One transform call per row rather than one per label; converting to
    # plain ints keeps NumPy integer types out of the returned list.
    return [int(code) for code in label_encoder.transform(labels)]

In [23]:
# Remove a previously computed `text_features_vect` column, if there is one.
df_filtered = (
    df_filtered.drop("text_features_vect")
    if "text_features_vect" in df_filtered.columns
    else df_filtered
)

In [24]:
# Add `text_features_vect`: the integer-encoded form of `text_features`,
# produced row by row by the text_to_vector UDF.
df_filtered = df_filtered.withColumn("text_features_vect", text_to_vector("text_features"))

In [25]:
# Collect the `text_features` value of every row to the driver. Each selected
# row has exactly one field, so taking field 0 yields one entry per row.
# NOTE(review): this repeats the collection done before the label encoder was
# fitted; the result is not used again in the visible cells.
all_text_feats = (
    df_filtered.select("text_features")
    .rdd
    .map(lambda row: row[0])
    .collect()
)

In [26]:
# Keep only the rows that carry at least one text feature. The truthiness test
# also skips a null array (collected as None), on which len() would raise.
filtered_text_feats = [feats for feats in all_text_feats if feats]

In [27]:
# Flatten the per-row lists into one flat list of feature labels.
filtered_text_feats = [feat for feats in filtered_text_feats for feat in feats]

In [28]:
# Register the frame as the temporary view `cards_features` so it can be
# queried with Spark SQL; an existing view of that name is replaced.
df_filtered.createOrReplaceTempView("cards_features")

In [29]:
# Read the temporary view back through Spark SQL. SELECT * takes every column,
# so `tbl` has the same columns as the registered frame.
tbl = spark.sql("""
    SELECT
        *
    FROM
        cards_features
""")

# Save to Parquet

In [30]:
# Write the result as parquet under config.TEMP; "overwrite" mode replaces any
# output already present at that path.
tbl.write.mode("overwrite").parquet(f"{config.TEMP}/{keyruneCode}_cards_text.parquet")