**Short description**

Extract statistics and information about the keywords found in each card.

- - -

# Imports

In [1]:
import sys

In [2]:
sys.path.append("../config/")
import config

In [3]:
sys.path.append('../metaflow/')
import preprocess_fn_text_rules

## General

In [4]:
import pyspark.sql.functions as fn

In [5]:
import pyspark.sql.types as t

In [6]:
import pandas as pd
import numpy as np

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [9]:
import pickle

# Load from Parquet

In [10]:
# Code interpolated into the parquet file name read in the next cell.
# NOTE(review): presumably the keyrune/set code of the card set to analyse — confirm.
keyruneCode = "M20"

In [11]:
# Read the cached "<keyruneCode>_cards_text.parquet" file from config.TEMP into a Spark DataFrame.
# NOTE(review): `spark` is not defined in any visible cell; presumably the kernel
# provides a SparkSession under that name — confirm before running on a fresh kernel.
df = spark.read.parquet(f'{config.TEMP}/{keyruneCode}_cards_text.parquet')

In [12]:
# Register df as the temporary view "cards_features" so it can be queried with Spark SQL.
# NOTE(review): no visible cell queries this view — confirm it is still needed.
df.createOrReplaceTempView("cards_features")

In [13]:
df.count()

329

# Groups 

## By text features 

In [14]:
# Restore the pickled encoder for the text-feature labels (used later through
# `lenc.inverse_transform`).
# NOTE: pickle.load can execute arbitrary code — only load files produced by this project.
lenc = None

encoder_path = f"{config.TEMP}/labelencoder_text_feats.pkl"
with open(encoder_path, "rb") as encoder_file:
    lenc = pickle.load(encoder_file)

In [15]:
# Remove any stale encoded column before exploding. The original guard tested the
# misspelled name "text_featuers_enc", so it never matched the real column.
# DataFrame.drop is a no-op for names absent from the schema, so no `if` is needed;
# the misspelled name is still dropped in case an older parquet file carries it.
df = df.drop("text_features_enc", "text_featuers_enc")

# One output row per element of text_features_vect, stored in text_features_enc.
df_enc = df.withColumn("text_features_enc", fn.explode(df.text_features_vect))

In [16]:
# Count rows per encoded text feature, keeping only codes greater than 0,
# ordered by the code.
df_count = (
    df_enc
    .select("text_features_enc", "colorIdentity")
    .where(fn.col("text_features_enc") > 0)
    .groupBy("text_features_enc")
    .count()
    .orderBy("text_features_enc", ascending=True)
)

In [17]:
df_count.count()

27

In [18]:
df_count.toPandas()

Unnamed: 0,text_features_enc,count
0,1,4
1,2,5
2,3,1
3,4,1
4,5,17
5,6,73
6,7,5
7,8,49
8,9,7
9,10,1


In [19]:
@fn.udf(returnType=t.StringType())
def resolve_encoded_label(label):
    """Map an encoded text-feature label back to its original label.

    The value is cast to ``int``, passed through ``lenc.inverse_transform``
    as a one-element list, and the single decoded result is returned as ``str``.
    """
    decoded = lenc.inverse_transform([int(label)])
    return str(decoded[0])

In [20]:
# Attach the decoded label as "name". DataFrame.drop returns a new DataFrame
# (the original call discarded it, so the guard did nothing) and is a no-op when
# the column is absent, so chaining it keeps this cell safe to re-run.
df_count = (
    df_count
    .drop("name")
    .withColumn('name', resolve_encoded_label('text_features_enc'))
)

In [21]:
# Collect the aggregated per-feature counts into a pandas DataFrame for lookup and plotting.
pd_count = df_count.toPandas()

In [22]:
pd_count

Unnamed: 0,text_features_enc,count,name
0,1,4,CANT_BE_COUNTER
1,2,5,DEATHTOUCH
2,3,1,DEFENDER
3,4,1,DOUBLE_STRIKE
4,5,17,ENTER_TAPPED
5,6,73,ETB_EFFECT
6,7,5,FLASH
7,8,49,FLYING
8,9,7,HASTE
9,10,1,IS_ATTACKING


In [24]:
# Invert text_rules (key -> value) into a value -> key dict, so the decoded
# feature names can be mapped back to the text_rules keys.
reverse_lookup = {
    rule_name: rule_text
    for rule_text, rule_name in preprocess_fn_text_rules.text_rules.items()
}

In [25]:
# Add the text_rules key for each feature name (modifies pd_count in place).
# Names with no entry in reverse_lookup map to NaN (pandas Series.map with a dict).
pd_count['resolved_names'] = pd_count['name'].map(reverse_lookup)

In [26]:
pd_count

Unnamed: 0,text_features_enc,count,name,resolved_names
0,1,4,CANT_BE_COUNTER,This spell can't be countered
1,2,5,DEATHTOUCH,Deathtouch
2,3,1,DEFENDER,Defender
3,4,1,DOUBLE_STRIKE,Double strike
4,5,17,ENTER_TAPPED,CARDNAME enters the battlefield tapped.
5,6,73,ETB_EFFECT,When CARDNAME enters the battlefield
6,7,5,FLASH,Flash
7,8,49,FLYING,Flying
8,9,7,HASTE,Haste
9,10,1,IS_ATTACKING,


In [None]:
# NOTE(review): stop_here is not defined in any visible cell; presumably this is a
# deliberate NameError to halt "Run All" at this point (the cells after it show no
# execution count). Confirm, and remove once the cells below are meant to run.
stop_here()

In [None]:
# Horizontal bar chart: one bar per feature name, bar length is its count.
fig, axes = plt.subplots(figsize=(13, 8))
sns.barplot(
    data=pd_count,
    x="count",
    y="name",
    orient="h",
    ax=axes,
)

In [None]:
# Count rows per (encoded text feature, colorIdentity) pair, keeping only codes
# greater than 0, ordered by the code.
# Fixes two defects: a stray "x" after the first line-continuation backslash
# (a SyntaxError), and a filter that referenced df['text_features_enc'] although
# that column is only created on df_enc.
df_count_mult = (
    df_enc
    .select("text_features_enc", "colorIdentity")
    .where(df_enc["text_features_enc"] > 0)
    .groupBy("text_features_enc", "colorIdentity")
    .count()
    .orderBy("text_features_enc", ascending=True)
)

In [None]:
# Attach the decoded label as "name" by applying the resolve_encoded_label UDF to each code.
df_count_mult = df_count_mult.withColumn('name', resolve_encoded_label('text_features_enc'))

In [None]:
# Collect the per-(feature, colorIdentity) counts into a pandas DataFrame for pivoting.
pd_count_mult = df_count_mult.toPandas()

In [None]:
pd_count_mult

## By color identity and text features

In [None]:
# Reshape to a name x colorIdentity matrix of counts for the heatmap.
# Keyword arguments are required: positional arguments to DataFrame.pivot were
# deprecated in pandas 1.5 and are rejected by pandas 2.x.
pivot = pd_count_mult.pivot(index="name", columns="colorIdentity", values="count")

In [None]:
# Heatmap of the pivoted counts: rows are feature names, columns are colorIdentity values.
# The original cell configured an `ax_box` axis that is never created (only ax_hist
# is returned by plt.subplots), so it raised NameError before drawing. Those calls,
# and the commented-out plots that targeted ax_box, are removed.
f, ax_hist = plt.subplots(1, figsize=(10, 10), dpi=100)

# Strip all four spines from the axes of the current figure.
sns.despine(top=True, bottom=True, left=True, right=True, offset=0.0)

sns.heatmap(
    pivot,
    square=True,
    cmap="YlGnBu",
    cbar=False,
    linewidths=1.0,
    annot=True,
    ax=ax_hist,
)