# Application-specific

In [None]:
import sys

In [None]:
sys.path.append("../config/")
import config

## General

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

In [None]:
from IPython.display import display, HTML

In [None]:
import pyspark.sql.functions as fn
import pyspark.sql.types as t

# Load from Parquet

In [None]:
df = spark.read.parquet(f'{config.OUTPUT_DATASET}/M20_cards.parquet')

In [None]:
# df.createOrReplaceTempView("cards")

# Descriptive Statistics 

## Total number of cards 

In [None]:
# Row count of the loaded set, duplicates included.
print(f'Total cards in the set: {df.count()}')

In [None]:
# Row count after dropping fully identical rows; labelled as "distinct" so the
# output is not mistaken for the raw row count.
print(f'Total distinct cards in the set: {df.distinct().count()}')

## Types

In [None]:
df_result = df.select(fn.col('types').alias('Permanent Type')).distinct()
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
card_sum = df.groupBy("types").count().agg(
    fn.sum("count")
)

In [None]:
# The aggregate is a one-row, one-column frame: pick the single cell
# positionally instead of calling int() on a whole pandas row (a Series).
card_sum = int(card_sum.toPandas().iloc[0, 0])

In [None]:
# Number of cards per value of "types", pulled into pandas for display/export.
# The alias becomes the column header of the exported table.
df_result = df.groupBy(fn.col('types').alias('Permanent Type')).count()
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
with open(f"{config.ARTIFACTS}/EDA/perma_types.md", "w") as fp:
    pd_df.to_markdown(fp)

In [None]:
bcast = sc.broadcast(card_sum)

@fn.udf(returnType=t.FloatType())
def udf_count_to_perc(count):
    """Return ``count`` divided by the broadcast card total.

    Registered as a Spark UDF with a FloatType result. The denominator is
    read from the ``bcast`` broadcast variable, so the result is a fraction
    of that total rather than a value scaled to 100.
    """
    total_cards = bcast.value
    return count / total_cards

In [None]:
df_result = df.groupBy(fn.col("types").alias('Permanent Type')).count().withColumn('perc', fn.format_number(udf_count_to_perc('count'), 2))
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
with open(f"{config.ARTIFACTS}/EDA/perma_types_stats.md", "w") as fp:
    pd_df.to_markdown(fp)

In [None]:
df_result = df.groupBy("encodedTypes").count()
pd_df = df_result.toPandas()

In [None]:
pd_df

## Power & Toughness

In [None]:
df_result = df.select("name", "power", "toughness").where("encodedTypes == 2")
df_result = df_result.na.fill(0)
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
with open(f"{config.ARTIFACTS}/EDA/creatures_power_toughness.md", "w") as fp:
    pd_df.to_markdown(fp)

In [None]:
ax = sns.pairplot(pd_df, height=4, kind='reg')

In [None]:
ax.savefig(f"{config.ARTIFACTS}/EDA/power_toughness_pairplot.png")

In [None]:
# df_result.corr("power", "toughness", method="pearson")

In [None]:
pd_df.corr(method="spearman")

In [None]:
# Export the Spearman correlation matrix itself. pd_df still holds the raw
# creature table at this point, so writing pd_df would put the wrong data in
# the file. Only numeric columns are correlated, which keeps the "name"
# column out of the computation on every pandas version.
pd_corr = pd_df.select_dtypes("number").corr(method="spearman")
with open(f"{config.ARTIFACTS}/EDA/creatures_power_toughness_correlation_spearman.md", "w") as fp:
    pd_corr.to_markdown(fp)

In [None]:
df.agg({"toughness": "mean"}).collect()

In [None]:
# Min, max and two-decimal formatted mean of "power" over the whole frame,
# collected as a one-row pandas table whose row label is 'power'.
df_result = df.agg(
    fn.min("power").alias("min"),
    fn.max("power").alias("max"),
    fn.format_number(fn.mean("power"), 2).alias("avg"),
)
pd_df_power = df_result.toPandas().rename(index={0: 'power'})

In [None]:
# Min, max and two-decimal formatted mean of "toughness" over the whole frame,
# collected as a one-row pandas table whose row label is 'toughness'.
df_result = df.agg(
    fn.min("toughness").alias("min"),
    fn.max("toughness").alias("max"),
    fn.format_number(fn.mean("toughness"), 2).alias("avg"),
)
pd_df_toughness = df_result.toPandas().rename(index={0: 'toughness'})

In [None]:
pd.concat([pd_df_power, pd_df_toughness])

In [None]:
# Export the combined min/max/avg table. pd_df does not hold these aggregates
# here, so the two per-attribute rows are stacked explicitly before writing.
pd_df_stats = pd.concat([pd_df_power, pd_df_toughness])
with open(f"{config.ARTIFACTS}/EDA/creatures_power_toughness_agg_min_max_avg.md", "w") as fp:
    pd_df_stats.to_markdown(fp)

## Length of name 

In [None]:
df_result = df.selectExpr("name").select('name', fn.length("name").alias("length"))
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
# Split every card name on runs of whitespace and count the resulting tokens.
# The pattern is a raw string: "\s" is not a valid escape in a normal string
# literal and triggers a warning on current Python versions.
df_result = (
    df.selectExpr("name")
    .select("name", fn.split("name", r"\s+").alias("tokens"))
    .select("name", "tokens", fn.size("tokens").alias("# tokens"))
)
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
with open(f"{config.ARTIFACTS}/EDA/cards_names_length_and_tokens.md", "w") as fp:
    pd_df.to_markdown(fp)

## Length of text 

In [None]:
df_result = df.selectExpr("name", "originalText").\
    select(fn.col("name").alias("card name"), fn.length("originalText").alias("length")).\
    orderBy(fn.desc("length"))
df_result = df_result.na.fill(value=0, subset=['length'])
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
with open(f"{config.ARTIFACTS}/EDA/cards_text_length.md", "w") as fp:
    pd_df.to_markdown(fp)

## Mana costs 

In [None]:
df_result = df.select(fn.col("name").alias("card name"),
                      fn.col("convertedManaCost").alias("converted mana cost")).\
                orderBy(fn.desc("converted mana cost"))
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
with open(f"{config.ARTIFACTS}/EDA/cards_cmc.md", "w") as fp:
    pd_df.to_markdown(fp)

In [None]:
max_cmc = pd_df['converted mana cost'].max()

In [None]:
# Highest converted mana cost as a plain int, so it can drive range() even if
# the column comes back from Spark as a float.
top_cmc = int(pd_df['converted mana cost'].max())

# One unit-wide bin per mana cost. The extra right edge matters: the last
# histogram bin is closed on both sides, so with edges 0..top_cmc the two
# highest costs would be merged into a single bar.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot is
# its successor — confirm the installed version before switching.
ax = sns.distplot(pd_df['converted mana cost'], bins=range(top_cmc + 2))
_ = ax.set_ylabel('% cards')
_ = ax.set_title('M20 Set')
_ = ax.set_xticks(range(top_cmc + 1))

In [None]:
ax.figure.savefig(f"{config.ARTIFACTS}/EDA/cards_cmc_plot.png")

# Groups 

## By color identity

In [None]:
df_result = df.groupby(fn.col('colorIdentity').alias('Color Identity')).count()
pd_df = df_result.toPandas()

In [None]:
pd_df

In [None]:
with open(f"{config.ARTIFACTS}/EDA/cards_cmc_groups_count.md", "w") as fp:
    pd_df.to_markdown(fp)

In [None]:
df_result = df.groupby('colorIdentity').count().agg(
    fn.sum("count").alias("total number")
).collect()

In [None]:
print(df_result)

## From Models, show encoded types

In [None]:
from pyspark.ml.feature import StringIndexer, StringIndexerModel

In [None]:
indexer = StringIndexer.load(f"{config.SPARK_MODELS}/stringindexer_str_types")
model = StringIndexerModel.load(f"{config.SPARK_MODELS}/stringindexer_model_str_types")

In [None]:
# model.stringOrderType

In [None]:
model.labels

In [None]:
# Show the smallest and largest encoded type id present in the data.
df.agg(
    fn.min(df.encodedTypes),
    fn.max(df.encodedTypes),
).show()

## Make an overall histogram from encoded types. 

In [None]:
tmp = df.select("encodedTypes").rdd.flatMap(lambda x: x)

In [None]:
tmp.min(), tmp.max()

In [None]:
hist = df.select("encodedTypes").rdd.flatMap(lambda x: x).histogram(8)

In [None]:
hist

In [None]:
pd_hist = pd.DataFrame(data=list(zip(*hist)), columns=['bin', 'freq'])

In [None]:
pd_hist['perc'] = pd_hist['freq'] / pd_hist['freq'].sum()

In [None]:
sns.set(style="whitegrid")

f, ax = plt.subplots(figsize=(6, 6))

sns.set_color_codes("pastel")
sns.barplot(x="perc", y="bin", data=pd_hist, label="Total", orient='h', color="b")

# ax.set(xlim=(0, 0.4), xlabel="", ylabel="")
ax.set(xlim=(0, 0.5), xlabel="% of total cards in the Set", ylabel="Card Type")
ax.set(yticklabels=model.labels)

sns.despine(left=True, bottom=True)

In [None]:
ax.figure.savefig(f'{config.ARTIFACTS}/EDA/cards_types_hist.png')

## By color identity and encoded type

First, let's fetch the distinct color identities

In [None]:
# Distinct color identities, sorted, as a pandas frame. The column keeps its
# Spark name "colorIdentity": the former .alias("Color_Identity") call was
# applied to the DataFrame (a dataset alias), not to the column, so it never
# renamed anything and is dropped here.
pd_colorIdentities = (
    df.select("colorIdentity")
    .distinct()
    .sort("colorIdentity")
    .toPandas()
)

In [None]:
pd_colorIdentities

In [None]:
pd_encodedTypes = pd.DataFrame(np.arange(8), columns=['Encoded_Types'])

In [None]:
pd_encodedTypes

Create a cartesian product of the color identities and the encoded types.

In [None]:
pd_colorIdentities['key'] = 0
pd_encodedTypes['key'] = 0

In [None]:
pd_cartesian = pd_colorIdentities.merge(pd_encodedTypes, how='outer').drop(columns=['key'])

In [None]:
pd_cartesian = pd_cartesian.rename(columns={'colorIdentity':'Color_Identity'})

In [None]:
pd_cartesian

Now, let's create a list which will hold the following: ((color identity, encoded type), 1).

In [None]:
map1 = df.select(["colorIdentity", "encodedTypes"]).rdd.map(
    lambda x: ((x[0], x[1]), 1)
)

map1.take(10)

Next, let's sum these counts to get the number of cards for each (color identity, encoded type) pair.

In [None]:
map2 = map1.reduceByKey(lambda a, b: a + b).sortByKey()

In [None]:
map2.take(5)

For convenience, let's flatten each ((color identity, encoded type), sum) entry into a (color identity, encoded type, sum) tuple.

In [None]:
map3 = map2.map(lambda x: (x[0][0], x[0][1], x[1]))

In [None]:
pd_tab = map3.toDF().toPandas()

In [None]:
pd_tab.columns = ['Color_Identity', 'Encoded_Types', 'Total']

In [None]:
pd_tab.head(5)

In [None]:
# Export the per (color identity, encoded type) totals held in pd_tab;
# pd_df holds a different table at this point.
with open(f"{config.ARTIFACTS}/EDA/cards_ci_type_counts.md", "w") as fp:
    pd_tab.to_markdown(fp)

Based on the color identity and encoded types, assign a key that corresponds to the cartesian product.

In [None]:
def assign_index(row):
    """Return the index label of the ``pd_cartesian`` row matching ``row``.

    A cartesian row matches when both its 'Color_Identity' and its
    'Encoded_Types' equal the values found in ``row``. The first matching
    label is returned; an IndexError is raised when nothing matches.
    """
    same_color = pd_cartesian['Color_Identity'] == row['Color_Identity']
    same_type = pd_cartesian['Encoded_Types'] == row['Encoded_Types']

    matching_labels = pd_cartesian.index[same_color & same_type].tolist()
    return matching_labels[0]

In [None]:
# Tag every row with the label of its matching row in the cartesian product.
pd_tab['Cart_Index'] = pd_tab.apply(assign_index, axis=1)

In [None]:
# pd_tab.max()

In [None]:
pd_tab.head(5)

Find the missing indexes from the cartesian product, and fill-in with "total = 0".

In [None]:
missing_cart_indexes = set(pd_cartesian.index.tolist()) - set(pd_tab.Cart_Index.tolist())

In [None]:
# Add one zero-total row for every (color identity, encoded type) combination
# that has no cards. All missing rows are built in one go and attached with a
# single pd.concat: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
if missing_cart_indexes:
    # missing_cart_indexes holds index labels, hence .loc; sorting makes the
    # row order independent of set iteration order.
    missing_rows = pd_cartesian.loc[sorted(missing_cart_indexes)].copy()
    missing_rows['Total'] = 0
    missing_rows['Cart_Index'] = missing_rows.index

    pd_tab = pd.concat([pd_tab, missing_rows])

In [None]:
group_result = pd_tab.groupby('Encoded_Types')

In [None]:
# Plot styling and color tables are identical for every group, so they are
# set up once instead of on every iteration.
sns.set(style="whitegrid")
sns.set_palette("pastel")

# Bar fill per color identity; '' is the key used for the empty identity.
colors = {
    '': 'lightgray',
    'W': 'w',
    'U': 'b',
    'B': 'k',
    'R': 'r',
    'G': 'g'
}

# Bar outline per color identity. 'none' is matplotlib's spelling for "draw no
# edge"; an empty string is not a valid color specification.
edge_colors = {
    '': 'none',
    'W': 'lightgray',
    'U': 'none',
    'B': 'none',
    'R': 'none',
    'G': 'none'
}

for group_id, (name, indices) in enumerate(group_result.indices.items()):
    # group_result.indices maps each group key to positional row numbers,
    # which is why iloc (not loc) is used.
    group = pd_tab.iloc[indices]

    # Only color identities that actually have cards of this type get a bar.
    f_group = group[group['Total'] > 0]

    labels = f_group['Color_Identity']
    totals = f_group['Total']

    for i, (label, val) in enumerate(zip(labels, totals)):
        plt.barh(i, val, color=colors[label], linewidth=1.0, alpha=1.0,
                 edgecolor=edge_colors[label])

    # NOTE(review): the title is looked up by the group's position in the
    # iteration, not by its key `name`; this assumes the encoded type ids are
    # 0..N-1 with none missing — confirm.
    plt.title(model.labels[group_id])
    plt.xlabel('Total')
    plt.ylabel('Color Identity')
    plt.yticks([])
    sns.despine(left=True, bottom=True)

    plt.show()

    # Only the first group is drawn. `group` and `group_id` keep their values
    # after the loop and are read again by the plotting cells further down.
    break

### Bars with Gradients

In [None]:
import matplotlib
from matplotlib.colors import LinearSegmentedColormap

def gradientbars(bars, color):
    """Fill each bar with a left-to-right gradient from white to ``color``.

    Each bar's own face is made transparent and raised to z-order 1, and a
    gradient image covering exactly the bar's rectangle is drawn beneath it
    at z-order 0.

    Args:
        bars: Sequence of bar patches sharing one axes (for example the
            container returned by ``barh``). An empty sequence is a no-op.
        color: End color of the gradient; anything accepted by
            ``matplotlib.colors.to_rgba``.

    Note:
        The callers in this file invoke ``axis('auto')`` after the bars are
        drawn, so this function does not try to preserve the axes limits.
    """
    if len(bars) == 0:
        return

    # A single row of 256 samples in [0, 1]: the image varies along x only.
    ramp = np.atleast_2d(np.linspace(0, 1, 256))
    white_to_color = LinearSegmentedColormap.from_list(
        'tmp',
        (matplotlib.colors.to_rgba('w'), matplotlib.colors.to_rgba(color)),
    )

    ax = bars[0].axes
    for bar in bars:
        bar.set_zorder(1)
        bar.set_facecolor("none")
        x, y = bar.get_xy()
        w, h = bar.get_width(), bar.get_height()
        ax.imshow(ramp, extent=[x, x + w, y, y + h], cmap=white_to_color,
                  aspect="auto", zorder=0)

In [None]:
# Color identities of the selected type group that have at least one card.
f_group = group[group['Total'] > 0]

labels = f_group['Color_Identity']
totals = f_group['Total']

sns.set_palette("pastel")

# Gradient end color per color identity; '' is the key for the empty identity.
colors = {
    '': 'lightgray',
    'W': 'w',
    'U': 'b',
    'B': 'k',
    'R': 'r',
    'G': 'g'
}

# Bar outline per color identity. 'none' is matplotlib's spelling for "draw no
# edge"; an empty string is not a valid color specification.
edge_colors = {
    '': 'none',
    'W': 'lightgray',
    'U': 'none',
    'B': 'none',
    'R': 'none',
    'G': 'none'
}

with sns.color_palette("pastel"):
    fig, ax = plt.subplots()

    for i, (label, val) in enumerate(zip(labels, totals)):
        color = colors[label]
        bar = ax.barh(i, val, color=color, linewidth=1.0, alpha=1.0,
                      edgecolor=edge_colors[label])

        # Swap the flat fill for a white-to-color gradient.
        gradientbars(bar, color)

    ax.set_title(model.labels[group_id])
    ax.set_xlabel('Total')
    ax.set_ylabel('Color Identity')
    ax.set_yticks([])
    # Re-autoscale after the gradient images have been added to the axes.
    ax.axis('auto')
    sns.despine(left=True, bottom=True)

    plt.show()

In [None]:
ax.figure.savefig(f'{config.ARTIFACTS}/EDA/cards_artifacts_hist_fx1.png')

### Bars with Gradient and Symbols

In [None]:
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

In [None]:
sns.set()

In [None]:
# Color identities of the selected type group that have at least one card.
f_group = group[group['Total'] > 0]

labels = f_group['Color_Identity']
totals = f_group['Total']

sns.set(style="whitegrid")
sns.set_palette("pastel")

# Gradient end color per color identity; '' is the key for the empty identity.
colors = {
    '': '#cac5c0ff',
    'W': '#f8f6d8ff',
    'U': '#c1d7e9ff',
    'B': '#bab1abff',
    'R': '#e49977ff',
    'G': '#a3c095ff'
}

# Symbol image placed at the tip of each bar, per color identity.
images = {
    '': '../assets/Mana/C.png',
    'W': '../assets/Mana/W.png',
    'U': '../assets/Mana/U.png',
    'B': '../assets/Mana/B.png',
    'R': '../assets/Mana/R.png',
    'G': '../assets/Mana/G.png'
}

fig, ax = plt.subplots()

for i, (label, val) in enumerate(zip(labels, totals)):
    color = colors[label]
    # 'none' is matplotlib's spelling for "draw no edge"; an empty string is
    # not a valid color specification.
    bar = ax.barh(i, val, color=color, height=0.60, linewidth=0.0, alpha=1.0,
                  edgecolor='none')

    # Anchor the symbol at the end of the bar (x = bar length, y = bar row).
    symbol = OffsetImage(plt.imread(images[label]), zoom=0.075)
    ax.add_artist(AnnotationBbox(symbol, (val, i), frameon=False))

    # Swap the flat fill for a white-to-color gradient.
    gradientbars(bar, color)

ax.set_title(model.labels[group_id])
ax.set_xlabel('# Cards')
ax.set_ylabel('Color Identity')
ax.set_yticks([])
ax.grid(linewidth=1.0, alpha=0.25)
sns.despine(left=True, bottom=True)
# Re-autoscale after the gradient images have been added to the axes.
ax.axis('auto')


plt.show()

In [None]:
ax.figure.savefig(f'{config.ARTIFACTS}/EDA/cards_artifacts_hist_fx2.png')

### Cumulative Bar

In [None]:
sns.set(style="whitegrid")

# One horizontal bar per entry of `totals`, each on its own row.
# `prev` is initialised but never advanced, so the bars are not stacked.
prev = 0
for i, (label, val) in enumerate(zip(labels, totals)):
    plt.barh(i, val)

plt.xlabel('Total')
plt.ylabel('Spell Type')

plt.show()