# Imports 

In [1]:
from pyspark.sql import DataFrame, SparkSession, Window
from pyspark.sql import functions as F

# Initialize 

In [2]:
# Build (or reuse) the notebook's Spark session against the master at
# spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    # Executors are capped at 512 MiB each.
    .config("spark.executor.memory", "512m")
    # Whole-stage code generation is switched off for this session.
    .config("spark.sql.codegen.wholeStage", False)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/03 05:29:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Handle to the SparkContext that backs the `spark` session.
sc = spark.sparkContext

# Main 

In [33]:
from pipelines.stage.schemas import SCHEMA_STAGE

In [5]:
# Load the staged DMU card dump, applying the explicit stage schema rather
# than letting Spark infer one from the JSON.
df = spark.read.schema(SCHEMA_STAGE).json("/data/stage/DMU.json")

In [6]:
# Columns retained from the staged frame; everything else is dropped below.
keep_cols = [
    "colorIdentity", "colors",
    "convertedManaCost", "loyalty", "manaCost", "manaValue",
    "name", "number",
    "originalType", "rarity",
    "subtypes", "supertypes",
    "text", "toughness",
    "type", "types",
    "power",
]

# Every column present in `df` that is not on the keep list.
remove_cols = list(set(df.columns).difference(keep_cols))

# Dropping (rather than selecting) tolerates keep-list names that are absent
# from the frame.
df_filtered = df.drop(*remove_cols)

In [7]:
# One window partition per card name, used below to keep a single row per name.
# NOTE(review): the ordering key equals the partition key, so all rows in a
# partition tie and row_number() picks an arbitrary one among them.
window = Window.partitionBy("name").orderBy(F.desc("name"))

In [8]:
# De-duplicate by card name: number the rows inside each `name` partition and
# keep only the first one, then discard the helper column.
# The input is the column-pruned `df_filtered` (not the raw `df`), so the
# pruning done in the previous cell is not thrown away and the window shuffle
# carries only the kept columns.
df_filtered = (
    df_filtered.withColumn("index", F.row_number().over(window))
    .filter(F.col("index") == 1)
    .drop("index")
)

In [9]:
# Final projection written to the store: the listed columns are passed through
# unchanged, and `power` is cast to an integer.
df_filtered = df_filtered.select(
    *(
        F.col(column_name)
        for column_name in (
            "colorIdentity",
            "colors",
            "convertedManaCost",
            "manaCost",
            "manaValue",
            "name",
            "originalType",
            "rarity",
            "subtypes",
            "supertypes",
            "text",
            "toughness",
            "type",
            "types",
        )
    ),
    F.col("power").cast("int"),
)

In [10]:
# Persist the cleaned set as Parquet, replacing any previous output at the path.
df_filtered.write.parquet("/data/store/DMU.parquet", mode="overwrite")

                                                                                

In [11]:
from pyspark.sql import types as T

In [34]:
from pipelines.clean.schemas import COLOR_SCHEMA

In [None]:
def enconde_colorIdentity(df):
    """Explode ``colorIdentity`` and attach the integer code of each colour.

    Args:
        df: DataFrame with a ``colorIdentity`` column that can be exploded.

    Returns:
        DataFrame with one row per (input row, colour) pair, holding
        ``row_id``, the original ``colorIdentity`` value, the exploded colour
        in ``col``, and the columns of the colour lookup table. The join is an
        inner join, so colours missing from the lookup table are dropped.
        No row limit is applied; call ``.limit()`` on the result to preview.

    Relies on the notebook-level ``spark`` session and ``COLOR_SCHEMA``.
    """
    # Lookup table: colour letter -> integer code, shaped by COLOR_SCHEMA.
    colors = spark.createDataFrame(
        [
            ["B", 1],
            ["U", 2],
            ["G", 3],
            ["R", 4],
            ["W", 5],
        ],
        schema=COLOR_SCHEMA,
    )

    # Tag each input row with an id before exploding so the per-colour rows
    # can be traced back to the row they came from.
    exploded = df.withColumn("row_id", F.monotonically_increasing_id()).select(
        F.col("row_id"),
        F.col("colorIdentity"),
        F.explode("colorIdentity"),  # un-aliased explode yields a column named "col"
    )

    # Join on the exploded frame's own column; the join previously referenced
    # an undefined name and raised NameError.
    # NOTE(review): assumes COLOR_SCHEMA names the letter field `color` — confirm.
    return exploded.join(
        F.broadcast(colors),
        on=[exploded["col"] == colors.color],
        how="inner",
    )

In [15]:
# List every distinct card type that occurs in the `types` arrays.
df_filtered.select(F.explode("types")).distinct().show()

+------------+
|         col|
+------------+
|        Land|
|     Sorcery|
|     Instant|
| Enchantment|
|    Artifact|
|    Creature|
|Planeswalker|
+------------+

