<a href="https://colab.research.google.com/github/luasampaio/data-engineering/blob/main/41_AgruparPySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Qual é a ideia deste código em PySpark?

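- Agrupar os candidatos por experiência e contar, em cada grupo, o total de candidatos e quantos tiraram 100 em todas as avaliações que fizeram (notas nulas são ignoradas).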


In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook; the application is
# registered under the name "Assessments".
spark = (
    SparkSession.builder
    .appName("Assessments")
    .getOrCreate()
)

# Every column is a nullable integer, so the schema is generated from the
# ordered list of column names instead of spelling out each field.
column_names = ["id", "experience", "sql", "algo", "bug_fixing"]
schema = StructType(
    [StructField(column_name, IntegerType(), True) for column_name in column_names]
)

# One tuple per row, in schema order: (id, experience, sql, algo, bug_fixing).
# None marks a score that is missing for that row.
data = [
    (1, 3, 100, None, 50),
    (2, 5, None, 100, 100),
    (3, 1, 100, 100, 100),
    (4, 5, 100, 50, None),
    (5, 5, 100, 100, 100),
]

# Build the DataFrame from the rows and the explicit schema.
assessments_df = spark.createDataFrame(data, schema=schema)

# Print the table so the input can be checked by eye.
assessments_df.show()

+---+----------+----+----+----------+
| id|experience| sql|algo|bug_fixing|
+---+----------+----+----+----------+
|  1|         3| 100|NULL|        50|
|  2|         5|NULL| 100|       100|
|  3|         1| 100| 100|       100|
|  4|         5| 100|  50|      NULL|
|  5|         5| 100| 100|       100|
+---+----------+----+----+----------+



In [None]:
from pyspark.sql import functions as F

# Per experience value, count all rows and the rows whose every score is
# either exactly 100 or NULL (a NULL score does not disqualify the row).
score_columns = ["sql", "algo", "bug_fixing"]

# Combine the per-column checks with AND, left to right.
all_scores_max_or_null = None
for score_column in score_columns:
    score = F.col(score_column)
    max_or_null = score.eqNullSafe(100) | score.isNull()
    all_scores_max_or_null = (
        max_or_null
        if all_scores_max_or_null is None
        else all_scores_max_or_null & max_or_null
    )

total_candidate_count = F.count("*").alias("total_candidate")
# when() without otherwise() yields NULL for non-matching rows and count()
# skips NULLs, so only the matching rows are counted.
max_score_count = F.count(F.when(all_scores_max_or_null, 1)).alias("max_score_flag")

result_df = (
    assessments_df
    .groupBy("experience")
    .agg(total_candidate_count, max_score_count)
    .orderBy("experience")
)

# Display the aggregated result.
result_df.show()

+----------+---------------+--------------+
|experience|total_candidate|max_score_flag|
+----------+---------------+--------------+
|         1|              1|             1|
|         3|              1|             0|
|         5|              3|             2|
+----------+---------------+--------------+



In [None]:
# Register the DataFrame as a temporary view named "assessments" (replacing
# any existing view with that name) so it can be queried with spark.sql().
assessments_df.createOrReplaceTempView("assessments")

A mesma agregação, agora em Spark SQL

In [None]:
# Spark SQL query: per experience value, count all rows and the rows in which
# every score is either 100 or NULL. The CASE has no ELSE branch, so
# non-matching rows evaluate to NULL and are not counted by COUNT(...).
query = """
SELECT
    experience,
    COUNT(*) AS total_candidate,
    COUNT(
        CASE
            WHEN
                (sql = 100 OR sql IS NULL) AND
                (algo = 100 OR algo IS NULL) AND
                (bug_fixing = 100 OR bug_fixing IS NULL)
            THEN 1
        END
    ) AS max_score_flag
FROM assessments
GROUP BY experience
ORDER BY experience asc
"""

# Execute the query against the "assessments" view; this rebinds result_df
# to the SQL result.
result_df = spark.sql(query)

# Display the results
result_df.show()

+----------+---------------+--------------+
|experience|total_candidate|max_score_flag|
+----------+---------------+--------------+
|         1|              1|             1|
|         3|              1|             0|
|         5|              3|             2|
+----------+---------------+--------------+



In [None]:
# Print the schema of result_df to inspect the column types produced by the
# aggregation.
result_df.printSchema()

root
 |-- experience: integer (nullable = true)
 |-- total_candidate: long (nullable = false)
 |-- max_score_flag: long (nullable = false)



Convertendo os campos de contagem para inteiro


In [None]:
# Cast each count column to int, replacing the column under the same name,
# then print the schema to confirm the new types.
for count_column in ("max_score_flag", "total_candidate"):
    result_df = result_df.withColumn(count_column, result_df[count_column].cast("int"))

result_df.printSchema()

root
 |-- experience: integer (nullable = true)
 |-- total_candidate: integer (nullable = false)
 |-- max_score_flag: integer (nullable = false)



In [None]:
# Display result_df again after the integer casts.
result_df.show()

+----------+---------------+--------------+
|experience|total_candidate|max_score_flag|
+----------+---------------+--------------+
|         1|              1|             1|
|         3|              1|             0|
|         5|              3|             2|
+----------+---------------+--------------+

