<a href="https://colab.research.google.com/github/luasampaio/CienciaDados/blob/main/ClassificarAleatoriamente.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession # Import SparkSession

# Define the pizza toppings and their costs
data = [
    ("Pepperoni", 0.50),
    ("Sausage", 0.70),
    ("Chicken", 0.55),
    ("Extra Cheese", 0.40),
]

# Create a SparkSession
spark = SparkSession.builder.appName("PizzaToppings").getOrCreate() # Create a SparkSession


In [4]:
# Create a DataFrame from the input data
toppings_df = spark.createDataFrame(data, ["topping_name", "ingredient_cost"])
toppings_df.show() # Changed from display() to show() to correctly display the DataFrame

+------------+---------------+
|topping_name|ingredient_cost|
+------------+---------------+
|   Pepperoni|            0.5|
|     Sausage|            0.7|
|     Chicken|           0.55|
|Extra Cheese|            0.4|
+------------+---------------+



# CRIANDO COMBINAÇÕES POSSIVEIS

In [6]:
from pyspark.sql.functions import *

# Create combinations of three toppings
combinations = (
    toppings_df.alias("p1")
    .join(toppings_df.alias("p2"), col("p1.topping_name") < col("p2.topping_name"))
    .join(toppings_df.alias("p3"), col("p2.topping_name") < col("p3.topping_name"))
    .select(
        concat(
            col("p1.topping_name"),
            lit(","),
            col("p2.topping_name"),
            lit(","),
            col("p3.topping_name"),
        ).alias("pizza"),
        (
            col("p1.ingredient_cost")
            + col("p2.ingredient_cost")
            + col("p3.ingredient_cost")
        ).alias("total_cost"),
    )
)

# Round up the total cost
result = combinations.withColumn("total_cost", round(col("total_cost"), 2))
# Ordenando de forma descrecente
result.orderBy(col("total_cost").desc()).show()

+--------------------+----------+
|               pizza|total_cost|
+--------------------+----------+
|Chicken,Pepperoni...|      1.75|
|Chicken,Extra Che...|      1.65|
|Extra Cheese,Pepp...|       1.6|
|Chicken,Extra Che...|      1.45|
+--------------------+----------+



Cria View


In [13]:
toppings_df.createOrReplaceTempView("pizza_toppings")

In [21]:
from pyspark.sql import SparkSession

query = """
WITH Toppings AS (
  SELECT
    topping_name,
    ingredient_cost
  FROM
    pizza_toppings
)
SELECT
  CONCAT(
    p1.topping_name,
    ',',
    p2.topping_name,
    ',',
    p3.topping_name
  ) AS pizza,
  ROUND(
    (
      p1.ingredient_cost + p2.ingredient_cost + p3.ingredient_cost
    ),
    2
  ) AS total_cost
FROM
  Toppings AS p1
  INNER JOIN Toppings AS p2 ON p1.topping_name < p2.topping_name
  INNER JOIN Toppings AS p3 ON p2.topping_name < p3.topping_name
ORDER BY
  total_cost DESC;
"""
spark.sql(query).show()


+--------------------+----------+
|               pizza|total_cost|
+--------------------+----------+
|Chicken,Pepperoni...|      1.75|
|Chicken,Extra Che...|      1.65|
|Extra Cheese,Pepp...|       1.6|
|Chicken,Extra Che...|      1.45|
+--------------------+----------+

