In [1]:
import os
import sys
from pyspark.sql import SparkSession

# Point PySpark at the local Spark distribution. A raw string keeps the Windows
# backslashes literal: "\s" is not a valid escape sequence, so the non-raw form
# triggers an invalid-escape warning on current Python versions.
# NOTE(review): this is a machine-specific absolute path; adjust per environment.
os.environ['SPARK_HOME'] = r"C:\spark\spark-3.5.1-bin-hadoop3"
# Set PYSPARK_PYTHON to the interpreter that is running this notebook.
os.environ['PYSPARK_PYTHON'] = sys.executable


# Create (or reuse) a Spark session
spark = (
    SparkSession.builder
    .appName("example-app")
    .getOrCreate()
)


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

def get_product_category_pairs_and_orphan_products(products_df, categories_df, product_categories_df):
    """Return product/category name pairs and the products that have no category.

    Args:
        products_df: DataFrame with ``product_id`` and ``product_name`` columns.
        categories_df: DataFrame with ``category_id`` and ``category_name`` columns.
        product_categories_df: Link DataFrame with ``product_id`` and
            ``category_id`` columns (one row per product/category association).

    Returns:
        A tuple ``(product_category_pairs, orphan_products)``:

        * ``product_category_pairs`` has columns ``product_name`` and
          ``category_name``, one row per link whose product and category both
          exist (inner joins).
        * ``orphan_products`` has a single ``product_name`` column with every
          product that has no link row carrying a non-null ``category_id``.
    """
    product_category_pairs = (
        product_categories_df
        .join(products_df, product_categories_df.product_id == products_df.product_id, "inner")
        .join(categories_df, product_categories_df.category_id == categories_df.category_id, "inner")
        .select(products_df.product_name, categories_df.category_name)
    )

    # Only link rows that actually name a category count as "having a category".
    # Filtering them first keeps products whose links are all NULL in the orphan
    # list, while a stray NULL link no longer marks a categorized product as an
    # orphan.
    effective_links = product_categories_df.filter(
        product_categories_df.category_id.isNotNull()
    )

    # An anti join keeps each product row that has no matching link; unlike a
    # left join followed by a NULL filter, it cannot duplicate product rows.
    orphan_products = products_df.join(
        effective_links, on="product_id", how="left_anti"
    ).select("product_name")

    return product_category_pairs, orphan_products

# Example: build small sample DataFrames and run the function on them
products_data = [
    (1, 'Product1'),
    (2, 'Product2'),
    (3, 'Product3'),
]
categories_data = [
    (1, 'Category1'),
    (2, 'Category2'),
]
# (product_id, category_id) links; product 3 is deliberately left without one
product_categories_data = [
    (1, 1),
    (2, 1),
    (2, 2),
]

products_df = spark.createDataFrame(
    products_data, schema=["product_id", "product_name"]
)
categories_df = spark.createDataFrame(
    categories_data, schema=["category_id", "category_name"]
)
product_categories_df = spark.createDataFrame(
    product_categories_data, schema=["product_id", "category_id"]
)

product_category_pairs, orphan_products = get_product_category_pairs_and_orphan_products(
    products_df,
    categories_df,
    product_categories_df,
)

# Print both results: the name pairs first, then the products without a category
product_category_pairs.show()
orphan_products.show()


+------------+-------------+
|product_name|category_name|
+------------+-------------+
|    Product2|    Category1|
|    Product1|    Category1|
|    Product2|    Category2|
+------------+-------------+

+------------+
|product_name|
+------------+
|    Product3|
+------------+

