In [0]:
"""
https://www.youtube.com/watch?v=02XLUeIVRSE&list=PLBTZqjSKn0IfuIqbMIqzS-waofsPHMS0E&index=17
Child-adult pairing. Assumes there are always more adults than children, and that both the adults and the children are already sorted.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Obtain the Spark session explicitly instead of relying on a pre-injected
# `spark` global; getOrCreate() reuses the active session when one already
# exists, so this is safe on managed runtimes as well as on a fresh kernel.
spark = SparkSession.builder.getOrCreate()

# Sample people as (person id, type, age) tuples: five adults, four children.
data = [
    ('A1', 'Adult', 54),
    ('A2', 'Adult', 53),
    ('A3', 'Adult', 52),
    ('A4', 'Adult', 58),
    ('A5', 'Adult', 54),
    ('C1', 'Child', 20),
    ('C2', 'Child', 19),
    ('C3', 'Child', 22),
    ('C4', 'Child', 15),
]

# Explicit schema so that `age` is typed as an integer rather than inferred.
schema = StructType([
    StructField("person", StringType()),
    StructField("type", StringType()),
    StructField("age", IntegerType()),
])

df = spark.createDataFrame(data=data, schema=schema)
df.show()
df.printSchema()

+------+-----+---+
|person| type|age|
+------+-----+---+
|    A1|Adult| 54|
|    A2|Adult| 53|
|    A3|Adult| 52|
|    A4|Adult| 58|
|    A5|Adult| 54|
|    C1|Child| 20|
|    C2|Child| 19|
|    C3|Child| 22|
|    C4|Child| 15|
+------+-----+---+

root
 |-- person: string (nullable = true)
 |-- type: string (nullable = true)
 |-- age: integer (nullable = true)



In [0]:
# Adults, numbered 1..N in ascending order of their person id.
adult_df = (
    df.where(col("type") == "Adult")
    .withColumnRenamed("person", "person_a")
    .withColumn("rn_a", row_number().over(Window.orderBy("person_a")))
)
adult_df.show()

# Children, numbered 1..M in ascending order of their person id.
child_df = (
    df.where(col("type") == "Child")
    .withColumnRenamed("person", "person_c")
    .withColumn("rn_c", row_number().over(Window.orderBy("person_c")))
)
child_df.show()

+--------+-----+---+----+
|person_a| type|age|rn_a|
+--------+-----+---+----+
|      A1|Adult| 54|   1|
|      A2|Adult| 53|   2|
|      A3|Adult| 52|   3|
|      A4|Adult| 58|   4|
|      A5|Adult| 54|   5|
+--------+-----+---+----+

+--------+-----+---+----+
|person_c| type|age|rn_c|
+--------+-----+---+----+
|      C1|Child| 20|   1|
|      C2|Child| 19|   2|
|      C3|Child| 22|   3|
|      C4|Child| 15|   4|
+--------+-----+---+----+



In [0]:
# Pair the n-th adult with the n-th child by matching row numbers.
# The left join keeps every adult; an adult whose rank has no matching
# child gets a null person_c.
(
    adult_df
    .join(child_df, adult_df.rn_a == child_df.rn_c, "left")
    .select("person_a", "person_c")
    .show()
)

+--------+--------+
|person_a|person_c|
+--------+--------+
|      A1|      C1|
|      A2|      C2|
|      A3|      C3|
|      A4|      C4|
|      A5|    null|
+--------+--------+



In [0]:
"""
Variation on the problem: pair the oldest adult with the youngest child,
the second-oldest adult with the second-youngest child, and so on.
"""
# Rank adults from oldest to youngest. The person id is a secondary sort key:
# without it, adults sharing an age (A1 and A5 are both 54) would receive
# their row numbers in an arbitrary order, making the pairing non-deterministic.
adult_df = df.filter(col("type") == "Adult")\
    .withColumnRenamed("person", "person_a")\
    .withColumn("rn_a", row_number().over(
        Window.orderBy(col("age").desc(), col("person_a"))))
adult_df.show()

# Rank children from youngest to oldest, with the same person-id tie-breaker
# so equal ages cannot reorder between runs.
child_df = df.filter(col("type") == "Child")\
    .withColumnRenamed("person", "person_c")\
    .withColumn("rn_c", row_number().over(
        Window.orderBy(col("age"), col("person_c"))))
child_df.show()


+--------+-----+---+----+
|person_a| type|age|rn_a|
+--------+-----+---+----+
|      A4|Adult| 58|   1|
|      A1|Adult| 54|   2|
|      A5|Adult| 54|   3|
|      A2|Adult| 53|   4|
|      A3|Adult| 52|   5|
+--------+-----+---+----+

+--------+-----+---+----+
|person_c| type|age|rn_c|
+--------+-----+---+----+
|      C4|Child| 15|   1|
|      C2|Child| 19|   2|
|      C1|Child| 20|   3|
|      C3|Child| 22|   4|
+--------+-----+---+----+



In [0]:
# Pair adults and children whose age ranks match. The left join keeps every
# adult; an adult with no matching child rank gets nulls in the child columns.
# Sort by the adult rank so rows are displayed oldest adult first, then drop
# the helper rank columns so only each side's person/type/age remain.
# (The original selected only rn_a/rn_c, which hid the actual pairing.)
adult_df.join(child_df, adult_df["rn_a"]==child_df["rn_c"], "left") \
    .orderBy("rn_a") \
    .drop("rn_a", "rn_c") \
    .show()

+--------+-----+---+--------+-----+----+
|person_a| type|age|person_c| type| age|
+--------+-----+---+--------+-----+----+
|      A4|Adult| 58|      C4|Child|  15|
|      A1|Adult| 54|      C2|Child|  19|
|      A5|Adult| 54|      C1|Child|  20|
|      A2|Adult| 53|      C3|Child|  22|
|      A3|Adult| 52|    null| null|null|
+--------+-----+---+--------+-----+----+

