In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf , collect_list, explode,col,expr,posexplode,broadcast
from pyspark.sql.functions import posexplode

In [7]:
# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("array-column-join")
    .master("local[*]")
    .getOrCreate()
)

In [8]:
# Sample trips: each row carries an ordered array of internal flight ids.
trip_rows = [
    ("PMI", "OPO", [2, 1]),
    ("ATH", "BCN", [3]),
    ("JFK", "MAD", [5, 4, 6]),
    ("HND", "LAX", [8, 9, 7, 0]),
]
trip_columns = ["origin", "destination", "internal_flight_ids"]
tripDf = spark.createDataFrame(trip_rows, trip_columns)

# One output row per array element; `pos` keeps the element's index in the
# original array so the ordering can be recovered later.
tripExploedDF = tripDf.select(
    "*",
    posexplode("internal_flight_ids").alias("pos", "internal_flight_id"),
)
tripExploedDF.show(truncate=False)

+------+-----------+-------------------+---+------------------+
|origin|destination|internal_flight_ids|pos|internal_flight_id|
+------+-----------+-------------------+---+------------------+
|PMI   |OPO        |[2, 1]             |0  |2                 |
|PMI   |OPO        |[2, 1]             |1  |1                 |
|ATH   |BCN        |[3]                |0  |3                 |
|JFK   |MAD        |[5, 4, 6]          |0  |5                 |
|JFK   |MAD        |[5, 4, 6]          |1  |4                 |
|JFK   |MAD        |[5, 4, 6]          |2  |6                 |
|HND   |LAX        |[8, 9, 7, 0]       |0  |8                 |
|HND   |LAX        |[8, 9, 7, 0]       |1  |9                 |
|HND   |LAX        |[8, 9, 7, 0]       |2  |7                 |
|HND   |LAX        |[8, 9, 7, 0]       |3  |0                 |
+------+-----------+-------------------+---+------------------+



In [9]:
# Lookup table mapping each internal flight id to its public flight number.
flight_rows = [
    (0, 'FR5763'),
    (1, 'UT9586'),
    (2, 'B4325'),
    (3, 'RW35675'),
    (4, 'LP656'),
    (5, 'NB4321'),
    (6, 'CX4599'),
    (7, 'AZ8844'),
    (8, 'KH8851'),
    (9, 'OP8777'),
]
flight_columns = ["internal_flight_id", "public_flight_number"]
flightDF = spark.createDataFrame(flight_rows, schema=flight_columns)

In [10]:
# Attach the public flight number to every exploded (trip, position) row.
# This is an inner join, so an internal_flight_id with no match in flightDF
# is dropped from the result.
flightTripDF = tripExploedDF\
    .join(flightDF,on="internal_flight_id")\
    .orderBy("origin","destination", "pos")

# Rebuild one array of public flight numbers per trip, in the same order as
# the original internal_flight_ids array.
#
# collect_list() gives no ordering guarantee once groupBy shuffles the rows,
# so the orderBy above cannot be relied on to fix the element order. Instead,
# collect (pos, public_flight_number) structs, sort them (structs compare
# field by field, so `pos` decides the order), and then project the flight
# number back out of the sorted array of structs.
flightTripDF.groupBy("origin","destination")\
    .agg(expr("sort_array(collect_list(struct(pos, public_flight_number)))").alias("ordered_flights"))\
    .select("origin","destination",
            col("ordered_flights.public_flight_number").alias("internal_flight_list"))\
    .show(truncate=False)

+------+-----------+--------------------------------+
|origin|destination|internal_flight_list            |
+------+-----------+--------------------------------+
|PMI   |OPO        |[B4325, UT9586]                 |
|JFK   |MAD        |[NB4321, LP656, CX4599]         |
|HND   |LAX        |[KH8851, OP8777, AZ8844, FR5763]|
|ATH   |BCN        |[RW35675]                       |
+------+-----------+--------------------------------+

