In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
from datetime import date

# Create (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("CustomerProductOrder")
    .getOrCreate()
)

# Schema of the flight-leg table: four string columns, one row per leg.
fligt_schema = StructType([
    StructField(column_name, StringType())
    for column_name in ("Flightnumber", "FlightName", "Origin_City", "Destination_City")
])

# Sample legs as (flight number, airline name, origin city, destination city).
# A multi-leg flight appears once per leg; there is no leg-sequence column.
flight_date = [
    ("AIR100", "AirIndia", "Delhi", "Hyderabad"),
    ("AIR100", "AirIndia", "Hyderabad", "Banglore"),
    ("INDIGO50", "Indigo", "Indore", "Udaipur"),
    ("JET1700", "Jet Airways", "Mumbai", "Agra"),
    ("JET1700", "Jet Airways", "Agra", "Kolkata"),
    ("JET1700", "Jet Airways", "Kolkata", "Pune"),
    ("JET1700", "Jet Airways", "Pune", "Hyderabad"),
]

flight_df = spark.createDataFrame(flight_date, fligt_schema)
flight_df.show()

# Register the DataFrame as the temp view "flights" so the %sql cells can query it.
flight_df.createOrReplaceTempView("flights")


In [0]:
from pyspark.sql.functions import last,first
# Find each flight's overall origin and final destination from the route
# topology rather than from row order. first()/last() after groupBy() are
# non-deterministic in Spark (their result depends on row order, which a
# preceding orderBy() does not guarantee once the data is shuffled), and every
# leg of a flight ties on 'Flightnumber', so there is nothing to sort the legs by.
leg_origins = flight_df.select(
    'Flightnumber', flight_df['Origin_City'].alias('city')
).distinct()
leg_destinations = flight_df.select(
    'Flightnumber', flight_df['Destination_City'].alias('city')
).distinct()

# Journey start: a city the flight departs from but never arrives at.
journey_start = (
    leg_origins
    .join(leg_destinations, on=['Flightnumber', 'city'], how='left_anti')
    .withColumnRenamed('city', 'origin')
)

# Journey end: a city the flight arrives at but never departs from.
journey_end = (
    leg_destinations
    .join(leg_origins, on=['Flightnumber', 'city'], how='left_anti')
    .withColumnRenamed('city', 'destination')
)

# NOTE: assumes the legs of one flight form a single chain (A->B, B->C, ...).
# A circular route has no start/end city and is dropped by the inner join.
final_df = journey_start.join(journey_end, on='Flightnumber', how='inner')
display(final_df)

In [0]:
%sql

-- Show every leg with a global row number (rn) plus its ascending and
-- descending position inside its own flight.
-- Caveat: rn is ordered by Flightnumber only, so legs of the same flight tie
-- and their relative order is whatever the engine happens to produce.
WITH numbered AS (
    SELECT f.*,
           ROW_NUMBER() OVER (ORDER BY f.Flightnumber) AS rn
    FROM flights AS f
)
SELECT n.*,
       ROW_NUMBER() OVER (PARTITION BY n.Flightnumber ORDER BY n.rn ASC)  AS row_num_asc,
       ROW_NUMBER() OVER (PARTITION BY n.Flightnumber ORDER BY n.rn DESC) AS row_num_desc
FROM numbered AS n


In [0]:
%sql
-- Overall origin and final destination per flight, derived from the route
-- itself instead of from ROW_NUMBER(): all legs of a flight tie on
-- Flightnumber, so a row number ordered by it picks an arbitrary "first" and
-- "last" leg (and an unpartitioned window pulls all rows into one partition).
-- Assumes the legs of a flight form a single chain (A->B, B->C, ...); a
-- circular route has no start/end city and produces no row.
WITH journey_start AS (
    -- A leg whose origin is never a destination of the same flight.
    SELECT DISTINCT o.Flightnumber, o.FlightName, o.Origin_City
    FROM flights o
    LEFT ANTI JOIN flights d
        ON  o.Flightnumber = d.Flightnumber
        AND o.Origin_City  = d.Destination_City
),
journey_end AS (
    -- A leg whose destination is never an origin of the same flight.
    SELECT DISTINCT d.Flightnumber, d.Destination_City
    FROM flights d
    LEFT ANTI JOIN flights o
        ON  d.Flightnumber     = o.Flightnumber
        AND d.Destination_City = o.Origin_City
)
SELECT DISTINCT
    s.Flightnumber,
    s.FlightName,
    s.Origin_city,
    e.Destination_city
FROM journey_start s
JOIN journey_end e
    ON s.Flightnumber = e.Flightnumber


In [0]:
%sql
-- Same result as the anti-join formulation, written with NOT EXISTS.
-- The legs carry no sequence column and all legs of a flight tie on
-- Flightnumber, so ROW_NUMBER() ordered by it cannot identify the first or
-- last leg reliably; the start/end cities are found from the route instead.
-- Assumes the legs of a flight form a single chain (A->B, B->C, ...); a
-- circular route has no start/end city and produces no row.
WITH journey_start AS (
    -- Origin city that the same flight never arrives at.
    SELECT DISTINCT f.Flightnumber, f.FlightName, f.Origin_City
    FROM flights f
    WHERE NOT EXISTS (
        SELECT 1
        FROM flights prev_leg
        WHERE prev_leg.Flightnumber     = f.Flightnumber
          AND prev_leg.Destination_City = f.Origin_City
    )
),
journey_end AS (
    -- Destination city that the same flight never departs from.
    SELECT DISTINCT f.Flightnumber, f.Destination_City
    FROM flights f
    WHERE NOT EXISTS (
        SELECT 1
        FROM flights next_leg
        WHERE next_leg.Flightnumber = f.Flightnumber
          AND next_leg.Origin_City  = f.Destination_City
    )
)
SELECT DISTINCT
    s.Flightnumber,
    s.FlightName,
    s.Origin_city,
    e.Destination_city
FROM journey_start s
JOIN journey_end e
    ON s.Flightnumber = e.Flightnumber


In [0]:
# Sample itinerary legs as (customer id, flight id, origin, destination).
# The flight ids are labels only: customer 1's legs are not listed in
# flight-id order.
flights_data = [
    (1, 'Flight2', 'Los Angeles', 'London'),
    (1, 'Flight1', 'London', 'Vatican'),
    (1, 'Flight3', 'Vatican', 'Nantes'),
    (2, 'Flight1', 'Houston', 'Paris'),
    (2, 'Flight2', 'Paris', 'Nice'),
]

# DDL-style schema string understood by createDataFrame.
schema = "cust_id int, flight_id string , origin string , destination string"

flights_df = spark.createDataFrame(data=flights_data, schema=schema)
display(flights_df)

from pyspark.sql.functions import last,first
# Find each customer's overall origin and final destination from the route
# topology. Sorting by 'flight_id' is wrong for this data (customer 1 starts on
# 'Flight2' from Los Angeles, yet 'Flight1' sorts first), and first()/last()
# after groupBy() are non-deterministic in Spark anyway because a preceding
# orderBy() is not guaranteed to survive the shuffle.
leg_origins = flights_df.select(
    'cust_id', flights_df['origin'].alias('city')
).distinct()
leg_destinations = flights_df.select(
    'cust_id', flights_df['destination'].alias('city')
).distinct()

# Journey start: a city the customer departs from but never arrives at.
journey_start = (
    leg_origins
    .join(leg_destinations, on=['cust_id', 'city'], how='left_anti')
    .withColumnRenamed('city', 'origin')
)

# Journey end: a city the customer arrives at but never departs from.
journey_end = (
    leg_destinations
    .join(leg_origins, on=['cust_id', 'city'], how='left_anti')
    .withColumnRenamed('city', 'destination')
)

# NOTE: assumes one customer's legs form a single chain (A->B, B->C, ...).
# A round trip has no start/end city and is dropped by the inner join.
final_df = journey_start.join(journey_end, on='cust_id', how='inner')
display(final_df)