In [0]:
"""
Write a query to return list of teams.

Teams are formed using the following rules:
1. Team members must live in the city they represent.
2. For each city, create teams of 3 until there are fewer than 3 who are unassigned.
3. When fewer than 3 people remain unassigned in a city, they form one final (smaller) team.

Report requirements:
1. City should be ordered alphabetically
2. Players must be selected in the order that they appear
3. Their names should appear alphabetically ordered within the comma separated list
4. The team_name column should concatenate "Team" with the row number at which the team appears in the output.

Input
+--------+-------------+
|emp_name|         city|
+--------+-------------+
|     Sam|     New York|
|   David|     New York|
|   Peter|     New York|
|   Chris|     New York|
|    John|     New York|
|   Steve|San Francisco|
|  Rachel|San Francisco|
|  Robert|  Los Angeles|
+--------+-------------+

Output
+-------------+---------------+---------+
|         city|           team|team_name|
+-------------+---------------+---------+
|  Los Angeles|         Robert|    Team1|
|     New York|David,Peter,Sam|    Team2|
|     New York|     Chris,John|    Team3|
|San Francisco|   Rachel,Steve|    Team4|
+-------------+---------------+---------+
"""

from pyspark.sql import functions as F

# Sample input: one (employee name, home city) pair per row, listed in the
# same order as the "Input" table in the problem statement.
emp_rows = [
    ("Sam", "New York"),
    ("David", "New York"),
    ("Peter", "New York"),
    ("Chris", "New York"),
    ("John", "New York"),
    ("Steve", "San Francisco"),
    ("Rachel", "San Francisco"),
    ("Robert", "Los Angeles"),
]
emp_columns = ["emp_name", "city"]

# Build the DataFrame from the in-memory rows and print it for reference.
emp_details_df = spark.createDataFrame(emp_rows, schema=emp_columns)
emp_details_df.show()


+--------+-------------+
|emp_name|         city|
+--------+-------------+
|     Sam|     New York|
|   David|     New York|
|   Peter|     New York|
|   Chris|     New York|
|    John|     New York|
|   Steve|San Francisco|
|  Rachel|San Francisco|
|  Robert|  Los Angeles|
+--------+-------------+



In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import *

# DataFrame-API solution: split each city's employees into teams of up to 3.
#
# Steps:
#   1. Tag every row with a sequence id that reflects its current position, so
#      "order of appearance" is an explicit column rather than an accident of
#      the physical plan. Ordering the window by `city` while also partitioning
#      by `city` makes every row a tie, and Spark may then number them in any
#      order.
#      NOTE(review): monotonically_increasing_id() follows the row order of the
#      DataFrame as it stands here; this assumes emp_details_df has not been
#      shuffled since it was created -- confirm for other data sources.
#   2. Number employees within their city by that sequence id.
#   3. Bucket the numbering into groups of 3 (ceil(rn / 3)): rn 1-3 -> 1, 4-6 -> 2, ...
#   4. Collapse each (city, group) into a comma-separated, alphabetically
#      sorted list of names.
#   5. Number the teams by (city, city_grp) so teams from the same city get a
#      deterministic "TeamN" label, and sort the report the same way.
appearance_window = Window.partitionBy("city").orderBy("appearance_seq")
report_window = Window.orderBy("city", "city_grp")

(
    emp_details_df
    .withColumn("appearance_seq", F.monotonically_increasing_id())
    .withColumn("rn", F.row_number().over(appearance_window))
    .withColumn("city_grp", F.ceil(F.col("rn") / 3.0))
    .groupBy("city", "city_grp")
    .agg(F.concat_ws(",", F.sort_array(F.collect_list("emp_name"))).alias("team"))
    .withColumn("team_name", F.concat(F.lit("Team"), F.row_number().over(report_window)))
    # Sort explicitly: a window's ORDER BY does not guarantee the display order.
    .orderBy("city", "city_grp")
    .drop("city_grp")
    .show()
)
    



+-------------+---------------+---------+
|         city|           team|team_name|
+-------------+---------------+---------+
|  Los Angeles|         Robert|    Team1|
|     New York|David,Peter,Sam|    Team2|
|     New York|     Chris,John|    Team3|
|San Francisco|   Rachel,Steve|    Team4|
+-------------+---------------+---------+



In [0]:

# Spark SQL solution: same logic as the DataFrame-API version, expressed as CTEs.
# Register the DataFrame so it can be queried by name from SQL.
emp_details_df.createOrReplaceTempView("emp_details")

# `order by city` inside a window partitioned by city ties every row, so the
# original numbering was arbitrary. The `seq` CTE captures the row order
# explicitly and later steps order by it.
# NOTE(review): monotonically_increasing_id() follows the row order of the view
# as it stands here; this assumes the underlying DataFrame has not been
# shuffled since it was created -- confirm for other data sources.
spark.sql("""
          with seq as (
            -- capture order of appearance as an explicit column
            select
                *, monotonically_increasing_id() as appearance_seq
            from emp_details
          ), cte as (
            -- number employees within each city in order of appearance
            select
                *, row_number() over(partition by city order by appearance_seq) as rn
            from seq
          ), cte2 as (
            -- bucket into groups of 3: rn 1-3 -> 1, 4-6 -> 2, ...
            select
                *, ceiling(rn/3.0) as city_grp
            from cte
          ), cte3 as (
            -- one row per team, names sorted alphabetically
            select
                city, city_grp, string_agg(emp_name, ',') within group(order by emp_name) as team
            from cte2
            group by city, city_grp
          )
          select
            city, team, concat('Team', row_number() over(order by city, city_grp)) as team_name
          from cte3
          order by city, city_grp
          """).show()



+-------------+---------------+---------+
|         city|           team|team_name|
+-------------+---------------+---------+
|  Los Angeles|         Robert|    Team1|
|     New York|David,Peter,Sam|    Team2|
|     New York|     Chris,John|    Team3|
|San Francisco|   Rachel,Steve|    Team4|
+-------------+---------------+---------+

