In [0]:
"""
Return a list of movie genres and the best movie in that genre based on max avg rating
"""
from pyspark.sql.functions import *
from pyspark.sql.window import Window

movies = spark.createDataFrame(
[
  (1, 'Action', 'The Dark Knight'), (2, 'Action', 'Avengers: Infinity War'), (3, 'Action', 'Gladiator'), 
  (4, 'Action', 'Die Hard'), (5, 'Action', 'Mad Max: Fury Road'), (6, 'Drama', 'The Shawshank Redemption'),
  (7, 'Drama', 'Forrest Gump'), (8, 'Drama', 'The Godfather'), (9, 'Drama', 'Schindler''s List'),
  (10, 'Drama', 'Fight Club'), (11, 'Comedy', 'The Hangover'), (12, 'Comedy', 'Superbad'),
  (13, 'Comedy', 'Dumb and Dumber'), (14, 'Comedy', 'Bridesmaids'), (15, 'Comedy', 'Anchorman: The Legend of Ron Burgundy')
], ["id", "genre", "title"]
)

reviews = spark.createDataFrame([
(1, 4.5), (1, 4.0), (1, 5.0), (2, 4.2), (2, 4.8), (2, 3.9), (3, 4.6), (3, 3.8), (3, 4.3), (4, 4.1), (4, 3.7), (4, 4.4), (5, 3.9), (5, 4.5), (5, 4.2), (6, 4.8), (6, 4.7), (6, 4.9), (7, 4.6), (7, 4.9), (7, 4.3), (8, 4.9), (8, 5.0), (8, 4.8), (9, 4.7), (9, 4.9), (9, 4.5), (10, 4.6), (10, 4.3), (10, 4.7), (11, 3.9), (11, 4.0), 
(11, 3.5), (12, 3.7), (12, 3.8), (12, 4.2), (13, 3.2), (13, 3.5), (13, 3.8), (14, 3.8), (14, 4.0), 
(14, 4.2), (15, 3.9), (15, 4.0), (15, 4.1)
], ["movie_id", "rating"])

movies.show(truncate=False)
reviews.show()

+---+------+-------------------------------------+
|id |genre |title                                |
+---+------+-------------------------------------+
|1  |Action|The Dark Knight                      |
|2  |Action|Avengers: Infinity War               |
|3  |Action|Gladiator                            |
|4  |Action|Die Hard                             |
|5  |Action|Mad Max: Fury Road                   |
|6  |Drama |The Shawshank Redemption             |
|7  |Drama |Forrest Gump                         |
|8  |Drama |The Godfather                        |
|9  |Drama |Schindlers List                      |
|10 |Drama |Fight Club                           |
|11 |Comedy|The Hangover                         |
|12 |Comedy|Superbad                             |
|13 |Comedy|Dumb and Dumber                      |
|14 |Comedy|Bridesmaids                          |
|15 |Comedy|Anchorman: The Legend of Ron Burgundy|
+---+------+-------------------------------------+

+--------+------+
|movie_id|ra

In [0]:
movies.join(reviews, movies["id"] == reviews["movie_id"], "inner") \
    .select(movies["genre"], movies["title"], reviews["rating"]) \
    .groupBy("genre", "title").agg(avg("rating").alias("avg_rating")) \
    .withColumn("rn", row_number().over(Window.partitionBy("genre").orderBy(desc(col("avg_rating"))))) \
    .filter(col("rn") == 1) \
    .drop("rn") \
    .withColumn("avg_rating", round(col("avg_rating"), 0)) \
    .show(truncate=False)

"""
.withColumn("stars", repeat(lit("*"), col("avg_rating"))) \
repeat is pyspark equivalent of replicate in ssms, repeating * with col "avg_rating" is not working  
"""

+------+-------------------------------------+----------+
|genre |title                                |avg_rating|
+------+-------------------------------------+----------+
|Action|The Dark Knight                      |5.0       |
|Comedy|Anchorman: The Legend of Ron Burgundy|4.0       |
|Drama |The Godfather                        |5.0       |
+------+-------------------------------------+----------+

Out[31]: '\n.withColumn("stars", repeat(lit("*"), col("avg_rating"))) repeat is pyspark equivalent of replicate in ssms, repeating * with col "avg_rating" is not working  \n'