In [5]:
"""
2. Airbnb SQL Interview Question | Convert Comma Separated Values into Rows | Data Analytics
https://lnkd.in/gpMbU-dF

Find the room types that are searched the most. Display each room type with its corresponding search count.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Sample search log: one row per search. The room-type filters chosen for a
# search are packed into a single comma-separated string.
data = [
    (1, "2022-01-01", "entire home,private room"),
    (2, "2022-01-02", "entire home,shared room"),
    (3, "2022-01-02", "private room,shared room"),
    (4, "2022-01-03", "private room"),
]

# Explicit schema so column types are not inferred from the tuples.
# date_searched is deliberately kept as a plain string here.
schema = (
    StructType()
    .add("user_id", IntegerType())
    .add("date_searched", StringType())
    .add("filter_room_types", StringType())
)

df = spark.createDataFrame(data, schema)

# Show the raw rows (untruncated) followed by the resulting schema.
df.show(truncate=False)
df.printSchema()


+-------+-------------+------------------------+
|user_id|date_searched|filter_room_types       |
+-------+-------------+------------------------+
|1      |2022-01-01   |entire home,private room|
|2      |2022-01-02   |entire home,shared room |
|3      |2022-01-02   |private room,shared room|
|4      |2022-01-03   |private room            |
+-------+-------------+------------------------+

root
 |-- user_id: integer (nullable = true)
 |-- date_searched: string (nullable = true)
 |-- filter_room_types: string (nullable = true)



In [20]:
# Count how many searches mention each room type.
#
# filter_room_types holds a comma-separated list, so it is split and exploded
# into one row per room type before grouping.
(
    df.withColumn("room_types", explode(split(col("filter_room_types"), ",")))
    # Normalise tokens so "a, b" and "a,b" land in the same group. This is a
    # separate step because a generator (explode) cannot be nested inside
    # another expression.
    .withColumn("room_types", trim(col("room_types")))
    # Drop empty tokens produced by stray/trailing commas or empty strings.
    .filter(col("room_types") != "")
    .groupBy("room_types")
    .agg(count("room_types").alias("cnt"))
    # Most-searched first. groupBy gives no ordering guarantee, so sort
    # explicitly and break ties alphabetically to keep the output stable.
    .orderBy(col("cnt").desc(), col("room_types").asc())
    .show(truncate=False)
)

+------------+---+
|room_types  |cnt|
+------------+---+
|private room|3  |
|entire home |2  |
|shared room |2  |
+------------+---+

