In [0]:
"""
https://www.youtube.com/watch?v=TLE-xW9rZfQ&list=PLBTZqjSKn0IfuIqbMIqzS-waofsPHMS0E&index=8

Output: profile_id 1, 3, 4, 5

Problem statement: The LinkedIn creator team is looking for power creators who use their personal profile as a company or influencer page. If someone's LinkedIn page has more followers than the company they work for, we can safely assume that person is a power creator. Write a query to return the IDs of these LinkedIn power creators. If a profile is linked to more than one company (multiple company_id entries), compare its followers against the maximum company_followers among those companies.
"""

from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# --- Sample data -------------------------------------------------------------
# Personal profiles: one row per member with that member's follower count.
personal_profile = spark.createDataFrame(
    data=[
        (1, 'Nick Singh', 92000),
        (2, 'Zach Wilson', 199000),
        (3, 'Daliana Liu', 171000),
        (4, 'Ravit Jain', 107000),
        (5, 'Vin Vashishta', 139000),
        (6, 'Susan W', 39000),
    ],
    schema=["profile_id", "profile_name", "profile_followers"],
)

# Profile -> company links. Profile 1 appears twice (companies 4 and 9),
# which is the "multiple company_id entries" case from the problem statement.
employee_company = spark.createDataFrame(
    data=[
        (1, 4),
        (1, 9),
        (2, 2),
        (3, 1),
        (4, 3),
        (5, 6),
        (6, 5),
    ],
    schema=["personal_profile_id", "company_id"],
)

# Company pages with their follower counts, keyed by c_company_id.
company_pages = spark.createDataFrame(
    data=[
        (1, 'The Data Science Podcast', 8000),
        (2, 'Airbnb', 700000),
        (3, 'The Ravit Show', 6000),
        (4, 'DataLemur', 200),
        (5, 'YouTube', 16000000),
        (6, 'DataScience.Vin', 4500),
        (9, 'Ace the data science interview', 4479),
    ],
    schema=["c_company_id", "company_name", "company_followers"],
)

# Preview the three inputs; company names are long, so skip truncation there.
personal_profile.show()
employee_company.show()
company_pages.show(truncate=False)

+----------+-------------+-----------------+
|profile_id| profile_name|profile_followers|
+----------+-------------+-----------------+
|         1|   Nick Singh|            92000|
|         2|  Zach Wilson|           199000|
|         3|  Daliana Liu|           171000|
|         4|   Ravit Jain|           107000|
|         5|Vin Vashishta|           139000|
|         6|      Susan W|            39000|
+----------+-------------+-----------------+

+-------------------+----------+
|personal_profile_id|company_id|
+-------------------+----------+
|                  1|         4|
|                  1|         9|
|                  2|         2|
|                  3|         1|
|                  4|         3|
|                  5|         6|
|                  6|         5|
+-------------------+----------+

+------------+------------------------------+-----------------+
|c_company_id|company_name                  |company_followers|
+------------+------------------------------+------------

In [0]:
# Power creators: profiles whose own follower count is greater than the
# largest follower count among the companies they are linked to.

# Attach every linked company's follower count to each profile. Inner joins
# drop profiles with no company link and links to unknown companies.
profile_company_followers = (
    personal_profile
    .join(employee_company, personal_profile["profile_id"] == employee_company["personal_profile_id"], "inner")
    .join(company_pages, employee_company["company_id"] == company_pages["c_company_id"], "inner")
)

power_creators = (
    profile_company_followers
    # Collapse to one row per profile, keeping the biggest linked company.
    # profile_followers is carried through the grouping key so it can be
    # compared after aggregation.
    .groupBy("profile_id", "profile_followers")
    # F.max is the Spark aggregate; qualifying it avoids relying on the
    # wildcard import shadowing Python's builtin max.
    .agg(F.max("company_followers").alias("max_company_followers"))
    .filter(F.col("profile_followers") > F.col("max_company_followers"))
    .select("profile_id")
    # Guard against a profile_id appearing with several follower counts.
    .distinct()
    # Without an explicit sort the displayed row order is not guaranteed.
    .orderBy("profile_id")
)

power_creators.show(truncate=False)

+----------+
|profile_id|
+----------+
|1         |
|3         |
|4         |
|5         |
+----------+

