In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, row_number, year, sum, max, when, expr
from pyspark.sql.window import Window


# Load the three managed Delta tables from the workspace catalog.
df_pin = spark.read.table("workspace.default.df_pin")
df_user = spark.read.table("workspace.default.df_user")
df_geo = spark.read.table("workspace.default.df_geo")

# Build the combined frame in a single chain:
#   1. full outer join of the three tables on the shared 'ind' key,
#   2. discard every row that holds a null in any column (this also removes
#      rows that found no partner in one of the joins),
#   3. sort the survivors by 'ind'.
df_combined = (
    df_pin
    .join(df_user, on="ind", how="fullouter")
    .join(df_geo, on="ind", how="fullouter")
    .dropna(how="any")
    .orderBy("ind")
)

# Render the combined DataFrame in the notebook.
display(df_combined)

In [0]:
#Task 4
# Most popular category in each country.

# Obtain the Spark session; getOrCreate() hands back the running session when
# one already exists.
spark = SparkSession.builder.appName("PopularCategory").getOrCreate()

# Number of rows in df_combined for every (country, category) pair.
df_category_count = df_combined.groupBy("country", "category").agg(count("*").alias("category_count"))

# Rank categories inside each country, highest count first.
# row_number() assigns consecutive numbers even to tied rows, so without a
# secondary sort key the winner among equally popular categories would be
# arbitrary and could differ between runs. Ordering ties by category name
# makes the result reproducible.
window_spec = Window.partitionBy("country").orderBy(
    col("category_count").desc(),
    col("category").asc(),
)

# Keep only the top-ranked category per country; the helper column is dropped.
df_most_popular_category_by_country = (
    df_category_count
    .withColumn("rank", row_number().over(window_spec))
    .filter(col("rank") == 1)
    .drop("rank")
)

# Show the result
display(df_most_popular_category_by_country)

In [0]:
#Task 5
# Most popular category in each posting year.

# Add 'post_year': the calendar year extracted from the 'timestamp' column.
# NOTE: this rebinds df_combined, so every later cell sees the extra column.
df_combined = df_combined.withColumn("post_year", year(col("timestamp")))

# Number of rows for every (post_year, category) pair.
df_category_count = df_combined.groupBy("post_year", "category").agg(count("*").alias("category_count"))

# Rank categories inside each year, highest count first.
# The category name is a secondary sort key so that ties on category_count are
# resolved the same way on every run (row_number() alone would pick an
# arbitrary row among ties).
window_spec = Window.partitionBy("post_year").orderBy(
    col("category_count").desc(),
    col("category").asc(),
)

# Attach the rank to each (post_year, category) row.
df_ranked = df_category_count.withColumn("rank", row_number().over(window_spec))

# Keep only the top-ranked category per year.
df_most_popular_category_by_year = df_ranked.filter(col("rank") == 1).drop("rank")

display(df_most_popular_category_by_year)

In [0]:
#Task 6
#Step 1
#Find the user with most followers in each country

# One row per (country, poster_name) carrying that poster's follower count.
# df_combined can contain several rows for the same poster, so summing
# follower_count would add the same audience once per row and inflate posters
# with many rows. Taking the maximum yields the poster's (largest observed)
# follower count instead.
# `max` here is pyspark.sql.functions.max from the notebook's import cell.
# Assumes follower_count is stored as a numeric column - TODO confirm.
# The 'total_followers' column name is kept because Step 2 reads it.
df_followers = df_combined.groupBy("country", "poster_name").agg(
    max("follower_count").alias("total_followers")
)

# Rank posters inside each country, most followers first. poster_name breaks
# ties so the selected row is the same on every run.
window_spec = Window.partitionBy("country").orderBy(
    col("total_followers").desc(),
    col("poster_name").asc(),
)

# Attach the rank to each (country, poster_name) row.
df_ranked = df_followers.withColumn("rank", row_number().over(window_spec))

# Keep only the top-ranked poster per country.
df_top_users_per_country = df_ranked.filter(col("rank") == 1).drop("rank")

display(df_top_users_per_country)

In [0]:
#Task 6
#Step 2
# Country whose top user has the highest follower count.

# df_top_users_per_country holds one row per country, so the answer is simply
# the row with the largest total_followers. Sorting and taking the first row
# avoids a window without partitionBy, which would pull every row into a
# single partition just to number them. 'country' is a secondary sort key so
# ties are resolved the same way on every run.
df_top_country = (
    df_top_users_per_country
    .orderBy(col("total_followers").desc(), col("country").asc())
    .limit(1)
    .select("country", "total_followers")
)

display(df_top_country)

In [0]:
#Task 7
# Most popular category in each age group.

# Label every row with an age bucket. Branches are evaluated top to bottom:
#   age <= 24        -> "18-24"  (any age of 24 or below lands here)
#   24 < age <= 35   -> "25-35"
#   35 < age <= 50   -> "36-50"
#   everything else  -> "50+"
# NOTE: this rebinds df_combined, so later cells see the 'age_group' column.
df_combined = df_combined.withColumn(
    "age_group",
    when(col("age") <= 24, "18-24")
    .when((col("age") > 24) & (col("age") <= 35), "25-35")
    .when((col("age") > 35) & (col("age") <= 50), "36-50")
    .otherwise("50+")
)

# Number of rows for every (age_group, category) pair.
df_category_count = df_combined.groupBy("age_group", "category").agg(count("*").alias("category_count"))

# Rank categories inside each age group, highest count first. The category
# name is a secondary sort key so that ties on category_count are resolved the
# same way on every run (row_number() alone would pick an arbitrary row).
window_spec = Window.partitionBy("age_group").orderBy(
    col("category_count").desc(),
    col("category").asc(),
)

# Attach the rank to each (age_group, category) row.
df_ranked = df_category_count.withColumn("rank", row_number().over(window_spec))

# Keep only the top-ranked category per age group.
df_most_popular_category_by_age_group = df_ranked.filter(col("rank") == 1).drop("rank")

# Show the result
df_most_popular_category_by_age_group.show()

In [0]:
#Task 8
# Approximate median follower count for each age group.

# Age bucket expression. The branches are tested in order, so each one only
# needs its upper bound; whatever matches no branch becomes "50+".
age_bucket = (
    when(col("age") <= 24, "18-24")
    .when(col("age") <= 35, "25-35")
    .when(col("age") <= 50, "36-50")
    .otherwise("50+")
)

# Derived frame carrying the 'age_group' label (df_combined is not rebound here).
df_age_grouped = df_combined.withColumn("age_group", age_bucket)

# Median via the SQL aggregate percentile_approx at the 0.5 quantile.
median_followers = expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count")

# Explicit position of each bucket label, used only as the sort key.
bucket_position = (
    when(col("age_group") == "18-24", 1)
    .when(col("age_group") == "25-35", 2)
    .when(col("age_group") == "36-50", 3)
    .when(col("age_group") == "50+", 4)
)

# Aggregate per bucket and order the rows youngest bucket first.
df_median_followers = (
    df_age_grouped
    .groupBy("age_group")
    .agg(median_followers)
    .orderBy(bucket_position)
)

# Print the ordered result.
df_median_followers.show()

In [0]:
#Task 9
# Number of joined rows per joining year, 2015 through 2020.

# 'post_year' holds the calendar year taken from 'date_joined'.
df_with_year = df_combined.withColumn("post_year", year(col("date_joined")))

# Restrict to joining years 2015..2020 (both ends included).
df_filtered = df_with_year.filter(col("post_year").between(2015, 2020))

# Row count per year, sorted by year ascending.
# NOTE(review): count("*") counts rows of df_combined, not distinct users -
# confirm that each user appears only once if a user count is intended.
df_users_per_year = (
    df_filtered
    .groupBy("post_year")
    .agg(count("*").alias("number_users_joined"))
    .orderBy("post_year")
)

# Print the result.
df_users_per_year.show()

In [0]:
#Task 10
# Approximate median follower count per joining year, 2015 through 2020.

# 'post_year' holds the calendar year taken from 'date_joined'.
joined_year = year(col("date_joined"))
df_with_year = df_combined.withColumn("post_year", joined_year)

# Restrict to joining years 2015..2020 (both ends included).
df_filtered = df_with_year.filter(col("post_year").between(2015, 2020))

# Median via the SQL aggregate percentile_approx at the 0.5 quantile.
median_followers = expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count")

# One row per year, sorted by year ascending.
df_median_followers = (
    df_filtered
    .groupBy("post_year")
    .agg(median_followers)
    .orderBy("post_year")
)

# Print the result.
df_median_followers.show()

In [0]:

# Approximate median follower count per (joining year, age group), 2015 through 2020.

# 'post_year' holds the calendar year taken from 'date_joined'.
df_with_year = df_combined.withColumn("post_year", year(col("date_joined")))

# Age bucket expression. The branches are tested in order, so each one only
# needs its upper bound; whatever matches no branch becomes "50+".
age_bucket = (
    when(col("age") <= 24, "18-24")
    .when(col("age") <= 35, "25-35")
    .when(col("age") <= 50, "36-50")
    .otherwise("50+")
)
df_with_age_group = df_with_year.withColumn("age_group", age_bucket)

# Restrict to joining years 2015..2020 (both ends included).
df_filtered = df_with_age_group.filter(col("post_year").between(2015, 2020))

# Median via the SQL aggregate percentile_approx at the 0.5 quantile.
median_followers = expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count")

# One row per (post_year, age_group), sorted by year and then by the
# age_group label.
df_median_followers_by_age_group = (
    df_filtered
    .groupBy("post_year", "age_group")
    .agg(median_followers)
    .orderBy("post_year", "age_group")
)

# Print the result.
df_median_followers_by_age_group.show()