In [0]:
from pyspark.sql import SparkSession
import pandas as pd

# Load the three managed Delta tables as Spark DataFrames
# (`spark` is the session object supplied by the notebook runtime).
df_pin_spark, df_user_spark, df_geo_spark = [
    spark.read.table(table_name)
    for table_name in (
        "workspace.default.df_pinterest",
        "workspace.default.df_user",
        "workspace.default.df_geo",
    )
]

# Materialise each Spark DataFrame as a pandas DataFrame.
df_pin, df_user, df_geo = (
    df_pin_spark.toPandas(),
    df_user_spark.toPandas(),
    df_geo_spark.toPandas(),
)

# Place the three frames side by side. Every frame gets a fresh 0..n-1 index
# first, so rows are paired purely by position.
# NOTE(review): assumes the three tables return their rows in matching order
# — TODO confirm, or join on a shared key instead.
df_combined = pd.concat(
    [frame.reset_index(drop=True) for frame in (df_pin, df_user, df_geo)],
    axis=1,
)


In [0]:
# Lift pandas' row-display limit so rendered frames are never truncated
pd.options.display.max_rows = None

In [0]:
# Render the combined frame as this cell's output.
# NOTE(review): an earlier cell sets display.max_rows to None, so every row is
# rendered here — consider df_combined.head() if the tables are large.
df_combined

In [0]:
# Task 4: most popular category in each country
# Tally the rows for every (country, category) pair.
df_category_count = (
    df_combined
    .groupby(['country', 'category'])
    .size()
    .reset_index(name='category_count')
)

# Row label of the largest tally inside each country
# (idxmax keeps the first label when several tallies tie).
top_row_per_country = df_category_count.groupby('country')['category_count'].idxmax()
df_most_popular_category_by_country = df_category_count.loc[top_row_per_country]

# Last expression of the cell, so the notebook renders it.
df_most_popular_category_by_country

In [0]:
# Task 5: most popular category in each posting year
# Parse the timestamp column and keep only its calendar year.
post_timestamps = pd.to_datetime(df_combined['timestamp'])
df_combined['post_year'] = post_timestamps.dt.year

# Tally the rows for every (post_year, category) pair.
df_category_count = (
    df_combined
    .groupby(['post_year', 'category'])
    .size()
    .reset_index(name='category_count')
)

# Row label of the largest tally inside each year
# (idxmax keeps the first label when several tallies tie).
top_row_per_year = df_category_count.groupby('post_year')['category_count'].idxmax()
df_most_popular_category_by_year = df_category_count.loc[top_row_per_year]

# Last expression of the cell, so the notebook renders it.
df_most_popular_category_by_year

In [0]:
# Task 6
# Step 1
# Find the user with the most followers in each country.
#
# follower_count is treated as a per-user figure repeated on every row that
# belongs to that user, so each (country, poster_name) pair is collapsed with
# max() rather than sum(): summing would multiply a user's followers by the
# number of rows they appear in.
# NOTE(review): assumes follower_count is numeric and describes the poster,
# not the individual post — TODO confirm.
df_followers = (
    df_combined
    .groupby(['country', 'poster_name'])['follower_count']
    .max()
    .reset_index()
    # Pairs with no usable follower_count would yield NaN and could break the
    # idxmax lookup below for a country made up only of such pairs.
    .dropna(subset=['follower_count'])
)

# Row label of the highest follower count inside each country.
top_row_per_country = df_followers.groupby('country')['follower_count'].idxmax()
df_top_users_per_country = df_followers.loc[top_row_per_country]

# Last expression of the cell, so the notebook renders it.
df_top_users_per_country

In [0]:
# Task 6
# Step 2
# Find the country whose top user has the highest follower count overall.
#
# Selecting with a *list* of row labels returns a one-row DataFrame directly,
# so follower_count keeps its numeric dtype. Going through a Series and
# .to_frame().T would turn both columns into object dtype.
top_row_label = df_top_users_per_country['follower_count'].idxmax()
df_top_country = df_top_users_per_country.loc[[top_row_label], ['country', 'follower_count']]

# Display the final DataFrame (single row)
df_top_country


In [0]:
# Task 7: most popular category in each age group
# Bin edges are right-inclusive: (0, 24], (24, 35], (35, 50], (50, inf].
# Note that the '18-24' label therefore covers every age in (0, 24].
bins = [0, 24, 35, 50, float('inf')]
labels = ['18-24', '25-35', '36-50', '50+']

# Attach the (categorical) age_group column to df_combined; later cells that
# group by age_group rely on this column being present.
# Ages outside the bins, and missing ages, get a NaN age_group and are
# left out of the group-bys below.
df_combined['age_group'] = pd.cut(df_combined['age'], bins=bins, labels=labels, right=True)

# Tally the rows for every (age_group, category) pair that actually occurs.
# observed=True matters because age_group is categorical: without it pandas
# expands every age_group x category combination with a count of 0, so an age
# group with no rows would still be assigned an arbitrary "most popular"
# category (and recent pandas versions warn about the implicit default).
df_category_count = (
    df_combined
    .groupby(['age_group', 'category'], observed=True)
    .size()
    .reset_index(name='category_count')
)

# Row label of the largest tally inside each age group.
top_row_per_age_group = (
    df_category_count
    .groupby('age_group', observed=True)['category_count']
    .idxmax()
)
df_most_popular_category_by_age_group = df_category_count.loc[top_row_per_age_group]

# Last expression of the cell, so the notebook renders it.
df_most_popular_category_by_age_group

In [0]:
# Task 8: median follower count per age group
# Relies on the age_group column added to df_combined by the Task 7 cell.
# age_group is categorical, so `observed` is passed explicitly instead of
# relying on the deprecated implicit default: only age groups that actually
# contain rows are reported, and the result does not depend on pandas version.
df_median_followers = (
    df_combined
    .groupby('age_group', observed=True)['follower_count']
    .median()
    .reset_index(name='median_follower_count')
)

# Last expression of the cell, so the notebook renders it.
df_median_followers

In [0]:
# Task 9: number of users who joined in each year, 2015-2020
# Parse date_joined and store its calendar year under post_year
# (this overwrites any post_year column already on df_combined).
joined_on = pd.to_datetime(df_combined['date_joined'])
df_combined['post_year'] = joined_on.dt.year

# Keep rows whose year lies in the closed interval [2015, 2020].
df_filtered = df_combined.loc[df_combined['post_year'].between(2015, 2020)]

# One row per year with the number of matching rows.
df_users_per_year = (
    df_filtered
    .groupby('post_year')
    .size()
    .reset_index(name='number_users_joined')
)

# Last expression of the cell, so the notebook renders it.
df_users_per_year

In [0]:
# Task 10: median follower count by joining year, 2015-2020
# Parse date_joined and store its calendar year under post_year
# (this overwrites any post_year column already on df_combined).
join_dates = pd.to_datetime(df_combined['date_joined'])
df_combined['post_year'] = join_dates.dt.year

# Boolean mask for years in the closed interval [2015, 2020].
joined_in_window = df_combined['post_year'].between(2015, 2020)
df_filtered = df_combined[joined_in_window]

# Median follower_count for each remaining year.
df_median_followers = (
    df_filtered
    .groupby('post_year')['follower_count']
    .median()
    .reset_index(name='median_follower_count')
)

# Last expression of the cell, so the notebook renders it.
df_median_followers

In [0]:
# Task 11: median follower count by joining year and age group, 2015-2020
# Parse date_joined and store its calendar year under post_year
# (this overwrites any post_year column already on df_combined).
df_combined['post_year'] = pd.to_datetime(df_combined['date_joined']).dt.year

# Keep rows whose year lies in the closed interval [2015, 2020].
df_filtered = df_combined[df_combined['post_year'].between(2015, 2020)]

# Median follower_count for every (post_year, age_group) pair that occurs.
# Relies on the age_group column added to df_combined by the Task 7 cell.
# age_group is categorical, so observed=True is passed explicitly: without it
# pandas emits a row for every age group in every year, with a NaN median for
# pairs that have no data, and recent versions warn about the implicit default.
df_median_followers_by_age_group = (
    df_filtered
    .groupby(['post_year', 'age_group'], observed=True)['follower_count']
    .median()
    .reset_index(name='median_follower_count')
)

# Last expression of the cell, so the notebook renders it.
df_median_followers_by_age_group