In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import substring, length, col, expr, when
from pyspark.sql.types import IntegerType
from pyspark.sql import Window

In [0]:
%run "./utils/create_schemas"

In [0]:
# Default data source format for the topic files loaded by this notebook.
file_type = "json"

def read_s3(topic, file_type, schema):
    """Load all files of one topic from the mounted S3 bucket into a DataFrame.

    Args:
        topic: Name of the topic folder under ``topics/`` in the mounted bucket.
        file_type: Spark data source format name (e.g. ``"json"``). Its
            lower-cased form is also used as the file extension in the glob,
            so the files selected always match the format used to parse them.
        schema: Schema handed to the reader via ``.schema(...)``.

    Returns:
        The DataFrame returned by ``spark.read.format(...).schema(...).load(...)``.
    """
    # Previously the glob was hard-coded to ``*.json`` regardless of file_type;
    # deriving it from file_type keeps "json" behaviour identical while making
    # other formats usable.
    extension = file_type.lower()
    file_location = f"mnt/0e36c8cd403d_s3_bucket/topics/{topic}/partition=0/*.{extension}"

    reader = spark.read.format(file_type).schema(schema)
    return reader.load(file_location)

In [0]:
# Load the three raw topics (geo, pin, user), each with its own schema,
# in the same order as the names on the left-hand side.
_topic_schemas = (
    ('0e36c8cd403d.geo', geo_schema),
    ('0e36c8cd403d.pin', pin_schema),
    ('0e36c8cd403d.user', user_schema),
)

df_geo_pre_cleaning, df_pin_pre_cleaning, df_user_pre_cleaning = [
    read_s3(topic_name, file_type, topic_schema)
    for topic_name, topic_schema in _topic_schemas
]

In [0]:
%run "./utils/clean"

In [0]:
w = Window.partitionBy('country')

# Step 1: number of rows per (country, category) pair after joining pins to geo data.
category_counts_by_country = (
    df_pin
    .join(df_geo, on="ind", how="inner")
    .groupBy("country", "category")
    .agg(F.count("category").alias("category_count"))
)

# Step 2: keep, for every country, the row(s) whose count equals that
# country's maximum (ties are all kept), then display them.
(
    category_counts_by_country
    .withColumn('max_count', F.max('category_count').over(w))
    .filter(F.col('category_count') == F.col('max_count'))
    .drop('max_count')
    .show(truncate=100)
)



In [0]:
w = Window.partitionBy("post_year")

# Most popular category for each post year from 2018 to 2022 (inclusive).
(
    df_pin.join(df_geo,
                on="ind",
                how="inner"
    )
    .withColumn("post_year", F.year("timestamp"))
    # F.year produces an integer column, so compare it against integer bounds
    # rather than the strings "2018"/"2022" (which relied on implicit casting
    # and differed from the integer year filters used in later cells).
    .where(F.col("post_year").between(2018, 2022))

    # Count rows per (post_year, category)
    .groupBy("post_year", "category")
    .agg(F.count("category").alias("category_count"))

    # Keep the row(s) whose count equals the maximum for that year (ties kept)
    .withColumn("max_count", F.max("category_count").over(w))
    .where(F.col("category_count") == F.col("max_count"))
    .drop("max_count")
    .show()
)

In [0]:
w = Window.partitionBy("country")

# One row per distinct (country, poster_name, follower_count) combination.
posters_by_country = (
    df_pin
    .join(df_geo, on="ind", how="inner")
    .select("country", "poster_name", "follower_count")
    .drop_duplicates()
)

# For each country keep the poster(s) whose follower count equals the
# country's maximum (ties are all kept).
df_user_with_most_followers_by_country = (
    posters_by_country
    .withColumn("max_follower_count", F.max("follower_count").over(w))
    .filter(F.col("follower_count") == F.col("max_follower_count"))
    .drop("max_follower_count")
)

df_user_with_most_followers_by_country.show(truncate=100)

In [0]:

# From the per-country top posters, show the single (country, follower_count)
# row with the highest follower count; equal counts are ordered by country
# ascending before the first row is taken.
(
    df_user_with_most_followers_by_country
    .select("country", "follower_count")
    .orderBy(F.desc("follower_count"), F.asc("country"))
    .limit(1)
    .show()
)



In [0]:
def add_age_group_column(df):
    """Return ``df`` with an extra ``age_group`` column derived from ``age``.

    Buckets:
        * ``"18-24"``: 18 <= age < 25
        * ``"25-35"``: 25 <= age < 36
        * ``"36-50"``: 36 <= age < 51
        * ``"+50"``:   age >= 51

    Rows whose ``age`` is null or below 18 match no branch and therefore get a
    null ``age_group``. (Previously a catch-all ``otherwise("+50")`` labelled
    those rows as "+50", silently inflating the oldest bucket.)

    Args:
        df: DataFrame that has an ``age`` column.

    Returns:
        A new DataFrame with the ``age_group`` column added.
    """
    age = F.col("age")

    age_group = (
        F.when((age >= 18) & (age < 25), "18-24")
        .when((age >= 25) & (age < 36), "25-35")
        .when((age >= 36) & (age < 51), "36-50")
        # Explicit lower bound instead of otherwise(...): anything unmatched
        # (null or under 18) is left as null rather than mislabelled.
        .when(age >= 51, "+50")
    )

    return df.withColumn("age_group", age_group)

In [0]:

w = Window.partitionBy("age_group")

# Step 1: number of rows per (age_group, category) after joining pins to users.
category_counts_by_age_group = (
    add_age_group_column(df_pin.join(df_user, on="ind", how="inner"))
    .groupBy("age_group", "category")
    .agg(F.count("category").alias("category_count"))
)

# Step 2: keep, per age group, the row(s) whose count equals the group's
# maximum (ties are all kept), then display them.
(
    category_counts_by_age_group
    .withColumn("max_category_count", F.max("category_count").over(w))
    .filter(F.col("category_count") == F.col("max_category_count"))
    .drop("max_category_count")
    .show()
)

In [0]:
# Approximate median (50th percentile) of follower_count.
magic_percentile = F.expr('percentile_approx(follower_count, 0.5)')

# Pins joined to users, labelled with the poster's age group.
pins_with_age_group = add_age_group_column(
    df_pin.join(df_user, on="ind", how="inner")
)

# Approximate median follower count for each age group.
(
    pins_with_age_group
    .groupBy('age_group')
    .agg(magic_percentile.alias('median_follower_count'))
    .show()
)

In [0]:
# Rows of users joined to geo data, restricted to join years after 2014 and before 2021.
joined_year = F.col("joined_year")
users_joined_in_range = (
    df_user
    .join(df_geo, on="ind", how="inner")
    .withColumn("joined_year", F.year("date_joined"))
    .filter((joined_year > 2014) & (joined_year < 2021))
)

# Count rows per post_year, shown in ascending year order.
# NOTE(review): the count is grouped by the year of `timestamp` (post_year),
# not by joined_year, although it is labelled number_users_joined; confirm
# this matches the intended definition.
(
    users_joined_in_range
    .withColumn("post_year", F.year("timestamp"))
    .groupBy("post_year")
    .agg(F.count(F.col("post_year")).alias("number_users_joined"))
    .orderBy("post_year")
    .show()
)

In [0]:
# Approximate median (50th percentile) of follower_count.
magic_percentile = F.expr('percentile_approx(follower_count, 0.5)')

# Users joined to their pins and geo data, restricted to join years
# after 2014 and before 2021.
joined_year = F.col("joined_year")
activity_of_users_joined_in_range = (
    df_user
    .join(df_pin, on="ind", how="inner")
    .join(df_geo, on="ind", how="inner")
    .withColumn("joined_year", F.year("date_joined"))
    .filter((joined_year > 2014) & (joined_year < 2021))
)

# Approximate median follower count per post year.
(
    activity_of_users_joined_in_range
    .withColumn("post_year", F.year("timestamp"))
    .groupBy('post_year')
    .agg(magic_percentile.alias('median_follower_count'))
    .show()
)

In [0]:
# Approximate median (50th percentile) of follower_count.
magic_percentile = F.expr('percentile_approx(follower_count, 0.5)')

# Users joined to their pins and geo data, restricted to join years
# after 2014 and before 2021, with the post year extracted.
joined_year = F.col("joined_year")
posts_of_users_joined_in_range = (
    df_user
    .join(df_pin, on="ind", how="inner")
    .join(df_geo, on="ind", how="inner")
    .withColumn("joined_year", F.year("date_joined"))
    .filter((joined_year > 2014) & (joined_year < 2021))
    .withColumn("post_year", F.year("timestamp"))
)

# Approximate median follower count per (post_year, age_group),
# ordered by post year and then age group, both ascending.
(
    add_age_group_column(posts_of_users_joined_in_range)
    .groupBy('post_year', 'age_group')
    .agg(magic_percentile.alias('median_follower_count'))
    .orderBy("post_year", "age_group")
    .show()
)