In [0]:
# pyspark functions
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType
import pyspark
# URL processing
import urllib

In [0]:
%run "/Users/neeraj.adsul@gmail.com/utils/aws_connector"

In [0]:
%run "/Users/neeraj.adsul@gmail.com/utils/pinterest_data_cleaning"

## Connect and Mount S3 Bucket to Databricks File System

In [0]:
# DBFS path under which the S3 bucket is mounted; reused below to build the
# per-topic data locations.
MOUNT_NAME = '/mnt/pin_pipe'
# S3Connector is not defined in this notebook — presumably it comes from the
# aws_connector notebook pulled in via %run above (TODO confirm).
s3_conn = S3Connector(
    bucket_name='user-0a1d8948160f-bucket',
    mount_name=MOUNT_NAME,
    credential_file='authentication_credentials.csv')
s3_conn.mount()

In [0]:
# Mounted folders holding the pin, geo and user topic data (partition 0 of each).
pin_data_location = f"{MOUNT_NAME}/topics/0a1d8948160f.pin/partition=0/"
geo_data_location = f"{MOUNT_NAME}/topics/0a1d8948160f.geo/partition=0/"
user_data_location = f"{MOUNT_NAME}/topics/0a1d8948160f.user/partition=0/"

# Sanity check: print how many entries each folder lists, in pin/geo/user order.
for topic_location in (pin_data_location, geo_data_location, user_data_location):
    print(len(dbutils.fs.ls(topic_location)))


## Read All Data for pinterest posts from S3

In [0]:
# Build one DataFrame per topic folder, in pin/geo/user order.
# prepare_dataframe is not defined in this notebook — presumably provided by
# one of the %run utility notebooks (TODO confirm).
df_pin, df_geo, df_user = (
    prepare_dataframe(topic_location)
    for topic_location in (pin_data_location, geo_data_location, user_data_location)
)

## Clean pin, geo and user post dataframes

In [0]:
# Replace each raw DataFrame with its cleaned counterpart. The names are
# rebound in place, so re-running this cell cleans already-cleaned data.
# NOTE(review): the clean_* helpers are defined outside this notebook
# (presumably in pinterest_data_cleaning) — confirm they are safe to re-apply.
df_pin = clean_pin_dataframe(df_pin)
df_geo = clean_geo_dataframe(df_geo)
df_user = clean_user_dataframe(df_user)

## Analytics Queries

In [0]:
# List the column names of each cleaned DataFrame for reference while
# writing the queries below.
for frame_label, frame in (('df_pin', df_pin), ('df_geo', df_geo), ('df_user', df_user)):
    print(f'{frame_label}: \n', frame.columns)

### M7-T4 Find the most popular Pinterest category people post to based on their country.

Your query should return a DataFrame that contains the following columns:
- country
- category
- category_count, a new column containing the desired query output

In [0]:
# Inner-join posts with their geolocation records on the shared post index and
# count the posts in every (country, category) pair.
# No global sort is applied here: the window below defines its own ordering,
# so sorting the whole frame first would only add a shuffle.
joined = (
    df_pin.join(df_geo, 'ind')
    .groupBy('country', 'category')
    .agg(count('*').alias('category_count'))
)
# Rank the categories inside each country from most to least posts.
# The category name is a secondary sort key so that ties on category_count are
# always broken the same way instead of depending on partition order.
window = pyspark.sql.Window.partitionBy('country').orderBy(desc('category_count'), asc('category'))
ranked = joined.withColumn('rank', row_number().over(window))
# Keep only the top-ranked category of each country.
top_category_per_country = ranked.filter(col('rank') == 1).select('country', 'category', 'category_count')
display(top_category_per_country)

### M7-T5 Find most popular category each year
Find how many posts each category had between 2018 and 2022.

Your query should return a DataFrame that contains the following columns:

- `post_year`, a new column that contains only the year from the timestamp column
- `category`
- `category_count`, a new column containing the desired query output


In [0]:
# Keep only geolocation records whose timestamp falls in 2018-2022 inclusive.
# Filtering on the extracted year avoids comparing the timestamp with the
# string '2017', which let almost all of 2017 through.
geo_2018_2022 = df_geo.filter(year('timestamp').between(2018, 2022))
# Join posts to those records on the shared post index, then count the posts
# per (post_year, category), newest year and largest count first.
joined = (
    df_pin.join(geo_2018_2022, 'ind')
    .groupBy(year('timestamp').alias('post_year'), 'category')
    .agg(count('*').alias('category_count'))
    .orderBy('post_year', 'category_count', ascending=False)
)
display(joined)

### M7-T6 User with most followers and the country

#### Step 1: For each country find the user with the most followers.

Your query should return a DataFrame that contains the following columns:

`country`
`poster_name`
`follower_count`

In [0]:
# Same rank-within-partition pattern as M7-T4.
# The join yields one row per post, so a poster with several posts appears
# several times. Take the max follower_count per (country, poster) rather than
# the sum, which would multiply a user's followers by their number of posts.
# `max` here is pyspark.sql.functions.max (via the star import), not the builtin.
joined = (
    df_pin.join(df_geo, 'ind')
    .groupBy('poster_name', 'country')
    .agg(max('follower_count').alias('follower_count'))
)
# Rank posters inside each country by follower count, highest first.
window = pyspark.sql.Window.partitionBy('country').orderBy(desc('follower_count'))
ranked = joined.withColumn('rank', row_number().over(window))
# Keep the single top poster of every country.
top_user_by_follower_per_country = ranked.filter(col('rank') == 1).select('country', 'poster_name', 'follower_count')

In [0]:
# Render the per-country top poster (country, poster_name, follower_count).
display(top_user_by_follower_per_country)

#### Step 2: Based on the above query, find the country with the user with most followers.

Your query should return a DataFrame that contains the following columns:

`country`
`follower_count`

This DataFrame should have only one entry.

In [0]:
# From the per-country top posters, keep only the country whose top poster has
# the highest follower count. limit(1) makes the DataFrame itself hold the
# single entry the task asks for, rather than relying on the display call.
user_country_most_followers = (
    top_user_by_follower_per_country
    .select('country', 'follower_count')
    .orderBy('follower_count', ascending=False)
    .limit(1)
)

In [0]:
# Show at most one row of the (country, follower_count) result.
display(user_country_most_followers.limit(1))

### M7-T7 Most popular category for each age group
#### What is the most popular category people post to based on the following age groups:

* 18-24
* 25-35
* 36-50
* +50

Your query should return a DataFrame that contains the following columns:\
`age_group`, a new column based on the original age column \
`category` \
`category_count`, a new column containing the desired query output


In [0]:
# Join posts to user records on the shared index column 'ind'.
# NOTE: no columns are dropped here. A .drop(...) of the unused columns was
# tried as a speed-up but is currently disabled.
joined = df_pin.join(df_user, 'ind')


In [0]:
# Map each row's age onto its age band. Every band has an explicit condition,
# so a null or under-18 age yields a null band instead of being silently
# counted as '50+' by a catch-all branch.
age_group_col = (
    when(joined.age.between(18, 24), lit('18-24'))
    .when(joined.age.between(25, 35), lit('25-35'))
    .when(joined.age.between(36, 50), lit('36-50'))
    .when(joined.age > 50, lit('50+'))
)
# Count posts per (age_group, category), ignoring rows that fall in no band.
age_demongraphics_categorical = (
    joined.withColumn('age_group', age_group_col)
    .filter(col('age_group').isNotNull())
    .groupBy('age_group', 'category')
    .agg(count('*').alias('category_count'))
)

In [0]:
# Number the categories inside every age group, largest category_count first.
window = (
    pyspark.sql.Window
    .partitionBy('age_group')
    .orderBy(col('category_count').desc())
)
ranked = age_demongraphics_categorical.withColumn('rank', row_number().over(window))
# Row number 1 is the most popular category of its age group.
top_category_by_age = (
    ranked
    .where(col('rank') == 1)
    .select('age_group', 'category', 'category_count')
)

In [0]:
# Render the most popular category per age group.
display(top_category_by_age)

### M7-T8 Median Follower Count by Age Group
What is the median follower count for users in the following age groups:

* 18-24
* 25-35
* 36-50
* +50

Your query should return a DataFrame that contains the following columns:
- `age_group`, a new column based on the original `age` column
- `median_follower_count`, a new column containing the desired query output


In [0]:
# Join posts to user records on the shared index column 'ind'.
joined = df_pin.join(df_user, 'ind')
# Map each row's age onto its age band. Every band has an explicit condition,
# so a null or under-18 age yields a null band instead of being silently
# counted as '50+' by a catch-all branch.
age_group_col = (
    when(joined.age.between(18, 24), lit('18-24'))
    .when(joined.age.between(25, 35), lit('25-35'))
    .when(joined.age.between(36, 50), lit('36-50'))
    .when(joined.age > 50, lit('50+'))
)
# Approximate median (50th percentile) of follower_count per age band,
# ignoring rows that fall in no band.
followers_by_age = (
    joined.withColumn('age_group', age_group_col)
    .filter(col('age_group').isNotNull())
    .groupBy('age_group')
    .agg(percentile_approx('follower_count', 0.5).alias('median_follower_count'))
    .orderBy('age_group')
)

In [0]:
# Render the median follower count per age group.
display(followers_by_age)

### M7-T9 Users Joining Per Year 2015 - 2020
Find how many users have joined between 2015 and 2020.

Your query should return a DataFrame that contains the following columns:

* `post_year`, a new column that contains only the year from the timestamp column
* `number_users_joined`, a new column containing the desired query output


In [0]:
# Count users per joining year, restricted to 2015-2020 inclusive as the task
# requires (previously every joining year was reported).
# count('user_name') counts rows with a non-null user_name.
users_per_year = (
    df_user.withColumn('post_year', year('date_joined'))
    .filter(col('post_year').between(2015, 2020))
    .groupBy('post_year')
    .agg(count('user_name').alias('number_users_joined'))
)


In [0]:
# Render the number of users that joined per year.
display(users_per_year)

### M7-T10 Median Follower Count of Users based On Joining Year
Find the median follower count of users who joined between 2015 and 2020.
Your query should return a DataFrame that contains the following columns:
- `post_year`, a new column that contains only the year from the timestamp column
- `median_follower_count`, a new column containing the desired query output


In [0]:
# Join posts to user records on the shared index and derive the joining year.
joined = (
    df_pin.join(df_user, 'ind')
    .withColumn('post_year', year('date_joined'))
)

# Restrict to joining years 2015-2020 inclusive *before* aggregating, so the
# median is only computed for the years that are reported. The bounds are
# integers to match the integer year column (no implicit string cast).
filtered = (
    joined.filter(col('post_year').between(2015, 2020))
    .groupBy('post_year')
    .agg(percentile_approx('follower_count', 0.5).alias('median_follower_count'))
)

In [0]:
# Render the median follower count per joining year (2015-2020).
display(filtered)

### M7-T11 The median follower count of users based on their joining year and age group
Find the median follower count of users that have joined between 2015 and 2020, based on which age group they are part of.\
Your query should return a DataFrame that contains the following columns:

- `age_group`, a new column based on the original `age` column
    * 18-24
    * 25-35
    * 36-50
    * +50
- `post_year`, a new column that contains only the year from the timestamp column
- `median_follower_count`, a new column containing the desired query output


In [0]:
# Join posts to user records on the shared index, derive the joining year and
# keep only users who joined in 2015-2020 inclusive, as the task requires
# (previously every joining year was reported).
joined = (
    df_pin.join(df_user, 'ind')
    .withColumn('post_year', year('date_joined'))
    .filter(col('post_year').between(2015, 2020))
)
# Map each row's age onto its age band. Every band has an explicit condition,
# so a null or under-18 age yields a null band instead of being silently
# counted as '50+' by a catch-all branch.
age_group_col = (
    when(joined.age.between(18, 24), lit('18-24'))
    .when(joined.age.between(25, 35), lit('25-35'))
    .when(joined.age.between(36, 50), lit('36-50'))
    .when(joined.age > 50, lit('50+'))
)
# Approximate median follower_count per (age_group, post_year), ignoring rows
# that fall in no band.
followers_by_age_by_joined_year = (
    joined.withColumn('age_group', age_group_col)
    .filter(col('age_group').isNotNull())
    .groupBy('age_group', 'post_year')
    .agg(percentile_approx('follower_count', 0.5).alias('median_follower_count'))
    .orderBy('age_group', 'post_year')
)

In [0]:
# Render the median follower count per age group and joining year.
display(followers_by_age_by_joined_year)

## Unmount S3 Bucket

In [0]:
# Release the DBFS mount created at the top of the notebook. MOUNT_NAME is
# used instead of repeating the literal path so the mount and unmount targets
# cannot drift apart.
dbutils.fs.unmount(MOUNT_NAME)