<h3>Define dataset path</h3>

In [1]:
import os

# Root folder of the dataset. Set the BIG_DATA_DIR environment variable to
# point at another copy; when it is unset the original location is used.
big_data_dir = os.environ.get('BIG_DATA_DIR', 'D://Datasets/bigdata20/bigdata20')

# Locations of the individual dataset files inside that folder.
followers = os.path.join(big_data_dir, 'followers.parquet')
followers_posts = os.path.join(big_data_dir, 'followers_posts_api_final.json')
followers_posts_likes = os.path.join(big_data_dir, 'followers_posts_likes.parquet')
posts = os.path.join(big_data_dir, 'posts_api.json')
posts_likes = os.path.join(big_data_dir, 'posts_likes.parquet')

<h3>Spark initialization</h3>

In [2]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext

# Local-mode Spark configuration ('local' master).
conf = SparkConf().setAppName('appName').setMaster('local')

# getOrCreate() reuses a context/session that is already running, so this cell
# can be re-executed without restarting the kernel. Constructing SparkContext
# directly fails when a context already exists in the process.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Legacy entry point, kept because other cells read data through `sqlContext`.
sqlContext = SQLContext(sc)

In [3]:
from pyspark.sql.functions import col, isnan, when, trim, sort_array, explode

<h3>Load dataset</h3>

In [4]:
# Read the posts JSON through the SparkSession, the same entry point the
# parquet load uses; SQLContext is a deprecated wrapper around it.
posts_df = spark.read.json(posts)
# describe() is lazy: with no action such as .show() the cell only displays the
# column names/types of the summary frame, not the statistics themselves.
posts_df.describe()

DataFrame[summary: string, date: string, from_id: string, id: string, key: string, marked_as_ads: string, owner_id: string, post_type: string, signer_id: string, text: string, unavailable: string]

In [5]:
# Read explicitly as parquet instead of load(), whose format depends on the
# spark.sql.sources.default setting of the running session.
posts_likes_df = spark.read.parquet(posts_likes)
# Peek at a single row to see the available columns.
posts_likes_df.first()

Row(itemType='post', ownerId=-94, itemId=13499, likerId=2070090)

In [6]:
# Load the followers_posts JSON through the SparkSession reader (the
# SQLContext reader is a deprecated wrapper around the same session).
followers_posts_df = spark.read.json(followers_posts)

<h3>Get top followers</h3>

<h5>Sort by likes</h5>

In [7]:
# Count like records per likerId.
likes_count = posts_likes_df.groupby('likerId').count()

# Top 20 likers, highest count first. The descending order comes from
# col('count').desc(); the previous extra `asc=False` keyword was removed
# because orderBy only reads `ascending`, so it was silently ignored.
answer = likes_count.orderBy(col('count').desc()).limit(20)
answer.show()

# Only 20 rows remain, so collecting them to pandas on the driver is cheap.
answer_json = answer.toPandas().to_json()
with open('task2_top_followers_by_likes.json', 'w') as f:
    f.write(answer_json)

+---------+-----+
|  likerId|count|
+---------+-----+
|  2070090| 4801|
|  2397858| 2055|
|  1475301| 1829|
|    18239| 1569|
|   546612| 1245|
|     6371|  907|
|  1841959|  746|
| 78440957|  709|
|   120248|  699|
| 40981497|  611|
|    22158|  553|
|207628162|  548|
|329377723|  504|
| 76071304|  474|
| 14805173|  440|
|   317799|  385|
| 56355640|  375|
| 52042971|  338|
|  7437271|  336|
|136506644|  335|
+---------+-----+



<h5>Sort by reposts</h5>

In [8]:
# explode() replaces the copy_history collection with one row per element.
df_exploded = followers_posts_df.withColumn('copy_history', explode('copy_history'))

# Keep rows whose copy_history entry has owner_id '-94', count them per
# top-level owner_id, and keep the 20 largest counts.
answer = (
    df_exploded
    .where(col('copy_history.owner_id') == '-94')
    .groupBy('owner_id')
    .count()
    .orderBy(col('count').desc())
    .limit(20)
)
answer.show()

# Serialize the (at most 20-row) result via pandas for saving in a later cell.
answer_json = answer.toPandas().to_json()

+---------+-----+
| owner_id|count|
+---------+-----+
|180907432|   48|
|   317799|   16|
|  4068532|   13|
|  2547211|    9|
|484122052|    8|
|268247082|    5|
|  1077823|    5|
|  2070090|    5|
|281951154|    5|
|217400123|    4|
|172808182|    4|
|  1533614|    4|
| 44361144|    4|
|527580876|    4|
|157728618|    4|
| 18467645|    4|
|168543860|    4|
|   256973|    3|
|    86002|    3|
|113773552|    3|
+---------+-----+



<h5>Save answer</h5>

In [9]:
# Persist the JSON string currently held in `answer_json`.
# NOTE(review): `answer_json` is assigned in more than one cell; run the
# reposts cell immediately before this one so the right result is written.
reposts_path = 'task2_top_followers_by_reposts.json'
with open(reposts_path, mode='w') as out_file:
    out_file.write(answer_json)