<h3>Define dataset path</h3>

In [2]:
import os

big_data_dir = 'D://Datasets/bigdata20/bigdata20'
followers = os.path.join(big_data_dir, 'followers.parquet')
followers_posts = os.path.join(big_data_dir, 'followers_posts_api_final.json')
followers_posts_likes = os.path.join(big_data_dir, 'followers_posts_likes.parquet')
posts = os.path.join(big_data_dir, 'posts_api.json')
posts_likes = os.path.join(big_data_dir, 'posts_likes.parquet')

<h3>Spark initialization</h3>

In [3]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext

conf = SparkConf().setAppName('appName').setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)

In [4]:
from pyspark.sql.functions import col, isnan, when, trim, sort_array, explode

<h3>Load dataset</h3>

In [5]:
followers_df = spark.read.load(followers)
followers_df.printSchema()
followers_df.groupby('profile').count().show()

root
 |-- profile: integer (nullable = true)
 |-- follower: integer (nullable = true)

+-------+-----+
|profile|count|
+-------+-----+
|    -94|43988|
+-------+-----+



In [6]:
posts_df = sqlContext.read.json(posts)
posts_df.describe()

DataFrame[summary: string, date: string, from_id: string, id: string, key: string, marked_as_ads: string, owner_id: string, post_type: string, signer_id: string, text: string, unavailable: string]

In [7]:
followers_posts_df = sqlContext.read.json(followers_posts)
followers_posts_df.describe()

DataFrame[summary: string, date: string, final_post: string, from_id: string, id: string, is_pinned: string, key: string, owner_id: string, post_type: string, signer_id: string, text: string, unavailable: string]

In [8]:
followers_posts_df.groupby('owner_id').count().orderBy(col('count').desc()).limit(20).show()

+---------+-----+
| owner_id|count|
+---------+-----+
|  2547211|40204|
|357231922|26715|
|168543860|18853|
| 50348231|11906|
| 25646344|11122|
|     null| 9858|
|176861294| 9033|
|141687240| 8808|
|445159771| 8704|
|143207077| 8471|
|194073434| 8146|
|    29840| 7293|
|524656784| 7263|
|459339006| 6814|
|514384760| 6578|
|483715951| 6140|
|412181460| 5813|
|461319529| 5724|
|451211328| 5651|
|426396104| 5545|
+---------+-----+



<h3>Get posts and reposts</h3>

<h5>Split the field "copy history" into several lines</h5>

In [10]:
df_exploded = followers_posts_df.withColumn('copy_history', explode('copy_history'))

<h5>Get reposts only from the ITMO group</h5>

In [11]:
_filtered = df_exploded.filter(df_exploded.copy_history.owner_id == '-94')
_selected = _filtered.select(_filtered.copy_history.id.alias("group_post_id"), _filtered.id.alias("user_post_id"))
_selected = _selected.sort('copy_history.id')

In [12]:
_selected.show()

+-------------+------------+
|group_post_id|user_post_id|
+-------------+------------+
|        38730|        9523|
|        38730|        8187|
|        38730|        2590|
|        38738|        8188|
|        38740|       31900|
|        38740|        9561|
|        38740|        2649|
|        38740|        1060|
|        38740|        9496|
|        38740|        1133|
|        38740|        8186|
|        38740|         185|
|        38748|        9574|
|        38751|        8054|
|        38754|       10318|
|        38755|         303|
|        38764|        5076|
|        38767|         778|
|        38767|         364|
|        38791|         622|
+-------------+------------+
only showing top 20 rows



<h5>Group reposts by post_id</h5>

In [13]:
from pyspark.sql.functions import col, isnan, when, trim, sort_array, collect_list
answer = _selected.groupby("group_post_id").agg(collect_list("user_post_id")).sort('group_post_id')

In [14]:
answer.show()

+-------------+--------------------------+
|group_post_id|collect_list(user_post_id)|
+-------------+--------------------------+
|        38730|        [9523, 2590, 8187]|
|        38738|                    [8188]|
|        38740|      [1060, 31900, 113...|
|        38748|                    [9574]|
|        38751|                    [8054]|
|        38754|                   [10318]|
|        38755|                     [303]|
|        38764|                    [5076]|
|        38767|                [778, 364]|
|        38791|                     [622]|
|        38814|                    [9678]|
|        38818|                    [9664]|
|        38823|                    [3512]|
|        38847|              [3373, 9697]|
|        38854|                    [2613]|
|        38857|                    [2618]|
|        38858|                    [2620]|
|        38859|                    [2633]|
|        38862|                    [2642]|
|        38867|              [2644, 2914]|
+----------

<h5>Save answer</h5>

In [15]:
answer_json = answer.toPandas().to_json()
with open('task3_followers_reposts.json', 'w') as f:
    f.write(answer_json)