In [0]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused, otherwise a
# new one is built), requesting an 18g cap on results collected to the driver.
spark = (
    SparkSession.builder
    .config('spark.driver.maxResultSize', '18g')
    .getOrCreate()
)

In [0]:
import time

In [0]:
def time_convert1(unix_time):
    """Convert a Unix epoch timestamp to a ``YYYY-MMDD`` date string in UTC.

    The value is interpreted in UTC (``time.gmtime``) so the resulting date
    does not depend on the timezone configured on whichever machine runs the
    conversion.

    Args:
        unix_time: Seconds since the epoch, as an int or anything ``int()``
            accepts (e.g. a numeric string).

    Returns:
        str: Year, a hyphen, then month and day with no separator between
        them, e.g. ``"2014-0203"`` for 3 February 2014.

    Raises:
        TypeError, ValueError: If ``unix_time`` cannot be converted by ``int()``
            (for example ``None`` or a non-numeric string).
    """
    compact_date = time.strftime("%Y%m%d", time.gmtime(int(unix_time)))
    # Only the year is split off; month and day deliberately stay fused.
    return compact_date[:4] + "-" + compact_date[4:]

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
time_convert_f=udf(time_convert1,StringType())

In [0]:
spark.udf.register(name="time_convert_f", f=time_convert_f)

In [0]:
from pyspark.sql import functions as sf

In [0]:
def handle_comments(year_range = [2014,2015],month_range=[1,2,3]):
    """Build per-date poster/post counts from the monthly filtered-comment files.

    For every (year, month) pair this reads
    ``/mnt/lsde/group05/filtered_comments/<year>-<month>.parquet``, derives a
    ``date`` column from ``created_utc`` via ``time_convert_f``, and
    aggregates per date: the number of distinct ``author`` values
    (``count_posters``) and the count of ``id`` (``count_posts``), ordered by
    ``date``. The monthly results are stacked with ``union`` in iteration
    order, and a progress line is printed after each month.

    Args:
        year_range: Iterable of years to process.
        month_range: Iterable of month numbers, iterated once per year. The
            value is used as-is in the file name (no zero padding).

    Returns:
        A DataFrame with columns ``date``, ``count_posters`` and
        ``count_posts``, or ``None`` if no (year, month) pair was visited.
    """
    combined = None
    for yr in year_range:
        for mon in month_range:
            period = f"{yr}-{mon}"
            source = f"/mnt/lsde/group05/filtered_comments/{period}.parquet"
            monthly_stats = (
                spark.read.format("parquet").load(source)
                .withColumn("date", time_convert_f('created_utc'))
                .groupBy("date")
                .agg(
                    sf.countDistinct('author').alias('count_posters'),
                    sf.count('id').alias('count_posts'),
                )
                .orderBy('date')
            )
            # The first month seeds the result; later months are appended.
            combined = monthly_stats if combined is None else combined.union(monthly_stats)
            print(f"finish{period}")
    return combined

In [0]:
def handle_submissions(year_range = [2014,2015],month_range=[1,2,3]):
    """Build per-date poster/post counts from the monthly submission files.

    For every (year, month) pair this reads
    ``/mnt/lsde/group05/kwfilter_top3/<year>-<month>.parquet``, derives a
    ``date`` column from ``created_utc`` via ``time_convert_f``, and
    aggregates per date: the number of distinct ``author`` values
    (``count_posters``) and the count of ``id`` (``count_posts``), ordered by
    ``date``. The monthly results are stacked with ``union`` in iteration
    order, and a progress line is printed after each month.

    Args:
        year_range: Iterable of years to process.
        month_range: Iterable of month numbers, iterated once per year. The
            value is used as-is in the file name (no zero padding).

    Returns:
        A DataFrame with columns ``date``, ``count_posters`` and
        ``count_posts``, or ``None`` if no (year, month) pair was visited.
    """
    combined = None
    for yr in year_range:
        for mon in month_range:
            period = f"{yr}-{mon}"
            source = f"/mnt/lsde/group05/kwfilter_top3/{period}.parquet"
            monthly_stats = (
                spark.read.format("parquet").load(source)
                .withColumn("date", time_convert_f('created_utc'))
                .groupBy("date")
                .agg(
                    sf.countDistinct('author').alias('count_posters'),
                    sf.count('id').alias('count_posts'),
                )
                .orderBy('date')
            )
            # The first month seeds the result; later months are appended.
            combined = monthly_stats if combined is None else combined.union(monthly_stats)
            print(f"finish{period}")
    return combined

In [0]:
comments_2014 = handle_comments([2014],range(2,13))

In [0]:
comments_2014.show()

In [0]:
submissions_2014 = handle_submissions([2014],range(2,13))

In [0]:
comments_from2015 =handle_comments(range(2015,2022),range(2,13))

In [0]:
submissions_from2015 = handle_submissions(range(2015,2022),range(2,13))

In [0]:
comments_2022= handle_comments([2022],range(1,9))

In [0]:
submissions_2022 = handle_submissions([2022],range(1,9))

In [0]:
comments_union= comments_2014.union(comments_from2015).union(comments_2022)

In [0]:
submissions_union= submissions_2014.union(submissions_from2015).union(submissions_2022)

In [0]:
finaldf = comments_union.union(submissions_union)

In [0]:
finaldf.show()

In [0]:
finaldf1 = finaldf.groupBy('date').agg(sf.sum('count_posters').alias('poster_sum'),sf.sum('count_posts').alias('post_sum')).orderBy('date')

In [0]:
finaldf.show()

In [0]:
finaldf.write.mode("overwrite").format("parquet").save("/mnt/lsde/group05/statistics/new-posts-posters.parquet")