In [0]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session with spark.driver.maxResultSize set to '18g'.
spark = (
    SparkSession.builder
    .config('spark.driver.maxResultSize', '18g')
    .getOrCreate()
)

from pyspark.sql.functions import lit, lower, col

# Lower-case keywords; read_dataset joins them with '|' into a regex that is
# matched against the lower-cased title and selftext of each row.
substrings = [
    "ukraine",
    "russia",
    "russian",
    "ukraina",
    "putin",
    "kyiv",
    "moscow",
    "invasion",
    "vladimir",
]

def read_dataset(year_range=(2015, 2016, 2017), month_range=range(1, 13), comments_or_submissions="submissions"):
    """Filter monthly Reddit dumps by keyword and save each month as parquet.

    For every (year, month) pair this reads
    ``/mnt/lsde/datasets/reddit/<kind>/<RC_|RS_><year>-<MM>.json.bz2``, drops
    rows whose author or title is ``[deleted]``, keeps a fixed set of columns,
    retains only rows whose lower-cased ``title`` or ``selftext`` matches any
    entry of the module-level ``substrings`` list, and overwrites
    ``/mnt/lsde/group05/<kind>_tree_shaking/<year>-<month>.parquet``.

    Args:
        year_range: Iterable of years to process.
        month_range: Iterable of month numbers processed for each year.
        comments_or_submissions: ``"comments"`` (``RC_`` files) or
            ``"submissions"`` (``RS_`` files).

    Raises:
        ValueError: If ``comments_or_submissions`` is neither of the two
            supported values.
    """
    # Fail fast on an unknown kind instead of building a path that cannot exist.
    if comments_or_submissions == "comments":
        link = "comments/RC_"
    elif comments_or_submissions == "submissions":
        link = "submissions/RS_"
    else:
        raise ValueError(
            "comments_or_submissions must be 'comments' or 'submissions', got {!r}".format(
                comments_or_submissions
            )
        )

    # Loop-invariant pieces: the keyword regex and the output directory.
    keyword_pattern = '|'.join(substrings)
    output_dir = "/mnt/lsde/group05/" + comments_or_submissions + "_tree_shaking" + "/"

    for year in year_range:
        for month in month_range:
            # Input file names use a zero-padded two-digit month.
            month_literal = str(month).zfill(2)
            data_path = "/mnt/lsde/datasets/reddit/" + link + str(year) + "-" + month_literal + ".json.bz2"

            # NOTE(review): the columns below (title, selftext) are used for
            # both kinds; comment dumps presumably lack them, so "comments"
            # may fail at analysis time -- confirm before relying on it.
            result = (
                # samplingRatio: infer the JSON schema from a tiny sample of records.
                spark.read.option("samplingRatio", 0.000001)
                .load(data_path, format="json")
                .filter("author!='[deleted]' AND title!='[deleted]'")
                .select("author", "title", "selftext", "subreddit", "subreddit_id", "id", "score", "created_utc")
                .filter(
                    lower(col("title")).rlike(keyword_pattern)
                    | lower(col("selftext")).rlike(keyword_pattern)
                )
            )

            # Output names keep the month unpadded (e.g. "2015-1.parquet"),
            # unlike the input files; preserved so existing outputs still match.
            month_tag = str(year) + "-" + str(month)
            result.write.mode("overwrite").format("parquet").save(output_dir + month_tag + ".parquet")
            print(month_tag + " to parquet finished")

In [0]:
# Submissions for 2014, months 2 through 12.
read_dataset(year_range=[2014], month_range=range(2, 13), comments_or_submissions="submissions")

In [0]:
# Submissions for 2015, months 1 through 12.
read_dataset(year_range=[2015], month_range=range(1, 13), comments_or_submissions="submissions")

In [0]:
# Submissions for 2016, months 1 through 12.
read_dataset(year_range=[2016], month_range=range(1, 13), comments_or_submissions="submissions")

In [0]:
# Submissions for 2017, months 1 through 12.
read_dataset(year_range=[2017], month_range=range(1, 13), comments_or_submissions="submissions")

In [0]:
# Submissions for 2018, months 1 through 12.
read_dataset(year_range=[2018], month_range=range(1, 13), comments_or_submissions="submissions")

In [0]:
# Submissions for 2019, months 1 through 12.
read_dataset(year_range=[2019], month_range=range(1, 13), comments_or_submissions="submissions")

In [0]:
# Submissions for 2020, months 1 through 12.
read_dataset(year_range=[2020], month_range=range(1, 13), comments_or_submissions="submissions")

In [0]:
# Submissions for 2021, months 11 and 12 only.
# NOTE(review): months 1-10 of 2021 are not requested here -- confirm this is intentional.
read_dataset(year_range=[2021], month_range=range(11, 13), comments_or_submissions="submissions")

In [0]:
# Submissions for 2022, months 1 through 8.
read_dataset(year_range=[2022], month_range=range(1, 9), comments_or_submissions="submissions")