In [0]:
# Install PySpark and Spark NLP into the notebook environment (-q keeps pip quiet).
# NOTE(review): pyspark is pinned to 3.1.2 but spark-nlp is not; consider pinning it
# as well so that a re-run resolves the same version.
! pip install -q pyspark==3.1.2 spark-nlp

In [0]:
from pyspark.sql import SparkSession

# Get the active Spark session (or create one) with spark.driver.maxResultSize set to 18g.
spark = (
    SparkSession.builder
    .config('spark.driver.maxResultSize', '18g')
    .getOrCreate()
)

In [0]:
import json
import pandas as pd
import numpy as np
# Import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.sql import functions as F

# Import SparkNLP
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *


In [0]:
# Stop-word list reported by a freshly constructed StopWordsCleaner;
# it is handed to the YAKE stage of the pipeline.
_default_cleaner = StopWordsCleaner()
stopwords = _default_cleaner.getStopWords()

In [0]:
# Stage 1: wrap the raw "alltext" column as Spark NLP document annotations.
document = (
    DocumentAssembler()
    .setInputCol("alltext")
    .setOutputCol("document")
)

# Stage 2: split each document into sentences.
sentenceDetector = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)

# Stage 3: tokenize sentences, using the listed punctuation as context characters.
token = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
    .setContextChars(["(", ")", "?", "!", ".", ","])
)

# Stage 4: YAKE keyword extraction over 1- to 3-grams, keeping 3 keywords,
# with the stop words collected earlier in the notebook.
keywords = (
    YakeKeywordExtraction()
    .setInputCols("token")
    .setOutputCol("keywords")
    .setMinNGrams(1)
    .setMaxNGrams(3)
    .setNKeywords(3)
    .setStopWords(stopwords)
)

yake_pipeline = Pipeline(stages=[document, sentenceDetector, token, keywords])

# Fit once against a one-row DataFrame holding an empty string so that a
# fitted PipelineModel is available.
empty_df = spark.createDataFrame([['']]).toDF("alltext")
yake_Model = yake_pipeline.fit(empty_df)

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [0]:
from pyspark.sql.functions import split, explode, concat, concat_ws

In [0]:
# Topic terms a keyword has to mention for a submission to be kept.
# Built once here instead of on every UDF invocation.
_TOPIC_TERMS = frozenset({
    "ukraine", "russia", "russian", "ukraina", "putin",
    "kyiv", "moscow", "invasion", "vladimir",
})


def isValid(lst):
    """Tell whether any extracted keyword mentions one of the topic terms.

    The keyword extractor is configured for 1- to 3-grams, so a keyword may be
    a phrase such as "russian invasion". Each keyword is therefore lower-cased
    and split on whitespace, and the individual words are compared with the
    topic terms (an exact whole-keyword comparison would miss such phrases).

    Args:
        lst: iterable of keyword strings; may be None or empty, and may
            contain None entries.

    Returns:
        bool: True if at least one word of at least one keyword is a topic
        term, False otherwise (including for None / empty input).
    """
    # A NULL array column reaches a Python UDF as None.
    if not lst:
        return False
    for keyword in lst:
        # Skip NULL / non-string entries instead of failing on .lower().
        if not isinstance(keyword, str):
            continue
        if any(word in _TOPIC_TERMS for word in keyword.lower().split()):
            return True
    return False

In [0]:
# Wrap isValid as a Spark UDF whose declared return type is boolean.
isValid_f = udf(f=isValid, returnType=BooleanType())

In [0]:
# Register the UDF with the session under the name "isValid_f".
spark.udf.register("isValid_f", isValid_f)

In [0]:
# extract keywords from sparkdataframe
def keyword_extract_fromdf(df):
    """Keep the submissions whose extracted keywords pass the topic filter.

    Args:
        df: Spark DataFrame with at least the columns 'author', 'title',
            'selftext', 'subreddit', 'id', 'score' and 'created_utc'.

    Returns:
        Spark DataFrame with exactly those seven columns, restricted to the
        rows for which isValid_f accepts the distinct extracted keywords.
    """
    # concat_ws, unlike concat, skips NULL inputs and inserts a separator:
    # a NULL selftext no longer turns the whole text into NULL, and the last
    # word of the title is not fused with the first word of the body.
    with_text = df.withColumn("alltext", F.concat_ws(" ", df['title'], df['selftext']))

    # Reuse the pipeline model fitted once at notebook start instead of
    # re-fitting the pipeline for every DataFrame.
    # NOTE(review): assumes no pipeline stage learns anything from the data
    # during fit - confirm if stages are added or changed.
    annotated = yake_Model.transform(with_text)

    flagged = (
        annotated
        .withColumn('unique_keywords', F.array_distinct("keywords.result"))
        .withColumn("isValid", isValid_f('unique_keywords'))
    )

    # Filter while the flag column is still part of the frame, then project.
    return (
        flagged
        .filter('isValid')
        .select('author', 'title', 'selftext', 'subreddit', 'id', 'score', 'created_utc')
    )

In [0]:
def handle(year_range=(2015, 2016, 2017), month_range=range(1, 13),
           input_dir="/mnt/lsde/group05/submissions_tree_shaking/",
           output_dir="/mnt/lsde/group05/kwfilter_top3/"):
    """Filter the monthly submission files and write the surviving rows.

    For every (year, month) pair the file "<input_dir><year>-<month>.parquet"
    is read, passed through keyword_extract_fromdf, and written in overwrite
    mode to "<output_dir><year>-<month>.parquet". The month is not
    zero-padded (e.g. "2014-2").

    Args:
        year_range: iterable of years to process.
        month_range: iterable of months to process for each year.
        input_dir: directory prefix (with trailing slash) of the input files.
        output_dir: directory prefix (with trailing slash) for the results.

    Returns:
        None. Progress is printed before and after each month.
    """
    for year in year_range:
        for month in month_range:
            # Shared "<year>-<month>" label for both paths and both log lines.
            period = str(year) + "-" + str(month)
            data_path = input_dir + period + ".parquet"
            print("start handling " + period)
            df = spark.read.load(data_path, format="parquet")
            df_done = keyword_extract_fromdf(df)
            df_done.write.mode("overwrite").format("parquet").save(output_dir + period + ".parquet")
            print("finish handling " + period)
            
    
    
    

In [0]:
# February 2014 only.
handle(year_range=[2014], month_range=[2])

In [0]:
# March through December 2014.
handle(year_range=[2014], month_range=range(3, 13))

In [0]:
# August through December 2018.
handle(year_range=[2018], month_range=range(8, 13))

In [0]:
# Every month of 2019, 2020 and 2021.
handle(year_range=[2019, 2020, 2021], month_range=range(1, 13))

In [0]:
# November and December 2021. These months are also covered by the 2019-2021
# run; handle() writes in overwrite mode, so the output is simply replaced.
handle(year_range=[2021], month_range=[11, 12])

In [0]:
# January through August 2022.
handle(year_range=[2022], month_range=range(1, 9))

In [0]:
# Every month of 2015 through 2021. This overlaps the 2018 and 2019-2021 runs
# in the other cells; overlapping outputs are overwritten.
handle(year_range=[2015, 2016, 2017, 2018, 2019, 2020, 2021], month_range=range(1, 13))

In [0]:
# DBFS URI of the 2014-2 input file; not referenced by any other cell shown here.
path = "dbfs:/mnt/lsde/group05/submissions_tree_shaking/2014-2.parquet"