### The code is partly from https://towardsdatascience.com/text-classification-in-spark-nlp-with-bert-and-universal-sentence-encoders-e644d618ca32

In [0]:
import sparknlp
# Session object returned by Spark NLP; the cells below use it to build and read DataFrames.
spark = sparknlp.start() 
# sparknlp.start(gpu=True) >> train on the GPU
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
import pandas as pd
# Print the library versions in use for this run.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

In [0]:
# Labeled comment data; the loop below reads its "position" column and a later cell its "body" column.
data = pd.read_csv(
    '/dbfs/mnt/lsde/group05/labeled_data_comment.csv',
    encoding="ISO-8859-1",
)

In [0]:
# Row labels of the comments whose "position" is 0 or 1; rows holding any
# other value (including missing values) are left out.
# The selection is derived from the frame itself instead of a hard-coded row
# count, so it neither raises on a shorter CSV nor silently ignores rows of a
# longer one.
clist = data.index[(data["position"] == 0) | (data["position"] == 1)].tolist()

In [0]:
# Labeled submission data; the loop below reads its "position" column and a later cell its "title" column.
data1 = pd.read_csv(
    '/dbfs/mnt/lsde/group05/labeled_data_submission.csv',
    encoding="ISO-8859-1",
)

In [0]:
# Row labels of the submissions whose "position" is 0 or 1; rows holding any
# other value (including missing values) are left out.
# The selection is derived from the frame itself instead of a hard-coded row
# count, so it neither raises on a shorter CSV nor silently ignores rows of a
# longer one.
slist = data1.index[(data1["position"] == 0) | (data1["position"] == 1)].tolist()

In [0]:
# Column template (two empty columns). `d` is reused further down to create
# the empty train/test frames, so it must stay untouched.
d = dict(label=[], message=[])
# Empty frame that will collect the (label, message) rows.
df = pd.DataFrame(d)

In [0]:
# Append the selected rows to `df`: comments contribute their "body" text,
# submissions their "title" text, each paired with its "position" label.
# Comments are appended first, then submissions.
for source, text_column, selected in ((data, "body", clist), (data1, "title", slist)):
    for row in selected:
        # len(df) is the next free integer row label.
        df.loc[len(df)] = [source["position"][row], source[text_column][row]]

In [0]:
# Shuffle all rows (no seed is set, so the split differs between runs), then
# use the first 800 shuffled rows for training and the remainder for testing.
Dataset = df.sample(frac=1)
trainDataset, testDataset = Dataset.iloc[:800], Dataset.iloc[800:]

In [0]:
# Oversampling plan for the training split: every label-0 row is listed once,
# every label-1 row four times; rows with any other label are skipped.
sample = []
for row_id, row_label in trainDataset["label"].items():
    if row_label == 0:
        sample.append(row_id)
    elif row_label == 1:
        sample.extend([row_id] * 4)

In [0]:
# Oversampling plan for the test split: every label-0 row is listed once,
# every label-1 row four times; rows with any other label are skipped.
sample1 = []
for row_id, row_label in testDataset["label"].items():
    if row_label == 0:
        sample1.append(row_id)
    elif row_label == 1:
        sample1.extend([row_id] * 4)

In [0]:
import random

# Shuffle both index lists in place (train first, then test) so the repeated
# label-1 rows are not adjacent.
for _row_ids in (sample, sample1):
    random.shuffle(_row_ids)

In [0]:
# Balance data: materialise the oversampled train and test frames.
# Labels are stored as strings; messages are copied unchanged.
train = pd.DataFrame(data=d)
test = pd.DataFrame(data=d)
for balanced, source, row_order in ((train, trainDataset, sample), (test, testDataset, sample1)):
    for row_id in row_order:
        # len(balanced) is the next free integer row label.
        balanced.loc[len(balanced)] = [str(source["label"][row_id]), source["message"][row_id]]

In [0]:
# Convert the balanced pandas frames into Spark DataFrames (rows as plain
# Python lists, column names as the schema).
values, columns = train.values.tolist(), train.columns.tolist()
spark_train = spark.createDataFrame(values, columns)

values, columns = test.values.tolist(), test.columns.tolist()
spark_test = spark.createDataFrame(values, columns)

In [0]:
from pyspark.sql.functions import col
# Show the number of rows per label, most frequent first, for the training
# frame and then for the test frame.
for frame in (spark_train, spark_test):
    label_counts = frame.groupBy("label").count()
    label_counts.orderBy(col("count").desc()).show()

In [0]:
# Wrap the raw "message" text into the "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("message")
    .setOutputCol("document")
)

# Split each document into sentences.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

# Tokenise the detected sentences.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Normalise the tokens (annotator defaults are used).
normalizer = (
    Normalizer()
    .setInputCols("token")
    .setOutputCol("normalized")
)

# Remove stop words, case-insensitively, using the en_US locale.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
    .setLocale("en_US")
)

# Lemmatise the remaining tokens with the pretrained "lemma_antbnc" model.
lemma = (
    LemmatizerModel.pretrained("lemma_antbnc")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Word embeddings from the default pretrained model (no model name is given).
word_embeddings = (
    WordEmbeddingsModel().pretrained()
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pool the word embeddings with the AVERAGE strategy.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier: reads the pooled embeddings, learns from the "label"
# column, writes predictions to "class"; 10 epochs max, training logs enabled.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(10)
    .setEnableOutputLogs(True)
)

In [0]:
# Full pipeline, in execution order: text preparation, embeddings, classifier.
PUR_pipeline = Pipeline(stages=[
    document_assembler, sentenceDetector, tokenizer,
    normalizer, stopwords_cleaner, lemma,
    word_embeddings, embeddingsSentence,
    classifierdl,
])

In [0]:
# Fit the pipeline on the balanced training frame; the fitted model is evaluated on spark_test below.
PUR_pipelineModel = PUR_pipeline.fit(spark_train)

In [0]:
from sklearn.metrics import classification_report, accuracy_score
# Predict on the test frame and bring label, message and predictions to pandas.
# NOTE: this rebinds `df`, which previously held the combined labeled rows.
df = (
    PUR_pipelineModel.transform(spark_test)
    .select('label', 'message', 'class.result')
    .toPandas()
)
# Each prediction arrives as a list; keep its first element.
df['result'] = df['result'].map(lambda predicted: predicted[0])
# Print the per-class report first, then the overall accuracy.
for metric in (classification_report, accuracy_score):
    print(metric(df.label, df.result))

Train the final model on all of the labeled data (train and test sets combined)

In [0]:
# Stack the balanced train and test frames so the final model is fit on every labeled row.
# The row index is not reset here; only the values and column names are used afterwards.
train = pd.concat([train,test])

In [0]:
# Rebuild the Spark training frame from the combined pandas frame (rows as
# plain Python lists, column names as the schema).
values, columns = train.values.tolist(), train.columns.tolist()
spark_train = spark.createDataFrame(values, columns)

In [0]:
# Same stage sequence as before, rebuilt so it can be fit on the combined data.
PUR_pipeline = Pipeline(stages=[
    document_assembler, sentenceDetector, tokenizer,
    normalizer, stopwords_cleaner, lemma,
    word_embeddings, embeddingsSentence,
    classifierdl,
])

In [0]:
# Final model, fit on the combined training frame.
# NOTE: the name is spelled "PRU_" (not "PUR_"); handle_s and handle_c below reference this exact name.
PRU_pipelineModel = PUR_pipeline.fit(spark_train)

In [0]:
def handle_s(year_range=(2015, 2016, 2017), month_range=range(1, 13)):
    """Classify submission titles one month at a time and save the predictions.

    For every (year, month) pair this reads
    ``/mnt/lsde/group05/kwfilter_top3/<year>-<month>.parquet``, feeds the
    ``title`` column (renamed to ``message``) through ``PRU_pipelineModel``
    and overwrites
    ``/mnt/lsde/group05/submission_position/<year>-<month>.parquet`` with the
    columns ``id``, ``class.result`` and ``created_utc``.

    Relies on the notebook-level names ``spark``, ``col`` and
    ``PRU_pipelineModel``.

    Args:
        year_range: Iterable of years to process. The default is an immutable
            tuple so it cannot be altered between calls.
        month_range: Iterable of month numbers to process for every year.
            The month is not zero-padded in the file names.
    """
    for year in year_range:
        for month in month_range:
            period = "{}-{}".format(year, month)
            data_path = "/mnt/lsde/group05/kwfilter_top3/{}.parquet".format(period)
            output_path = "/mnt/lsde/group05/submission_position/{}.parquet".format(period)

            print("start handling " + period)
            submissions = spark.read.load(data_path, format="parquet")
            # The pipeline reads its text from a column named "message".
            model_input = submissions.select(
                col("id"),
                col("title").alias("message"),
                col("created_utc"),
            )
            predictions = PRU_pipelineModel.transform(model_input)
            (
                predictions.select("id", "class.result", "created_utc")
                .write.mode("overwrite")
                .format("parquet")
                .save(output_path)
            )
            print("finish handling " + period)
            

In [0]:
# Submissions: 2014, months 2 through 12.
handle_s([2014],range(2,13))

In [0]:
# Submissions: 2015, months 1 through 12.
handle_s([2015],range(1,13))

In [0]:
# Submissions: 2016 through 2021, months 1 through 12 of each year.
handle_s(range(2016,2022),range(1,13))

In [0]:
# Submissions: 2022, months 1 through 8.
handle_s([2022],range(1,9))

In [0]:
def handle_c(year_range=(2015, 2016, 2017), month_range=range(1, 13)):
    """Classify comment bodies one month at a time and save the predictions.

    For every (year, month) pair this reads
    ``/mnt/lsde/group05/filtered_comments/<year>-<month>.parquet``, feeds the
    ``body`` column (renamed to ``message``) through ``PRU_pipelineModel``
    and overwrites
    ``/mnt/lsde/group05/comment_position/<year>-<month>.parquet`` with the
    columns ``parent_id``, ``id``, ``class.result`` and ``created_utc``.

    Relies on the notebook-level names ``spark``, ``col`` and
    ``PRU_pipelineModel``.

    Args:
        year_range: Iterable of years to process. The default is an immutable
            tuple so it cannot be altered between calls.
        month_range: Iterable of month numbers to process for every year.
            The month is not zero-padded in the file names.
    """
    for year in year_range:
        for month in month_range:
            period = "{}-{}".format(year, month)
            data_path = "/mnt/lsde/group05/filtered_comments/{}.parquet".format(period)
            output_path = "/mnt/lsde/group05/comment_position/{}.parquet".format(period)

            print("start handling " + period)
            comments = spark.read.load(data_path, format="parquet")
            # The pipeline reads its text from a column named "message".
            model_input = comments.select(
                col("parent_id"),
                col("id"),
                col("body").alias("message"),
                col("created_utc"),
            )
            predictions = PRU_pipelineModel.transform(model_input)
            (
                predictions.select("parent_id", "id", "class.result", "created_utc")
                .write.mode("overwrite")
                .format("parquet")
                .save(output_path)
            )
            print("finish handling " + period)

In [0]:
# Comments: 2014, months 2 through 12.
handle_c([2014],range(2,13))

In [0]:
# Comments: 2015 through 2021, months 1 through 12 of each year.
handle_c(range(2015,2022),range(1,13))

In [0]:
# Comments: 2022, months 1 through 8.
handle_c([2022],range(1,9))