### Parts of this code are adapted from https://towardsdatascience.com/text-classification-in-spark-nlp-with-bert-and-universal-sentence-encoders-e644d618ca32

In [0]:
# Start a Spark session through Spark NLP and pull in the annotator classes
# used by the pipeline cells below.
import sparknlp
spark = sparknlp.start() 
# To train on a GPU, start the session with sparknlp.start(gpu=True) instead.
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
# Print the library versions so a run can be tied to a specific environment.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

In [0]:
# Labeled comment data; later cells read its "position" (label) and "body"
# (text) columns.
comments_csv_path = '/dbfs/mnt/lsde/group05/labeled_data_comment.csv'
data = pd.read_csv(comments_csv_path, encoding="ISO-8859-1")

In [0]:
# Row indices of the comment data that go into the labeled set.
# Only the first 1825 rows are scanned. Rows labeled 0 or 1 are always kept;
# rows labeled 2 are kept only when their index is below 900
# (presumably to limit the size of class 2 -- TODO confirm).
clist = [
    i
    for i in range(1825)
    if data["position"][i] in (0, 1)
    or (data["position"][i] == 2 and i < 900)
]

In [0]:
# Labeled submission data; later cells read its "position" (label) and
# "title" (text) columns.
# NOTE(review): the rendered table further down shows mojibake in some
# messages, so ISO-8859-1 may not be the file's real encoding -- confirm.
submissions_csv_path = '/dbfs/mnt/lsde/group05/labeled_data_submission.csv'
data1 = pd.read_csv(submissions_csv_path, encoding="ISO-8859-1")

In [0]:
# Row indices of the submission data that go into the labeled set: among the
# first 1717 rows, keep only those labeled 0 or 1.
slist = [i for i in range(1717) if data1["position"][i] in (0, 1)]

In [0]:
# Empty two-column template; `d` is reused later to create the train/test frames.
d = dict(label=[], message=[])
df = pd.DataFrame(d)

In [0]:
# Build the combined labeled dataset in one step instead of appending one row
# at a time with df.loc[len(df)]: that pattern is quadratic and, because it
# appends to the existing frame, duplicated every row when the cell was re-run.
# Comment rows (label from "position", text from "body") come first, followed
# by submission rows (label from "position", text from "title").
comment_rows = data.loc[clist, ["position", "body"]].rename(
    columns={"position": "label", "body": "message"}
)
submission_rows = data1.loc[slist, ["position", "title"]].rename(
    columns={"position": "label", "title": "message"}
)
df = pd.concat([comment_rows, submission_rows], ignore_index=True)
# Keep the labels as floats so that str(label) later yields "0.0"/"1.0"/"2.0",
# matching the label strings shown in the recorded classification report.
df["label"] = df["label"].astype(float)

In [0]:
df
# Row counts per label, as noted by the author:
#   label 0: 823
#   label 1: 210 -> 840 (presumably the count after the 4x oversampling done
#            in the later cells -- TODO confirm)
#   label 2: 816

Unnamed: 0,label,message
0,1.0,"the title is ""Obama announces expanded sanctio..."
1,0.0,"The thing is, it would not be a war between tw..."
2,0.0,Time to dust off Grandpappys Bomb Shelter make...
3,2.0,**Article summary:** \n\n---\n\n\n&gt;* Super-...
4,2.0,This must be that mass extinction event /r/sci...
...,...,...
1844,0.0,"Wer Ukraine unterstÃÂ¼tzen will, kann ÃÂ¼ber..."
1845,0.0,Who are Anonymous and why are they fighting al...
1846,0.0,UkraineÃ¢ÂÂs air defense unit shoots down Ru...
1847,0.0,Witches for Ukraine!


In [0]:
import random 

# Seed the global RNG so the shuffle below (and the later shuffles that use
# the same module-level generator) give the same split on every run.
random.seed(42)

# One index per labeled row; derived from the frame rather than a hard-coded
# row count so the cell keeps working when the labeled data changes size.
sample_l = list(range(len(df)))
random.shuffle(sample_l)

In [0]:
# First 1500 shuffled indices form the training split, the remainder the test split.
train_l, test_l = sample_l[:1500], sample_l[1500:]

In [0]:
# Oversample label 1 in the training split: every label-1 index gets three
# extra copies (four occurrences in total). Iterate over a snapshot so the
# list being extended is not the one being walked.
temp_l = train_l.copy()
for i in temp_l:
    if df["label"][i] == 1:
        train_l.extend([i] * 3)

In [0]:
# Same oversampling for the test split: three extra copies of every label-1 index.
# NOTE(review): duplicating test rows changes the class balance the evaluation
# metrics are computed on -- confirm this is intended.
temp_l = test_l.copy()
for i in temp_l:
    if df["label"][i] == 1:
        test_l.extend([i] * 3)

In [0]:
# Re-shuffle both index lists so the duplicated label-1 entries appended at
# the end are mixed into the rest (train first, then test).
for index_list in (train_l, test_l):
    random.shuffle(index_list)

In [0]:
# Materialise the train/test frames with a single label-based selection each,
# instead of growing a DataFrame one row at a time (quadratic in the number of
# rows). Repeated indices in train_l/test_l yield repeated rows, in list order.
# Labels are converted with str() so Spark receives them as strings.
train = (
    df.loc[train_l, ["label", "message"]]
    .reset_index(drop=True)
    .assign(label=lambda frame: frame["label"].map(str))
)
test = (
    df.loc[test_l, ["label", "message"]]
    .reset_index(drop=True)
    .assign(label=lambda frame: frame["label"].map(str))
)

In [0]:
# Convert each pandas split into a Spark DataFrame by passing plain Python
# rows plus the column names.
columns, values = train.columns.tolist(), train.values.tolist()
spark_train = spark.createDataFrame(values, columns)

columns, values = test.columns.tolist(), test.values.tolist()
spark_test = spark.createDataFrame(values, columns)

In [0]:
from pyspark.sql.functions import col

# Show the number of rows per label, most frequent first, for the training
# split and then for the test split.
for split in (spark_train, spark_test):
    (
        split.groupBy("label")
        .count()
        .orderBy(col("count").desc())
        .show()
    )

In [0]:
# Annotator stages of the text-classification pipeline. Each stage reads the
# output column(s) of an earlier one:
#   message -> document -> sentences -> token -> normalized -> cleanTokens
#           -> lemma -> embeddings -> sentence_embeddings

# Wraps the raw "message" column into a Spark NLP document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("message")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Case-insensitive stop-word removal using the en_US locale.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
    .setLocale("en_US")
)

# Pretrained "lemma_antbnc" lemmatizer.
lemma = (
    LemmatizerModel.pretrained("lemma_antbnc")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Default pretrained word embeddings. `pretrained` is called on the class
# directly; the original instantiated a throwaway WordEmbeddingsModel first.
word_embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pool the word embeddings into one vector using the AVERAGE strategy.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)



In [0]:
# Trainable classifier: consumes the pooled sentence embeddings, learns from
# the "label" column, writes predictions to "class", trains for at most 20
# epochs and has output logs enabled.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("label")
    .setMaxEpochs(20)
    .setEnableOutputLogs(True)
)

In [0]:
# Full pipeline: preprocessing annotators, embeddings, then the classifier.
# The order matters because every stage reads columns produced before it.
pipeline_stages = [
    document_assembler,
    sentenceDetector,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemma,
    word_embeddings,
    embeddingsSentence,
    classifierdl,
]
PRU_pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# Fit all pipeline stages, including the classifier, on the training split.
PRU_pipelineModel = PRU_pipeline.fit(spark_train)

In [0]:
#0:ProUkraine 1:ProRussia 2: Cantsee
from sklearn.metrics import classification_report, accuracy_score
df = PRU_pipelineModel.transform(spark_test).select('label','message','class.result').toPandas()
df['result'] = df['result'].apply(lambda x:x[0])
print(classification_report(df.label,df.result))
print(accuracy_score(df.label,df.result))

In [0]:
# NOTE(review): this string looks like the saved text output of an earlier run
# of the evaluation cell (classification report followed by the accuracy);
# it is not regenerated when the notebook runs -- confirm it is still current.
'''  precision    recall  f1-score   support

         0.0       0.61      0.82      0.70       140
         1.0       0.87      0.63      0.73       196
         2.0       0.85      0.88      0.87       160

    accuracy                           0.77       496
   macro avg       0.78      0.78      0.77       496
weighted avg       0.79      0.77      0.77       496

0.7661290322580645'''

Train the final model on all labeled data (train and test splits combined)

In [0]:
# Merge the held-out split back into the training frame so the final model is
# fit on every labeled row. Both frames already contain the duplicated label-1
# rows from the oversampling cells. Re-running this cell appends `test` again.
train = pd.concat([train,test])

In [0]:
# Rebuild the Spark training DataFrame from the merged pandas frame.
columns, values = train.columns.tolist(), train.values.tolist()
spark_train = spark.createDataFrame(values, columns)

In [0]:
# Rows per label in the merged training data, most frequent first.
label_counts = spark_train.groupBy("label").count()
label_counts.orderBy(col("count").desc()).show()

In [0]:
# Same stage list as the evaluation pipeline, rebuilt for the final fit on all
# labeled data.
pipeline_stages = [
    document_assembler,
    sentenceDetector,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemma,
    word_embeddings,
    embeddingsSentence,
    classifierdl,
]
PRU_pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# Refit the pipeline on spark_train, which at this point holds the merged
# train + test rows.
PRU_pipelineModel = PRU_pipeline.fit(spark_train)

In [0]:
# Persist the fitted pipeline. Plain save() raises when the target path already
# exists, which made this cell fail on every re-run of the notebook; going
# through the writer with overwrite() replaces the previous model instead.
PRU_pipelineModel.write().overwrite().save('/mnt/lsde/group05/PRU_3class')

Transform all submissions and comments

In [0]:
# Import PipelineModel explicitly: it was previously only in scope as a side
# effect of the wildcard `from sparknlp.base import *`, which is not guaranteed
# to re-export it.
from pyspark.ml import PipelineModel

# Reload the persisted pipeline so the scoring cells below can run without
# retraining.
PRU_pipelineModel = PipelineModel.load('/mnt/lsde/group05/PRU_3class')

In [0]:
def handle_s(year_range = [2015, 2016, 2017], month_range = range(1, 13)):
    """Classify submission titles month by month and store the predictions.

    For every (year, month) pair this reads
    ``/mnt/lsde/group05/kwfilter_top3/<year>-<month>.parquet``, runs
    ``PRU_pipelineModel`` on the ``title`` column (aliased to ``message``,
    the pipeline's input column) and overwrites
    ``/mnt/lsde/group05/submission_position_3class/<year>-<month>.parquet``
    with the ``id``, ``class.result`` and ``created_utc`` columns.

    Args:
        year_range: iterable of years to process.
        month_range: iterable of month numbers; they are not zero-padded
            when building the file names.
    """
    root = "/mnt/lsde/group05/"
    for year in year_range:
        for month in month_range:
            period = f"{year}-{month}"
            source_path = f"{root}kwfilter_top3/{period}.parquet"
            target_path = f"{root}submission_position_3class/{period}.parquet"

            print(f"start handling {period}")
            submissions = spark.read.load(source_path, format="parquet")
            model_input = submissions.select(
                col("id"), col("title").alias("message"), col("created_utc")
            )
            scored = PRU_pipelineModel.transform(model_input)
            writer = scored.select("id", "class.result", "created_utc").write
            writer.mode("overwrite").format("parquet").save(target_path)
            print(f"finish handling {period}")

In [0]:
def handle_c(year_range = [2015, 2016, 2017], month_range = range(1, 13)):
    """Classify comment bodies month by month and store the predictions.

    For every (year, month) pair this reads
    ``/mnt/lsde/group05/filtered_comments/<year>-<month>.parquet``, runs
    ``PRU_pipelineModel`` on the ``body`` column (aliased to ``message``,
    the pipeline's input column) and overwrites
    ``/mnt/lsde/group05/comment_position_3class/<year>-<month>.parquet``
    with the ``parent_id``, ``id``, ``class.result`` and ``created_utc``
    columns.

    Args:
        year_range: iterable of years to process.
        month_range: iterable of month numbers; they are not zero-padded
            when building the file names.
    """
    root = "/mnt/lsde/group05/"
    for year in year_range:
        for month in month_range:
            period = f"{year}-{month}"
            source_path = f"{root}filtered_comments/{period}.parquet"
            target_path = f"{root}comment_position_3class/{period}.parquet"

            print(f"start handling {period}")
            comments = spark.read.load(source_path, format="parquet")
            model_input = comments.select(
                col("parent_id"),
                col("id"),
                col("body").alias("message"),
                col("created_utc"),
            )
            scored = PRU_pipelineModel.transform(model_input)
            writer = scored.select(
                "parent_id", "id", "class.result", "created_utc"
            ).write
            writer.mode("overwrite").format("parquet").save(target_path)
            print(f"finish handling {period}")

In [0]:
# Score the comments of a single month: March 2016.
handle_c(year_range=[2016], month_range=[3])

In [0]:
# Score the comments for February through December 2014.
handle_c(year_range=[2014], month_range=range(2, 13))

In [0]:
# Score the submissions for February through December 2014.
handle_s(year_range=[2014], month_range=range(2, 13))

In [0]:
# Score the comments for every month of 2015 through 2021.
handle_c(year_range=range(2015, 2022), month_range=range(1, 13))


In [0]:
# Score the submissions for every month of 2015 through 2021.
handle_s(year_range=range(2015, 2022), month_range=range(1, 13))

In [0]:
# Score the comments for January through August 2022.
handle_c(year_range=[2022], month_range=range(1, 9))


In [0]:
# Score the submissions for January through August 2022.
handle_s(year_range=[2022], month_range=range(1, 9))

In [0]:
from sklearn.metrics import classification_report, accuracy_score

# Score spark_train with the current model and print the report.
# NOTE(review): if spark_train is the data the model was fitted on, these are
# training-set metrics rather than held-out metrics -- confirm.
predictions = PRU_pipelineModel.transform(spark_train)
df = predictions.select('label','message','class.result').toPandas()
# "class.result" comes back as a list per row; keep its first entry.
df['result'] = [labels[0] for labels in df['result']]
print(classification_report(df.label,df.result))
print(accuracy_score(df.label,df.result))