In [1]:
import findspark
findspark.init()

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.sql import SparkSession
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from models.bloom_filter import BloomFilterBasedModel
from models.emb_logreg import EmbeddingsLogReg

In [3]:
spark = SparkSession.builder \
    .appName("WikimediaStreamProcessor") \
    .config("spark.driver.memory", "12G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1") \
    .getOrCreate()

:: loading settings :: url = jar:file:/home/andrii/spark-3.5.3-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/andrii/.ivy2/cache
The jars for the packages stored in: /home/andrii/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7385079c-4505-4a8b-ba9c-9d899ca49a51;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.5.1 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in centra

## Data preparation

In [35]:
train_df = spark.read.csv(
    "./data/train/train_dataset_big.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    escape="\"",
    quote="\"",
    sep=",",
)

In [36]:
train_df_wtext = train_df.withColumn(
    "text",
    F.concat(
        F.col("title"),
        F.when(F.col("comment").isNotNull(), F.col("comment")).otherwise(F.lit("NULL"))
    )
)

In [37]:
train_df_wtext = train_df_wtext.withColumn("label", F.udf(lambda x: 1 if x else 0, T.IntegerType())(F.col("bot")))

## Pure Logistic Regression with Embeddings training

In [40]:
total_rows = train_df_wtext.count()
train_split_index = int(total_rows * 0.9)

train_df_wtext = train_df_wtext.withColumn("row_index", F.monotonically_increasing_id())

train_df_ = train_df_wtext.filter(F.col("row_index") <= train_split_index).drop("row_index")
val_df = train_df_wtext.filter(F.col("row_index") > train_split_index).drop("row_index")

print("Training Dataset Count: " + str(train_df_.count()))
print("Val Dataset Count: " + str(val_df.count()))


Training Dataset Count: 36001
Val Dataset Count: 3999


In [26]:
embeddingLogReg = EmbeddingsLogReg(maxIter=10, regParam=0.3, elasticNetParam=0, weightType="balanced")

word2vec_gigaword_300 download started this may take some time.
Approximate size to download 312.3 MB
[OK!]


In [27]:
embeddingLogReg.fit(train_df_)

                                                                                

In [28]:
predictions = embeddingLogReg.predict(val_df)

In [29]:
val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

                                                                                

In [30]:
accuracy_score(val_labels, val_preds), precision_score(val_labels, val_preds), recall_score(val_labels, val_preds), f1_score(val_labels, val_preds)

(0.9446297531687792,
 np.float64(0.9448946515397083),
 np.float64(0.9224683544303798),
 np.float64(0.933546837469976))

In [31]:
embeddingLogReg.save("./data/logreg_tiny")

ValueError: Target folder should be empty or nonexistent

## LogReg Parameter search

In [44]:
class F1WithPrecisionConstraint:
    def __init__(self, spark, precision_boundary=0.95):
        self.spark = spark
        self.precision_boundary = precision_boundary

    def evaluate(self, predictions):
        metrics = predictions.withColumn(
            "tp", F.expr("CASE WHEN bot AND prediction = 1 THEN 1 ELSE 0 END")
        ).withColumn(
            "fp", F.expr("CASE WHEN NOT bot AND prediction = 1 THEN 1 ELSE 0 END")
        ).withColumn(
            "fn", F.expr("CASE WHEN bot AND prediction = 0 THEN 1 ELSE 0 END")
        )

        metrics = metrics.groupBy().agg(
            F.sum("tp").alias("true_positives"),
            F.sum("fp").alias("false_positives"),
            F.sum("fn").alias("false_negatives"),
            F.count("*").alias("total_observations")
        ).withColumn(
            "precision", F.expr("CASE WHEN false_positives = 0 AND true_positives = 0 THEN 0 ELSE true_positives / (true_positives + false_positives) END")
        ).withColumn(
            "recall", F.expr("CASE WHEN false_negatives = 0 AND true_positives = 0 THEN 0 ELSE true_positives / (true_positives + false_negatives) END")
        )

        precision = metrics.select("precision").first()[0]
        recall = metrics.select("recall").first()[0]
        if precision == 0 and recall == 0:
            f1 = 0
        else:
            f1 = 2*precision*recall/(precision + recall)

        print(f"Precision = {precision:.3f}, Recall = {recall:.3f}")

        if precision >= self.precision_boundary:
            return 0.2 + f1 # If precision is self.precision_boundary+, it's better to start maximizing recall too
        else:
            return f1

In [45]:
param_grid = [
    {"maxIter": max_iter, "regParam": reg_param, "elasticNetParam": elastic_net_param, "weightType": weight_type}
    for max_iter in [5, 10]
    for reg_param in [0.1, 0.3]
    for elastic_net_param in [0.0, 0.3]
    for weight_type in [None, "balanced", "sqrt"]
]

best_model = None
best_score = float("-inf")
evaluator = F1WithPrecisionConstraint(spark, precision_boundary=0.8)

for params in param_grid:
    print(f"Training with parameters: {params}")
    
    model = EmbeddingsLogReg(
        maxIter=params["maxIter"], 
        regParam=params["regParam"], 
        elasticNetParam=params["elasticNetParam"],
        weightType=params["weightType"],
    )
    
    model.fit(train_df_)
    
    predictions = model.predict(val_df)
    
    score = evaluator.evaluate(predictions)
    print(f"Score for parameters {params}: {score}")
    
    if score > best_score:
        best_score = score
        best_model = model
        best_params = params

print(f"Best model parameters: {best_params}")
best_model = EmbeddingsLogReg(
    maxIter=best_params["maxIter"], 
    regParam=best_params["regParam"], 
    elasticNetParam=best_params["elasticNetParam"],
    weightType=best_params["weightType"],
)
best_model.fit(train_df_)
predictions = best_model.predict(val_df)

val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

precision = precision_score(val_labels, val_preds)
recall = recall_score(val_labels, val_preds)
f1 = f1_score(val_labels, val_preds)
accuracy = accuracy_score(val_labels, val_preds)

print(f"Best Model Metrics -> Precision: {precision}, Recall: {recall}, F1-Score: {f1}, Accuracy: {accuracy}")

Training with parameters: {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': None}


                                                                                

Precision = 0.775, Recall = 0.493
Score for parameters {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': None}: 0.6024636058230683
Training with parameters: {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': 'balanced'}


                                                                                

Precision = 0.730, Recall = 0.650
Score for parameters {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': 'balanced'}: 0.687984496124031
Training with parameters: {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': 'sqrt'}


                                                                                

Precision = 0.763, Recall = 0.606
Score for parameters {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': 'sqrt'}: 0.6755102040816326
Training with parameters: {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': None}


                                                                                

Precision = 0.632, Recall = 0.158
Score for parameters {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': None}: 0.25219941348973607
Training with parameters: {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': 'balanced'}


                                                                                

Precision = 0.622, Recall = 0.505
Score for parameters {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': 'balanced'}: 0.5575757575757576
Training with parameters: {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': 'sqrt'}


                                                                                

Precision = 0.676, Recall = 0.390
Score for parameters {'maxIter': 5, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': 'sqrt'}: 0.4947735191637631
Training with parameters: {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': None}


                                                                                

Precision = 0.783, Recall = 0.397
Score for parameters {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': None}: 0.5273390036452005
Training with parameters: {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': 'balanced'}


                                                                                

Precision = 0.682, Recall = 0.562
Score for parameters {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': 'balanced'}: 0.6164658634538153
Training with parameters: {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': 'sqrt'}


                                                                                

Precision = 0.721, Recall = 0.498
Score for parameters {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': 'sqrt'}: 0.5893824485373781
Training with parameters: {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': None}


                                                                                

Precision = 0.655, Recall = 0.035
Score for parameters {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': None}: 0.06608695652173914
Training with parameters: {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': 'balanced'}


                                                                                

Precision = 0.529, Recall = 0.549
Score for parameters {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': 'balanced'}: 0.5390835579514826
Training with parameters: {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': 'sqrt'}


                                                                                

Precision = 0.706, Recall = 0.141
Score for parameters {'maxIter': 5, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': 'sqrt'}: 0.2351145038167939
Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': None}


                                                                                

Precision = 0.837, Recall = 0.489
Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': None}: 0.8173410404624277
Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': 'balanced'}


                                                                                

Precision = 0.741, Recall = 0.648
Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': 'balanced'}: 0.69140625
Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': 'sqrt'}


                                                                                

Precision = 0.790, Recall = 0.538
Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': 'sqrt'}: 0.6405228758169934
Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': None}


                                                                                

Precision = 0.686, Recall = 0.148
Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': None}: 0.2439759036144578
Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': 'balanced'}


                                                                                

Precision = 0.600, Recall = 0.515
Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': 'balanced'}: 0.554240631163708
Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': 'sqrt'}


                                                                                

Precision = 0.717, Recall = 0.372
Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.3, 'weightType': 'sqrt'}: 0.48974668275030153
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': None}


                                                                                

Precision = 0.698, Recall = 0.178
Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': None}: 0.2832116788321168
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': 'balanced'}


                                                                                

Precision = 0.691, Recall = 0.562
Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': 'balanced'}: 0.6202020202020202
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': 'sqrt'}


                                                                                

Precision = 0.783, Recall = 0.496
Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.0, 'weightType': 'sqrt'}: 0.6076233183856502
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': None}


                                                                                

Precision = 0.625, Recall = 0.027
Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': None}: 0.052631578947368425
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': 'balanced'}


                                                                                

Precision = 0.357, Recall = 0.553
Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': 'balanced'}: 0.4342199856218548
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': 'sqrt'}


                                                                                

Precision = 0.667, Recall = 0.147
Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.3, 'weightType': 'sqrt'}: 0.24024024024024024
Best model parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0, 'weightType': None}


                                                                                

Best Model Metrics -> Precision: 0.8369905956112853, Recall: 0.489010989010989, F1-Score: 0.6173410404624278, Accuracy: 0.9172293073268317


In [46]:
best_model.save("./data/logreg_big_best")

## Bloom filter sample training

In [47]:
bloom_filter = BloomFilterBasedModel(spark, fpr=0.1)

In [48]:
bloom_filter.fit(train_df_)

                                                                                

In [49]:
predictions = bloom_filter.predict(val_df)

In [51]:
val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

print("Bloom filter metrics on validation:")
print(f"\t- Accuracy: {accuracy_score(val_labels, val_preds):.5f}")
print(f"\t- Precision: {precision_score(val_labels, val_preds):.5f}")
print(f"\t- Recall: {recall_score(val_labels, val_preds):.5f}")
print(f"\t- F1-score: {f1_score(val_labels, val_preds):.5f}")

Bloom filter metrics on validation:
	- Accuracy: 0.92673
	- Precision: 0.65408
	- Recall: 0.98352
	- F1-score: 0.78566


In [52]:
bloom_filter.save("./data/filter")

In [53]:
filter2 = BloomFilterBasedModel.load(spark, "./data/filter")

In [54]:
triggered_users = predictions.filter(F.col("prediction") == 1).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [55]:
train_bots = train_df_.filter(F.col("bot")).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [56]:
val_bots = val_df.filter(F.col("bot")).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [57]:
predictions = filter2.predict(val_df)

In [58]:
val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

accuracy_score(val_labels, val_preds), precision_score(val_labels, val_preds), recall_score(val_labels, val_preds), f1_score(val_labels, val_preds)

(0.9267316829207302,
 np.float64(0.6540803897685749),
 np.float64(0.9835164835164835),
 np.float64(0.7856620336503292))

## Finally

In [59]:
spark.stop()