In [1]:
import findspark
findspark.init()

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.sql import SparkSession
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from models.bloom_filter import BloomFilterBasedModel
from models.emb_logreg import EmbeddingsLogReg

In [3]:
spark = SparkSession.builder \
    .appName("WikimediaStreamProcessor") \
    .config("spark.driver.memory", "12G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1") \
    .getOrCreate()

:: loading settings :: url = jar:file:/Users/markson/Desktop/UCU/UCU_6K1S_MiningMassiveDatasets/venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/markson/.ivy2/cache
The jars for the packages stored in: /Users/markson/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-60cb92c3-92ad-4470-8f04-14e195c2ad9e;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.5.1 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in ce

## Data preparation

In [4]:
train_df = spark.read.csv(
    "./data/train/train_dataset_big.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    escape="\"",
    quote="\"",
    sep=",",
)

In [5]:
train_df_wtext = train_df.withColumn(
    "text",
    F.concat(
        F.col("title"),
        F.when(F.col("comment").isNotNull(), F.col("comment")).otherwise(F.lit("NULL"))
    )
)

In [6]:
train_df_wtext = train_df_wtext.withColumn("label", F.udf(lambda x: 1 if x else 0, T.IntegerType())(F.col("bot")))

## Pure Logistic Regression with Embeddings training

In [7]:
total_rows = train_df_wtext.count()
train_split_index = int(total_rows * 0.7)

train_df_wtext = train_df_wtext.withColumn("row_index", F.monotonically_increasing_id())

train_df_ = train_df_wtext.filter(F.col("row_index") <= train_split_index).drop("row_index")
val_df = train_df_wtext.filter(F.col("row_index") > train_split_index).drop("row_index")

print("Training Dataset Count: " + str(train_df_.count()))
print("Val Dataset Count: " + str(val_df.count()))


Training Dataset Count: 28001
Val Dataset Count: 11999


In [8]:
embeddingLogReg = EmbeddingsLogReg(maxIter=10, regParam=0.3, elasticNetParam=0)

word2vec_gigaword_300 download started this may take some time.
Approximate size to download 312.3 MB
[ | ]

24/11/19 20:35:22 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
24/11/19 20:35:22 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


word2vec_gigaword_300 download started this may take some time.
Approximate size to download 312.3 MB
Download done! Loading the resource.
[ / ]

                                                                                

[OK!]


In [9]:
embeddingLogReg.fit(train_df_)

24/11/19 20:35:42 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [10]:
predictions = embeddingLogReg.predict(val_df)

In [11]:
val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

                                                                                

In [12]:
accuracy_score(val_labels, val_preds), precision_score(val_labels, val_preds), recall_score(val_labels, val_preds), f1_score(val_labels, val_preds)

(0.9160763396949746,
 np.float64(0.7622478386167147),
 np.float64(0.38584974471188915),
 np.float64(0.5123486682808717))

In [13]:
embeddingLogReg.save("./data/logreg_big")

## LogReg Parameter search

In [14]:
class RecallWithPrecisionConstraint:
    def __init__(self, spark, precision_boundary=0.95):
        self.spark = spark
        self.precision_boundary = precision_boundary

    def evaluate(self, predictions):
        metrics = predictions.withColumn(
            "tp", F.expr("CASE WHEN bot AND prediction = 1 THEN 1 ELSE 0 END")
        ).withColumn(
            "fp", F.expr("CASE WHEN NOT bot AND prediction = 1 THEN 1 ELSE 0 END")
        ).withColumn(
            "fn", F.expr("CASE WHEN bot AND prediction = 0 THEN 1 ELSE 0 END")
        )

        metrics = metrics.groupBy().agg(
            F.sum("tp").alias("true_positives"),
            F.sum("fp").alias("false_positives"),
            F.sum("fn").alias("false_negatives"),
            F.count("*").alias("total_observations")
        ).withColumn(
            "precision", F.expr("CASE WHEN false_positives = 0 AND true_positives = 0 THEN 1 ELSE true_positives / (true_positives + false_positives) END")
        ).withColumn(
            "recall", F.expr("CASE WHEN false_negatives = 0 AND true_positives = 0 THEN 1 ELSE true_positives / (true_positives + false_negatives) END")
        )

        precision = metrics.select("precision").first()[0]
        recall = metrics.select("recall").first()[0]

        if precision >= self.precision_boundary:
            return 1 + recall # If precision is self.precision_boundary+, it's better than maximizing precision
        else:
            return precision

In [None]:
param_grid = [
    {"maxIter": max_iter, "regParam": reg_param, "elasticNetParam": elastic_net_param}
    for max_iter in [10, 20, 50]
    for reg_param in [0.1, 0.3, 0.5]
    for elastic_net_param in [0.0, 0.5, 1.0]
]

best_model = None
best_score = float("-inf")
evaluator = RecallWithPrecisionConstraint(spark)

for params in param_grid:
    print(f"Training with parameters: {params}")
    
    model = EmbeddingsLogReg(
        maxIter=params["maxIter"], 
        regParam=params["regParam"], 
        elasticNetParam=params["elasticNetParam"]
    )
    
    model.fit(train_df_)
    
    predictions = model.predict(val_df)
    
    score = evaluator.evaluate(predictions)
    print(f"Score for parameters {params}: {score}")
    
    if score > best_score:
        best_score = score
        best_model = model
        best_params = params

print(f"Best model parameters: {best_params}")
predictions = best_model.predict(val_df)

val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

precision = precision_score(val_labels, val_preds)
recall = recall_score(val_labels, val_preds)
f1 = f1_score(val_labels, val_preds)
accuracy = accuracy_score(val_labels, val_preds)

print(f"Best Model Metrics -> Precision: {precision}, Recall: {recall}, F1-Score: {f1}, Accuracy: {accuracy}")

Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.0}: 0.8188118811881188
Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 0.5}: 0.4783783783783784
Training with parameters: {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.1, 'elasticNetParam': 1.0}: 0.4339622641509434
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.0}: 0.7622478386167147
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 0.5}: 0.0
Training with parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 1.0}: 1.0
Training with parameters: {'maxIter': 10, 'regParam': 0.5, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.5, 'elasticNetParam': 0.0}: 0.6466346153846154
Training with parameters: {'maxIter': 10, 'regParam': 0.5, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.5, 'elasticNetParam': 0.5}: 1.0
Training with parameters: {'maxIter': 10, 'regParam': 0.5, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 10, 'regParam': 0.5, 'elasticNetParam': 1.0}: 1.0
Training with parameters: {'maxIter': 20, 'regParam': 0.1, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.1, 'elasticNetParam': 0.0}: 0.817279046673287
Training with parameters: {'maxIter': 20, 'regParam': 0.1, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.1, 'elasticNetParam': 0.5}: 0.49441340782122906
Training with parameters: {'maxIter': 20, 'regParam': 0.1, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.1, 'elasticNetParam': 1.0}: 0.4753521126760563
Training with parameters: {'maxIter': 20, 'regParam': 0.3, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.3, 'elasticNetParam': 0.0}: 0.7633477633477633
Training with parameters: {'maxIter': 20, 'regParam': 0.3, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.3, 'elasticNetParam': 0.5}: 0.0
Training with parameters: {'maxIter': 20, 'regParam': 0.3, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.3, 'elasticNetParam': 1.0}: 1.0
Training with parameters: {'maxIter': 20, 'regParam': 0.5, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.5, 'elasticNetParam': 0.0}: 0.6391509433962265
Training with parameters: {'maxIter': 20, 'regParam': 0.5, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.5, 'elasticNetParam': 0.5}: 1.0
Training with parameters: {'maxIter': 20, 'regParam': 0.5, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 20, 'regParam': 0.5, 'elasticNetParam': 1.0}: 1.0
Training with parameters: {'maxIter': 50, 'regParam': 0.1, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.1, 'elasticNetParam': 0.0}: 0.817279046673287
Training with parameters: {'maxIter': 50, 'regParam': 0.1, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.1, 'elasticNetParam': 0.5}: 0.49441340782122906
Training with parameters: {'maxIter': 50, 'regParam': 0.1, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.1, 'elasticNetParam': 1.0}: 0.4753521126760563
Training with parameters: {'maxIter': 50, 'regParam': 0.3, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.3, 'elasticNetParam': 0.0}: 0.7633477633477633
Training with parameters: {'maxIter': 50, 'regParam': 0.3, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.3, 'elasticNetParam': 0.5}: 0.0
Training with parameters: {'maxIter': 50, 'regParam': 0.3, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.3, 'elasticNetParam': 1.0}: 1.0
Training with parameters: {'maxIter': 50, 'regParam': 0.5, 'elasticNetParam': 0.0}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.5, 'elasticNetParam': 0.0}: 0.6391509433962265
Training with parameters: {'maxIter': 50, 'regParam': 0.5, 'elasticNetParam': 0.5}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.5, 'elasticNetParam': 0.5}: 1.0
Training with parameters: {'maxIter': 50, 'regParam': 0.5, 'elasticNetParam': 1.0}


                                                                                

Score for parameters {'maxIter': 50, 'regParam': 0.5, 'elasticNetParam': 1.0}: 1.0
Best model parameters: {'maxIter': 10, 'regParam': 0.3, 'elasticNetParam': 1.0}


[Stage 653:>                                                        (0 + 1) / 1]

Best Model Metrics -> Precision: 0.0, Recall: 0.0, F1-Score: 0.0, Accuracy: 0.8857404783731978


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
print(f"Best model score: {best_score}")

Best model score: 1.0


In [16]:
best_model.save("./data/logreg_big_best")

In [None]:
# embeddingLogReg = EmbeddingsLogReg(maxIter=10, regParam=0.3, elasticNetParam=0)

In [None]:
# param_grid = ParamGridBuilder() \
#     .addGrid(embeddingLogReg.lr.maxIter, [10, 20, 50]) \
#     .addGrid(embeddingLogReg.lr.regParam, [0.1, 0.3, 0.5]) \
#     .addGrid(embeddingLogReg.lr.elasticNetParam, [0.0, 0.5, 1.0]) \
#     .build()

In [None]:
# train_val_split = TrainValidationSplit(
#     estimator=embeddingLogReg,
#     estimatorParamMaps=param_grid,
#     evaluator=evaluator,
#     trainRatio=0.8
# )

In [None]:
# best_model = train_val_split.fit(train_df_wtext)

In [17]:
# best_model = train_val_split.bestModel

# print(f"Best model parameters: {best_model}")

# predictions = best_model.transform(val_df)
# val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
# val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

# precision = precision_score(val_labels, val_preds)
# recall = recall_score(val_labels, val_preds)
# f1 = f1_score(val_labels, val_preds)
# accuracy = accuracy_score(val_labels, val_preds)
# print(f"Precision: {precision}, Recall: {recall}, F1-Score: {f1}, Accuracy: {accuracy}")

## Bloom filter sample training

In [18]:
bloom_filter = BloomFilterBasedModel(spark, fpr=0.1)

In [19]:
bloom_filter.fit(train_df_)

In [20]:
predictions = bloom_filter.predict(val_df)

In [21]:
val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

accuracy_score(val_labels, val_preds), precision_score(val_labels, val_preds), recall_score(val_labels, val_preds), f1_score(val_labels, val_preds)

(0.9075756313026085,
 np.float64(0.5545378850957535),
 np.float64(0.9715536105032823),
 np.float64(0.7060694407633183))

In [22]:
bloom_filter.save("./data/filter_train_small")

In [23]:
filter2 = BloomFilterBasedModel.load(spark, "./data/filter_train_small")

In [24]:
triggered_users = predictions.filter(F.col("prediction") == 1).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [25]:
train_bots = train_df_.filter(F.col("bot")).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [26]:
val_bots = val_df.filter(F.col("bot")).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [27]:
set(train_bots) & set(val_bots)

{'AnomieBOT',
 'BaranBOT',
 'Bot1058',
 'Cewbot',
 'ChristieBot',
 'Citation bot',
 'ClueBot III',
 'Community Tech bot',
 'Cyberbot I',
 'DPL bot',
 'DYKHousekeepingBot',
 'DYKToolsBot',
 'DatBot',
 'DeltaQuadBot',
 'EarwigBot',
 'Filedelinkerbot',
 'FireflyBot',
 'GalliumBot',
 'HBC AIV helperbot14',
 'ImageTaggingBot',
 'InceptionBot',
 'InternetArchiveBot',
 'JJMC89 bot',
 'JJMC89 bot III',
 'Legobot',
 'ListeriaBot',
 'Lowercase sigmabot III',
 'MajavahBot',
 'MilHistBot',
 'MusikBot',
 'MusikBot II',
 'Mz7 (bot)',
 'Qwerfjkl (bot)',
 'RMCD bot',
 'Reports bot',
 'RussBot',
 'SDZeroBot',
 'SineBot',
 'TNTBot',
 'WugBot',
 'Yapperbot'}

In [28]:
set(val_bots) & set(triggered_users)

{'AnomieBOT',
 'BaranBOT',
 'Bot1058',
 'Cewbot',
 'ChristieBot',
 'Citation bot',
 'ClueBot III',
 'Community Tech bot',
 'Cyberbot I',
 'DPL bot',
 'DYKHousekeepingBot',
 'DYKToolsBot',
 'DatBot',
 'DeltaQuadBot',
 'EarwigBot',
 'Filedelinkerbot',
 'FireflyBot',
 'GalliumBot',
 'HBC AIV helperbot14',
 'Hazard-Bot',
 'ImageTaggingBot',
 'InceptionBot',
 'InternetArchiveBot',
 'JJMC89 bot',
 'JJMC89 bot III',
 'Legobot',
 'ListeriaBot',
 'Lowercase sigmabot III',
 'MajavahBot',
 'MilHistBot',
 'MusikBot',
 'MusikBot II',
 'Mz7 (bot)',
 'Qwerfjkl (bot)',
 'RMCD bot',
 'Reports bot',
 'RussBot',
 'SDZeroBot',
 'SineBot',
 'SporkBot',
 'TNTBot',
 'WugBot',
 'Yapperbot'}

In [29]:
set(train_bots) & set(triggered_users)

{'AnomieBOT',
 'BaranBOT',
 'Bot1058',
 'Cewbot',
 'ChristieBot',
 'Citation bot',
 'ClueBot III',
 'Community Tech bot',
 'Cyberbot I',
 'DPL bot',
 'DYKHousekeepingBot',
 'DYKToolsBot',
 'DatBot',
 'DeltaQuadBot',
 'EarwigBot',
 'Filedelinkerbot',
 'FireflyBot',
 'GalliumBot',
 'HBC AIV helperbot14',
 'ImageTaggingBot',
 'InceptionBot',
 'InternetArchiveBot',
 'JJMC89 bot',
 'JJMC89 bot III',
 'Legobot',
 'ListeriaBot',
 'Lowercase sigmabot III',
 'MDanielsBot',
 'MajavahBot',
 'MilHistBot',
 'MusikBot',
 'MusikBot II',
 'Mz7 (bot)',
 'Qwerfjkl (bot)',
 'RMCD bot',
 'Reports bot',
 'RussBot',
 'SDZeroBot',
 'SineBot',
 'TNTBot',
 'WugBot',
 'Yapperbot'}

In [30]:
filter2.bloom_filter.lookup('Georgymm')

True

In [31]:
set(triggered_users) - set(train_bots)

{'100.40.177.247',
 '103.111.34.186',
 '103.131.214.227',
 '103.155.223.62',
 '103.185.24.153',
 '103.76.183.72',
 '104.160.102.106',
 '109.193.38.37',
 '1112Gumdaddy',
 '112.134.185.0',
 '115.66.95.78',
 '117.200.185.208',
 '117.250.240.128',
 '134.228.208.207',
 '139.47.115.113',
 '144.48.128.182',
 '146.199.122.154',
 '162.251.173.193',
 '171.248.216.35',
 '173.73.64.102',
 '186.29.35.253',
 '189.28.69.105',
 '194.230.160.108',
 '198.239.119.4',
 '2001:4C4E:10DE:5C00:4196:F8A2:680:1225',
 '2001:56A:F919:7500:C0DC:4F47:59AC:4F94',
 '2001:5B0:ACE1:F180:FC0C:34D8:2B81:9FD7',
 '2003:C0:5744:26B6:18AC:BF25:9073:98DD',
 '2003:E6:1F02:4A00:D911:15D8:24F1:FD9E',
 '202.179.76.97',
 '212.159.141.14',
 '212.159.19.92',
 '212.171.67.228',
 '223.25.63.192',
 '24.154.56.98',
 '2400:9800:180:9B21:682F:30C0:AFC0:3865',
 '2402:7500:4F1:77E2:29E1:7B52:9032:8255',
 '2402:800:6236:B30:C8C7:94A4:F9E5:C281',
 '2405:3800:8A6:69A2:0:0:0:1',
 '2405:6E00:238:370C:C5C5:DDD3:ED17:161A',
 '2406:3003:2077:17A2:C

In [32]:
untriggered_users = predictions.filter((F.col("prediction") == 0) & (F.col("label") == 0)).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [33]:
len(set(untriggered_users)), len(set(triggered_users))

(3418, 426)

In [34]:
predictions = filter2.predict(val_df)

In [35]:
val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

accuracy_score(val_labels, val_preds), precision_score(val_labels, val_preds), recall_score(val_labels, val_preds), f1_score(val_labels, val_preds)

(0.9075756313026085,
 np.float64(0.5545378850957535),
 np.float64(0.9715536105032823),
 np.float64(0.7060694407633183))

In [36]:
spark.stop()