In [1]:
import findspark
findspark.init()

In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.sql import SparkSession
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from models.bloom_filter import BloomFilter
from models.emb_logreg import EmbeddingsLogReg

In [3]:
spark = SparkSession.builder \
    .appName("WikimediaStreamProcessor") \
    .config("spark.driver.memory", "12G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1") \
    .getOrCreate()

    # .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    # .config("spark.kryoserializer.buffer.max", "2000M") \

24/11/17 21:26:07 WARN Utils: Your hostname, andrii-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/11/17 21:26:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/andrii/spark-3.5.3-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/andrii/.ivy2/cache
The jars for the packages stored in: /home/andrii/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-edc3fe1c-660c-46b2-9828-ec1d37e066a9;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.5.1 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in centra

In [4]:
train_df = spark.read.csv(
    "./data/train/train_dataset.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    escape="\"",
    quote="\"",
    sep=",",
)

                                                                                

In [5]:
train_df_wtext = train_df.withColumn(
    "text",
    F.concat(
        F.col("title"),
        F.when(F.col("comment").isNotNull(), F.col("comment")).otherwise(F.lit("NULL"))
    )
)

In [6]:
# embeddingLogReg = EmbeddingsLogReg(10, 0.3, 0)

In [7]:
train_df_wtext = train_df_wtext.withColumn("label", F.udf(lambda x: 1 if x else 0, T.IntegerType())(F.col("bot")))

In [8]:
total_rows = train_df_wtext.count()
train_split_index = int(total_rows * 0.7)

train_df_wtext = train_df_wtext.withColumn("row_index", F.monotonically_increasing_id())

train_df_ = train_df_wtext.filter(F.col("row_index") <= train_split_index).drop("row_index")
val_df = train_df_wtext.filter(F.col("row_index") > train_split_index).drop("row_index")

print("Training Dataset Count: " + str(train_df_.count()))
print("Val Dataset Count: " + str(val_df.count()))


                                                                                

Training Dataset Count: 3501
Val Dataset Count: 1499


In [9]:
# embeddingLogReg.fit(train_df_)

In [10]:
# predictions = embeddingLogReg.predict(val_df)

In [11]:
# val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
# val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

In [12]:
# accuracy_score(val_labels, val_preds), precision_score(val_labels, val_preds), recall_score(val_labels, val_preds), f1_score(val_labels, val_preds)

In [13]:
bloom_filter = BloomFilter(spark, capacity=100, false_positive_rate=0.1)

In [14]:
bloom_filter.fit(train_df_)

                                                                                

In [15]:
# val_df = val_df.repartition(1)  # Force single partition

In [16]:
predictions = bloom_filter.predict(val_df)

In [17]:
val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

accuracy_score(val_labels, val_preds), precision_score(val_labels, val_preds), recall_score(val_labels, val_preds), f1_score(val_labels, val_preds)

                                                                                

(0.44429619746497667,
 np.float64(0.014492753623188406),
 np.float64(0.004746835443037975),
 np.float64(0.007151370679380214))

In [17]:
bloom_filter.save("./data/filter_train_small")

In [18]:
filter2 = BloomFilterBasedModel.load(spark, "./data/filter_train_small")

In [19]:
triggered_users = predictions.filter(F.col("prediction") == 1).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [20]:
train_bots = train_df_.filter(F.col("bot")).select("user").distinct().rdd.map(lambda r: r[0]).collect()

                                                                                

In [21]:
val_bots = val_df.filter(F.col("bot")).select("user").distinct().rdd.map(lambda r: r[0]).collect()

In [22]:
set(train_bots) & set(val_bots)

{'AnomieBOT',
 'Cewbot',
 'Citation bot',
 'Community Tech bot',
 'DeltaQuadBot',
 'InternetArchiveBot',
 'JJMC89 bot III',
 'ListeriaBot',
 'RMCD bot',
 'SDZeroBot',
 'SineBot',
 'WoodwardBot'}

In [23]:
set(val_bots) & set(triggered_users)

{'AnomieBOT'}

In [24]:
set(train_bots) & set(triggered_users)

{'AnomieBOT'}

In [27]:
filter2.bloom_filter.lookup('Axeman89')

False

In [26]:
set(triggered_users) - set(train_bots)

{'102.211.145.1',
 '154.185.5.74',
 '180.189.104.101',
 '196.188.225.213',
 '2400:4052:11A2:4E00:E401:F6F8:7C3A:7513',
 '2A00:10:9910:4C01:40F6:9E0D:C07D:A148',
 '2A06:5906:3A10:5500:8DAE:169F:F49F:42B9',
 '83.6.194.74',
 '94.244.90.164',
 'Athel cb',
 'Axeman89',
 'DimensionalFusion',
 'Drchriswilliams',
 'Evelyn Harthbrooke',
 'EzekielT',
 'Fanminton',
 'Fm3dici97',
 'FrB.TG',
 'HangInThere1000',
 'Helrasincke',
 'Hg03u',
 'How15948',
 'Industrial Metal Brain',
 'Jeet Dev',
 'Jimmaths',
 'KopikoBlanca2014',
 'Maiō T.',
 'Makenzis',
 'Mjks28',
 'Niasoh',
 'Nicknack009',
 'PEditorS10',
 'Paok4ever',
 'Rxsxuis',
 'RxxingAddict',
 'Sarim Wani',
 'Sharontse121',
 'Tom.Reding',
 'Turniner',
 'UnpetitproleX',
 'さえぼー'}

In [29]:
predictions = filter2.predict(val_df)

In [30]:
val_labels = predictions.select("label").rdd.flatMap(lambda x: x).collect()
val_preds = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()

accuracy_score(val_labels, val_preds), precision_score(val_labels, val_preds), recall_score(val_labels, val_preds), f1_score(val_labels, val_preds)

24/11/17 19:42:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/17 19:42:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/17 19:42:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/17 19:42:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/17 19:42:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/17 19:42:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/17 1

(0.9553035356904603,
 np.float64(0.9088277858176556),
 np.float64(0.9936708860759493),
 np.float64(0.9493575207860923))

In [None]:
# embeddingLogReg.save("./data/logreg_small")

                                                                                

In [17]:
lr2 = EmbeddingsLogReg(0, 0, 0)

In [18]:
lr2 = EmbeddingsLogReg.load("./data/logreg_small")

In [19]:
predictions2 = lr2.predict(val_df)

In [20]:
val_labels_2 = predictions2.select("label").rdd.flatMap(lambda x: x).collect()
val_preds_2 = predictions2.select("prediction").rdd.flatMap(lambda x: x).collect()

                                                                                

In [21]:
accuracy_score(val_labels_2, val_preds_2), precision_score(val_labels_2, val_preds_2), recall_score(val_labels_2, val_preds_2), f1_score(val_labels_2, val_preds_2)

(0.9172781854569713,
 np.float64(0.9635036496350365),
 np.float64(0.8354430379746836),
 np.float64(0.8949152542372881))

In [None]:
spark.stop()