in \spark\notebooks> python3 local_proxy.py jinnytty loltyler1

In [1]:
import threading

# Helper thread to avoid the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    def __init__(self, ssc):
        super().__init__()
        self.ssc = ssc
    def run(self):
        self.ssc.start()
        self.ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [2]:
sc

In [3]:
spark

In [4]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.ml.feature import Tokenizer, StopWordsRemover, StringIndexer, HashingTF, IDF, Word2Vec
from pyspark.ml.classification import LogisticRegression, NaiveBayes, LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
globals()['models_loaded'] = False
globals()['my_model'] = None

# Toy predict function. Normally you'd use your loaded globals()['my_model'] here
def predict(df):
    print("def predict(df)")
    # Preprocessing
    df.show()
    df = StringIndexer(inputCol='channel', outputCol='label', handleInvalid='keep').fit(df).transform(df)
    df = df.drop(*("username","channel"))
    df = df.withColumn("words1", f.regexp_replace(f.col("message"), "[\$#,<>+@=?!]", ""))
    df = df.withColumn("words2", f.regexp_replace(f.col("words1"), "  +", " "))
    df = df.dropna()
    tokenizer = Tokenizer(inputCol="words2", outputCol="words3")
    df = tokenizer.transform(df)
    remover = StopWordsRemover(inputCol="words3", outputCol="words")
    df = remover.transform(df)
    df = df.drop(*("words1", "words2", "words3"))
    word2Vec = Word2Vec(vectorSize=10, minCount=0, inputCol="words", outputCol="w2v")
    w2vmodel = word2Vec.fit(df)
    df = w2vmodel.transform(df)
    # Predict
    df_result = globals()['my_model'].transform(df)
    df_result.show()
    return(df_result)

predict_udf = udf(predict, StringType())
print("after predict_udf")
def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        print("Loading the model!")
        # load in your models here
        globals()['my_model'] = LogisticRegressionModel.load("a3.model")
        globals()['models_loaded'] = True
    #else:
        print("Model's already loaded")
    
    # Convert to data frame
    df = spark.read.json(rdd)
    df.show(n=3)
    
    # Utilize our predict function
    print("before df_withpreds")
    #df_withpreds = df.withColumn("pred", predict_udf(
    #    struct([df[x] for x in df.columns])
    #))
    # Utilize our predict function – no UDF here
    predictions = predict(df)
    predictions.show()
    print("after df_withpreds")
    #df_withpreds.show()
    
    # Normally, you wouldn't use a UDF (User Defined Function) Python function to predict as we did here (you can)
    # but an MLlib model you've built and saved with Spark
    # In this case, you need to prevent loading your model in every call to "process" as follows:
    
    print("This is where load models was")
        
    # And then predict using the loaded model: 
    #df_result = globals()['my_model'].transform(df)
    #df_result.show()

after predict_udf


In [6]:
ssc = StreamingContext(sc, 10)

In [7]:
lines = ssc.socketTextStream("localhost", 8080)
lines.foreachRDD(process)

In [8]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

Loading the model!
Model's already loaded
+----------+--------------------+-------------------+---------+
|   channel|            datetime|            message| username|
+----------+--------------------+-------------------+---------+
|#loltyler1|2022-05-26T13:41:...|           Gigachad|  artzen_|
|#loltyler1|2022-05-26T13:41:...|BigBrother ImTyping|smile_vwv|
|#loltyler1|2022-05-26T13:41:...|     trynd ult KEKW|  foker01|
+----------+--------------------+-------------------+---------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+------------+
|   channel|            datetime|             message|    username|
+----------+--------------------+--------------------+------------+
|#loltyler1|2022-05-26T13:41:...|            Gigachad|     artzen_|
|#loltyler1|2022-05-26T13:41:...| BigBrother ImTyping|   smile_vwv|
|#loltyler1|2022-05-26T13:41:...|      trynd ult KEKW|     foker01|
| #jinnytty|2022-05-26T13:41:...|        

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|            datetime|             message|label|               words|                 w2v|       rawPrediction|         probability|prediction|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|2022-05-26T13:41:...|BigBrother SHOW H...|  0.0|[bigbrother, show...|[0.02014559414237...|[7.5229353736028,...|[0.57211876872484...|       0.0|
|2022-05-26T13:41:...|               !rank|  0.0|              [rank]|[-0.0237755235284...|[7.70525685201694...|[0.64675717411767...|       0.0|
|2022-05-26T13:41:...|@Pr0b1ematic, ACC...|  0.0|[pr0b1ematic, acc...|[-0.0079744018148...|[7.58170137324454...|[0.58815750357845...|       0.0|
|2022-05-26T13:41:...|          TRYND KEKW|  0.0|       [trynd, kekw]|[-0.0049563255161...|[7.44988334426679...|[0.52383887644950.

+----------+--------------------+--------------------+---------------+
|   channel|            datetime|             message|       username|
+----------+--------------------+--------------------+---------------+
|#loltyler1|2022-05-26T13:42:...|PogChamp PogChamp...|  onetruelurker|
|#loltyler1|2022-05-26T13:42:...|hoping for a loud...|saltmineforeman|
|#loltyler1|2022-05-26T13:42:...|          PauseChamp|   kangkehsynth|
+----------+--------------------+--------------------+---------------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+----------------+
|   channel|            datetime|             message|        username|
+----------+--------------------+--------------------+----------------+
|#loltyler1|2022-05-26T13:42:...|PogChamp PogChamp...|   onetruelurker|
|#loltyler1|2022-05-26T13:42:...|hoping for a loud...| saltmineforeman|
|#loltyler1|2022-05-26T13:42:...|          PauseChamp|    kangkehsynth|
|#loltyler

+----------+--------------------+--------------------+--------------+
|   channel|            datetime|             message|      username|
+----------+--------------------+--------------------+--------------+
|#loltyler1|2022-05-26T13:42:...|     pepeLaugh t1 d1|almondmilk1254|
|#loltyler1|2022-05-26T13:42:...|Swain will carry ...| bagoftrash212|
|#loltyler1|2022-05-26T13:42:...|   say it PauseChamp|     gabudinow|
+----------+--------------------+--------------------+--------------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+-----------------+
|   channel|            datetime|             message|         username|
+----------+--------------------+--------------------+-----------------+
|#loltyler1|2022-05-26T13:42:...|     pepeLaugh t1 d1|   almondmilk1254|
|#loltyler1|2022-05-26T13:42:...|Swain will carry ...|    bagoftrash212|
|#loltyler1|2022-05-26T13:42:...|   say it PauseChamp|        gabudinow|
|#loltyler1

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|            datetime|             message|label|               words|                 w2v|       rawPrediction|         probability|prediction|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|2022-05-26T13:42:...|Swain trying his ...|  0.0|[swain, trying, h...|[-0.0034457841621...|[7.39341741975335...|[0.52685672929090...|       0.0|
|2022-05-26T13:42:...|Playing in pitch ...|  0.0|[playing, pitch, ...|[0.02079125680029...|[7.57022581913466...|[0.58203844908612...|       0.0|
|2022-05-26T13:42:...|      TriHard Gaming|  0.0|   [trihard, gaming]|[0.03179780952632...|[7.67527130553992...|[0.64805910671260...|       0.0|
|2022-05-26T13:42:...|                LULW|  0.0|              [lulw]|[0.03433923050761...|[7.53951154854080...|[0.59391896312222.

+----------+--------------------+--------------------+-------------+
|   channel|            datetime|             message|     username|
+----------+--------------------+--------------------+-------------+
|#loltyler1|2022-05-26T13:42:...|CAN WIN please bi...|     realll54|
|#loltyler1|2022-05-26T13:42:...|               !rank|youngbootsyvk|
|#loltyler1|2022-05-26T13:42:...|BigBrother BigBro...|      yus1785|
+----------+--------------------+--------------------+-------------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+---------------+
|   channel|            datetime|             message|       username|
+----------+--------------------+--------------------+---------------+
|#loltyler1|2022-05-26T13:42:...|CAN WIN please bi...|       realll54|
|#loltyler1|2022-05-26T13:42:...|               !rank|  youngbootsyvk|
|#loltyler1|2022-05-26T13:42:...|BigBrother BigBro...|        yus1785|
|#loltyler1|2022-05-26T13:42:.

+----------+--------------------+-------------------+----------------+
|   channel|            datetime|            message|        username|
+----------+--------------------+-------------------+----------------+
|#loltyler1|2022-05-26T13:42:...|Gigachad dont blame|ninja_do_pantano|
|#loltyler1|2022-05-26T13:42:...|winable? PauseChamp|         tenchua|
| #jinnytty|2022-05-26T13:42:...|         UwU yyjUwU|      cocoacake5|
+----------+--------------------+-------------------+----------------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+--------------------+
|   channel|            datetime|             message|            username|
+----------+--------------------+--------------------+--------------------+
|#loltyler1|2022-05-26T13:42:...| Gigachad dont blame|    ninja_do_pantano|
|#loltyler1|2022-05-26T13:42:...| winable? PauseChamp|             tenchua|
| #jinnytty|2022-05-26T13:42:...|          UwU yyjUwU|        

+----------+--------------------+--------------------+-----------+
|   channel|            datetime|             message|   username|
+----------+--------------------+--------------------+-----------+
|#loltyler1|2022-05-26T13:43:...| Gigachad GIVE FIGHT|relaxedshet|
| #jinnytty|2022-05-26T13:43:...|           PogO stop|   kiridane|
|#loltyler1|2022-05-26T13:43:...|moon2GIGA Don't b...|    c60_jam|
+----------+--------------------+--------------------+-----------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+-----------------+
|   channel|            datetime|             message|         username|
+----------+--------------------+--------------------+-----------------+
|#loltyler1|2022-05-26T13:43:...| Gigachad GIVE FIGHT|      relaxedshet|
| #jinnytty|2022-05-26T13:43:...|           PogO stop|         kiridane|
|#loltyler1|2022-05-26T13:43:...|moon2GIGA Don't b...|          c60_jam|
|#loltyler1|2022-05-26T13:43:...

+----------+--------------------+--------------------+------------+
|   channel|            datetime|             message|    username|
+----------+--------------------+--------------------+------------+
|#loltyler1|2022-05-26T13:43:...|@omarallan9696 iv...|crady_killem|
|#loltyler1|2022-05-26T13:43:...|Tyler needs 200 l...|    bigfroob|
|#loltyler1|2022-05-26T13:43:...|       LULU BIG SUCK| xxkillhouse|
+----------+--------------------+--------------------+------------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+----------------+
|   channel|            datetime|             message|        username|
+----------+--------------------+--------------------+----------------+
|#loltyler1|2022-05-26T13:43:...|@omarallan9696 iv...|    crady_killem|
|#loltyler1|2022-05-26T13:43:...|Tyler needs 200 l...|        bigfroob|
|#loltyler1|2022-05-26T13:43:...|       LULU BIG SUCK|     xxkillhouse|
|#loltyler1|2022-05-26T13:43:..

+----------+--------------------+--------------------+-----------+
|   channel|            datetime|             message|   username|
+----------+--------------------+--------------------+-----------+
|#loltyler1|2022-05-26T13:43:...|RareTyler  RareTy...| digitalova|
|#loltyler1|2022-05-26T13:43:...|                   1|   waifumax|
|#loltyler1|2022-05-26T13:43:...|                   1|xxkillhouse|
+----------+--------------------+--------------------+-----------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+-----------------+
|   channel|            datetime|             message|         username|
+----------+--------------------+--------------------+-----------------+
|#loltyler1|2022-05-26T13:43:...|RareTyler  RareTy...|       digitalova|
|#loltyler1|2022-05-26T13:43:...|                   1|         waifumax|
|#loltyler1|2022-05-26T13:43:...|                   1|      xxkillhouse|
|#loltyler1|2022-05-26T13:43:...

+----------+--------------------+--------------------+--------------+
|   channel|            datetime|             message|      username|
+----------+--------------------+--------------------+--------------+
| #jinnytty|2022-05-26T13:43:...|        ommaSad chet|    cocoacake5|
|#loltyler1|2022-05-26T13:43:...|MAKE tyler1Despai...|akshan_gaming1|
|#loltyler1|2022-05-26T13:43:...|tyler1R tyler1R t...|     derieri08|
+----------+--------------------+--------------------+--------------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+--------------------+
|   channel|            datetime|             message|            username|
+----------+--------------------+--------------------+--------------------+
| #jinnytty|2022-05-26T13:43:...|        ommaSad chet|          cocoacake5|
|#loltyler1|2022-05-26T13:43:...|MAKE tyler1Despai...|      akshan_gaming1|
|#loltyler1|2022-05-26T13:43:...|tyler1R tyler1R t...|           deri

In [9]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+----------+--------------------+--------------------+------------+
|   channel|            datetime|             message|    username|
+----------+--------------------+--------------------+------------+
|#loltyler1|2022-05-26T13:43:...|          PauseChamp|     chururu|
|#loltyler1|2022-05-26T13:43:...|                 Pog|timewaster00|
|#loltyler1|2022-05-26T13:43:...|BigBrother BigBro...|     yus1785|
+----------+--------------------+--------------------+------------+
only showing top 3 rows

before df_withpreds
def predict(df)
+----------+--------------------+--------------------+--------------+
|   channel|            datetime|             message|      username|
+----------+--------------------+--------------------+--------------+
|#loltyler1|2022-05-26T13:43:...|          PauseChamp|       chururu|
|#loltyler1|2022-05-26T13:43:...|                 Pog|  timewaster00|
|#loltyler1|2022-05-26T13:43:...|BigBrother BigBro...|       

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/muddy/Documents/spark/spark/spark-3.2.1-bin-hadoop2.7/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/muddy/Documents/spark/spark/spark-3.2.1-bin-hadoop2.7/python/lib/py4j-0.10.9.3-src.zip/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 