# Daily Model Deployment

In [1]:
########################### Initialize ####################################

# Basic
import json
import time
import os


# Spark
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.functions import desc, col, window

from pyspark.sql.types import *
from pyspark.streaming import StreamingContext



Trying to read from S3 and use structured streaming.

In [2]:
# To bypass the no s3 file system installed.

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.10.34,org.apache.hadoop:hadoop-aws:2.6.0 pyspark-shell'

APP_NAME = "Test"
SPARK_URL = "local[*]"

spark = SparkSession.builder.appName(APP_NAME).master(SPARK_URL).getOrCreate()
sqlContext = SQLContext(spark)

hadoopConf = spark._jsc.hadoopConfiguration()
myAccessKey = os.environ['AWS_ACCESS_KEY_ID'] 
mySecretKey = os.environ['AWS_SECRET_ACCESS_KEY']
hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoopConf.set("fs.s3.awsAccessKeyId", myAccessKey)
hadoopConf.set("fs.s3.awsSecretAccessKey", mySecretKey)

In [13]:
#### Twitter #### Static Read
inputPath = "s3://brandyn-twitter-sentiment-analysis/Twitter2018/*/*/*"

# Create Schema
twitterSchema = StructType() \
            .add("created_at", StringType()) \
            .add("id_str", StringType()) \
            .add("text", StringType()) \
            .add("quote_count", StringType()) \
            .add("reply_count", StringType()) \
            .add("retweet_count", StringType()) \
            .add("favorite_count", StringType()) \
            .add("retweeted", StringType()) \
            .add("lang", StringType()) \
            .add("user_name", StringType()) \
            .add("user_followers_count", StringType()) \
            .add("user_statuses_count", StringType()) \
            .add("user_screen_name", StringType()) \
            .add("Company", StringType())

# Create Dataframe
testDf = spark.read.schema(twitterSchema).json(inputPath, multiLine=True)

In [4]:
testDf.show()

+--------------------+------------------+--------------------+-----------+-----------+-------------+--------------+---------+----+--------------------+--------------------+-------------------+----------------+-----------------+
|          created_at|            id_str|                text|quote_count|reply_count|retweet_count|favorite_count|retweeted|lang|           user_name|user_followers_count|user_statuses_count|user_screen_name|          Company|
+--------------------+------------------+--------------------+-----------+-----------+-------------+--------------+---------+----+--------------------+--------------------+-------------------+----------------+-----------------+
|Fri May 18 16:18:...|997511693264719872|RT @NOD008: I've ...|          0|          0|            0|             0|    false|  en|       Hugh R Calder|                  65|               3862|     DrHugh2thDr|         ["TSLA"]|
|Fri May 18 16:12:...|997510337673871360|Isn't Elon suppos...|          0|          0|  

In [5]:
# Setup Streaming Input Dataframe
from pyspark.streaming import StreamingContext
streamingInputDF = (
  spark
    .readStream                       
    .schema(twitterSchema)               # Set the schema of the JSON data
    .option("maxFilesPerTrigger", 1)  # Treat a sequence of files as a stream by picking one file at a time
    .json(inputPath)
)

In [7]:
# Create STreaming dataframe
streamingWindowDF = streamingInputDF \
    .select('user_name') \
    .groupBy(streamingInputDF.user_name) \
    .count() \
    .sort(desc('count'))

# Is this Streaming?
streamingWindowDF.isStreaming

True

In [10]:
# Now start the engine
spark.conf.set("spark.sql.shuffle.partitions", "2")  # keep the size of shuffles small

# Write stream to an in memroy table called 
query = (
  streamingWindowDF
    .writeStream
    .format("memory")       
    .queryName("tweets")     # counts = name of the in-memory table
    .outputMode("complete")  # complete = all the counts should be in the table
    .start()
)

In [14]:
# let the query run for a bit to insure there is data in the recent progress structure.
time.sleep(4)

# Monitor the progress of the query. The last table should be identical to the static query.
while True:
    spark.sql("select * from tweets").show(20)
    time.sleep(1)
    

+-------------------+-----+
|          user_name|count|
+-------------------+-----+
|         Bibey Post|    3|
|        Utah Herald|    2|
|         StockTexts|    2|
|               FinX|    1|
|          Herald KS|    1|
|           KL Daily|    1|
|                yon|    1|
|           HFTAlert|    1|
|          Mr. Rigid|    1|
|         Bilo Selhi|    1|
|        Key Gazette|    1|
|  Clark Joseph Kent|    1|
|      Travis Howard|    1|
|  The Norman Weekly|    1|
|            The Fly|    1|
|     The Lincolnian|    1|
|        HuronReport|    1|
|    David Bergerson|    1|
|Chaffey Breeze News|    1|
|      The San Times|    1|
+-------------------+-----+
only showing top 20 rows

+-------------------+-----+
|          user_name|count|
+-------------------+-----+
|         Bibey Post|    3|
|        Utah Herald|    2|
|         StockTexts|    2|
|               FinX|    1|
|          Herald KS|    1|
|           KL Daily|    1|
|                yon|    1|
|           HFTAlert| 

KeyboardInterrupt: 