# Daily Model Deployment

1. Load Model from S3
2. Create Dataframe from structured streaming from Kinesis in 10 min intervals
3. Make predictions
4. Log predictions to database

In [2]:
########################### Initialize ####################################

# Basic
import subprocess
import os
import numpy as np
import pandas as pd
import time
from datetime import date, datetime
import boto3
import boto3.s3
import os.path
import sys
import io
import warnings

# Pipeline
from pyspark.ml import Pipeline, PipelineModel

# Feature Engineering
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer, HashingTF)
from pyspark.sql.functions import length
from pyspark.sql.functions import col, udf
#from pyspark.sql.types import StringType
import preprocessor as p
from pyspark.sql.functions import dayofyear, concat_ws, collect_list, countDistinct
from pyspark.sql.types import *
# Models
from pyspark.ml.classification import LogisticRegression

# Streaming
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').getOrCreate()


# Database Setup
import mysql.connector
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy import MetaData
from sqlalchemy import Table
from sqlalchemy import Column
from sqlalchemy import Integer, String, DateTime, Float

In [30]:
# Download the Model
subprocess.run(['aws', 's3','cp','s3://brandyn-twitter-sentiment-analysis/Models/Daily_Stock_Prediction_latest/','./Models/Daily_Stock_Prediction_latest','--recursive'])

CompletedProcess(args=['aws', 's3', 'cp', 's3://brandyn-twitter-sentiment-analysis/Models/Daily_Stock_Prediction_latest/', './Models/Daily_Stock_Prediction_latest', '--recursive'], returncode=0)

In [37]:
# Deserialize Model
model = PipelineModel.load('./Models/Daily_Stock_Prediction_latest')

Trying to read from S3 and use structured streaming.

In [1]:
# To bypass the no s3 file system installed.
from pyspark.sql import SparkSession, SQLContext

import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.10.34,org.apache.hadoop:hadoop-aws:2.6.0 pyspark-shell'

spark = SparkSession.builder.appName('nlp').getOrCreate()
sqlContext = SQLContext(spark)

hadoopConf = spark._jsc.hadoopConfiguration()
myAccessKey = os.environ['AWS_ACCESS_KEY_ID'] 
mySecretKey = os.environ['AWS_SECRET_ACCESS_KEY']
hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoopConf.set("fs.s3.awsAccessKeyId", myAccessKey)
hadoopConf.set("fs.s3.awsSecretAccessKey", mySecretKey)

In [None]:
testDf = sqlContext.readStream.json("s3://brandyn-twitter-sentiment-analysis/Twitter2018")

In [11]:
############################# Bring in Data ###############################
from pyspark.sql.types import *


#### Twitter ####
inputPath = "s3://brandyn-twitter-sentiment-analysis/Twitter2018/"

# Create Schema
twitterSchema = StructType() \
            .add("created_at", StringType()) \
            .add("id_str", IntegerType()) \
            .add("text", StringType()) \
            .add("quote_count", IntegerType()) \
            .add("reply_count", IntegerType()) \
            .add("retweet_count", IntegerType()) \
            .add("favorite_count", IntegerType()) \
            .add("lang", StringType()) \
            .add("user_followers_count", IntegerType()) \
            .add("user_statuses_count", IntegerType()) \
            .add("user_name", StringType()) \
            .add("user_screen_name", StringType()) \
            .add("Company", StringType())

# Create Dataframe
testDf = (
    spark.read.schema(twitterSchema).json(inputPath))

display(testDf)

DataFrame[created_at: string, id_str: int, text: string, quote_count: int, reply_count: int, retweet_count: int, favorite_count: int, lang: string, user_followers_count: int, user_statuses_count: int, user_name: string, user_screen_name: string, Company: string]

In [12]:
testDf.show()

+----------+------+----+-----------+-----------+-------------+--------------+----+--------------------+-------------------+---------+----------------+-------+
|created_at|id_str|text|quote_count|reply_count|retweet_count|favorite_count|lang|user_followers_count|user_statuses_count|user_name|user_screen_name|Company|
+----------+------+----+-----------+-----------+-------------+--------------+----+--------------------+-------------------+---------+----------------+-------+
+----------+------+----+-----------+-----------+-------------+--------------+----+--------------------+-------------------+---------+----------------+-------+



In [5]:
# Create Dataframe
df = (
    spark.readStream.schema(twitterSchema).json(inputPath))

In [6]:
query = df.writeStream.outputMode("append").format("console").start()
# query.awaitTermination()

In [8]:
df.select('*').where("retweet_count = 0").show()

AnalysisException: 'Queries with streaming sources must be executed with writeStream.start();;\nFileSource[s3://brandyn-twitter-sentiment-analysis/Twitter2018/05/18/16]'

In [1]:
########## From Thinkful Course

from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.functions import desc, col, window

from pyspark.sql.types import *
from pyspark.streaming import StreamingContext

import json
import time
import os

In [2]:
# To bypass the no s3 file system installed.
from pyspark.sql import SparkSession, SQLContext

import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.10.34,org.apache.hadoop:hadoop-aws:2.6.0 pyspark-shell'

APP_NAME = "Test"
SPARK_URL = "local[*]"

spark = SparkSession.builder.appName(APP_NAME).master(SPARK_URL).getOrCreate()
sqlContext = SQLContext(spark)

hadoopConf = spark._jsc.hadoopConfiguration()
myAccessKey = os.environ['AWS_ACCESS_KEY_ID'] 
mySecretKey = os.environ['AWS_SECRET_ACCESS_KEY']
hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoopConf.set("fs.s3.awsAccessKeyId", myAccessKey)
hadoopConf.set("fs.s3.awsSecretAccessKey", mySecretKey)

In [32]:
#### Twitter #### Static Read
inputPath = "s3://brandyn-twitter-sentiment-analysis/Twitter2018/05/*/*"

# Create Schema
twitterSchema = StructType() \
            .add("created_at", StringType()) \
            .add("id_str", StringType()) \
            .add("text", StringType()) \
            .add("quote_count", StringType()) \
            .add("reply_count", StringType()) \
            .add("retweet_count", StringType()) \
            .add("favorite_count", StringType()) \
            .add("retweeted", StringType()) \
            .add("lang", StringType()) \
            .add("user_name", StringType()) \
            .add("user_followers_count", StringType()) \
            .add("user_statuses_count", StringType()) \
            .add("user_screen_name", StringType()) \
            .add("Company", StringType())

# Create Dataframe
testDf = spark.read.schema(twitterSchema).json(inputPath, multiLine=True)

In [33]:
testDf.show()

+--------------------+------------------+--------------------+-----------+-----------+-------------+--------------+---------+----+--------------------+--------------------+-------------------+----------------+-----------------+
|          created_at|            id_str|                text|quote_count|reply_count|retweet_count|favorite_count|retweeted|lang|           user_name|user_followers_count|user_statuses_count|user_screen_name|          Company|
+--------------------+------------------+--------------------+-----------+-----------+-------------+--------------+---------+----+--------------------+--------------------+-------------------+----------------+-----------------+
|Fri May 18 16:18:...|997511693264719872|RT @NOD008: I've ...|          0|          0|            0|             0|    false|  en|       Hugh R Calder|                  65|               3862|     DrHugh2thDr|         ["TSLA"]|
|Fri May 18 16:12:...|997510337673871360|Isn't Elon suppos...|          0|          0|  

In [35]:
# Setup Streaming Input Dataframe
from pyspark.streaming import StreamingContext
streamingInputDF = (
  spark
    .readStream                       
    .schema(twitterSchema)               # Set the schema of the JSON data
    .option("maxFilesPerTrigger", 1)  # Treat a sequence of files as a stream by picking one file at a time
    .json(inputPath)
)

In [36]:
# Create STreaming dataframe
streamingWindowDF = streamingInputDF \
    .select('*')
    .groupBy

# Is this Streaming?
streamingWindowDF.isStreaming

True

In [38]:
# Now start the engine
spark.conf.set("spark.sql.shuffle.partitions", "2")  # keep the size of shuffles small

# Write stream to an in memroy table called 
query = (
  streamingWindowDF
    .writeStream
    .format("memory")       
    .queryName("stocks")     # counts = name of the in-memory table
    .outputMode("complete")  # complete = all the counts should be in the table
    .start()
)

AnalysisException: 'Complete output mode not supported when there are no streaming aggregations on streaming DataFrames/Datasets;;\nProject [created_at#944, id_str#945, text#946, quote_count#947, reply_count#948, retweet_count#949, favorite_count#950, retweeted#951, lang#952, user_name#953, user_followers_count#954, user_statuses_count#955, user_screen_name#956, Company#957]\n+- StreamingRelation DataSource(org.apache.spark.sql.SparkSession@4cb89750,json,List(),Some(StructType(StructField(created_at,StringType,true), StructField(id_str,StringType,true), StructField(text,StringType,true), StructField(quote_count,StringType,true), StructField(reply_count,StringType,true), StructField(retweet_count,StringType,true), StructField(favorite_count,StringType,true), StructField(retweeted,StringType,true), StructField(lang,StringType,true), StructField(user_name,StringType,true), StructField(user_followers_count,StringType,true), StructField(user_statuses_count,StringType,true), StructField(user_screen_name,StringType,true), StructField(Company,StringType,true))),List(),None,Map(maxFilesPerTrigger -> 1, path -> s3://brandyn-twitter-sentiment-analysis/Twitter2018/05/*/*),None), FileSource[s3://brandyn-twitter-sentiment-analysis/Twitter2018/05/*/*], [created_at#944, id_str#945, text#946, quote_count#947, reply_count#948, retweet_count#949, favorite_count#950, retweeted#951, lang#952, user_name#953, user_followers_count#954, user_statuses_count#955, user_screen_name#956, Company#957]\n'