In [1]:
######################################## Initialize ##################################

# Basics
from pymongo import MongoClient
import os
import numpy as np
import pandas as pd
import time
import boto3
import io
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# import findspark
# findspark.init('/usr/lib/spark')

from pyspark.sql import SparkSession
# Reuse the active SparkSession if there is one, otherwise start one with the app name 'nlp'.
spark = SparkSession.builder.appName('nlp').getOrCreate()
# NOTE(review): `time` is already imported in the "Basics" group above; this repeat is redundant.
import time

# Feature Engineering
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer, HashingTF)
from pyspark.sql.functions import length
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import preprocessor as p
from pyspark.sql.functions import dayofyear, concat_ws, collect_list, countDistinct

# Models
from pyspark.ml.classification import LogisticRegression

# Pipeline
from pyspark.ml import Pipeline

# Evaluators
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [31]:
######################################## Data ############################################

# Set up the Mongo client and select the database and collections.
# Connection settings come from the environment; a missing variable raises KeyError here.
User = os.environ['MONGODB_USER']
password = os.environ['MONGODB_PASS']
IP = os.environ['IP']

client = MongoClient(IP, username=User, password=password)
# Handle to the 'stock_tweets' database.
db = client['stock_tweets']

# Collection handles used by the later cells: `twitter` feeds the tweet frame,
# `iex` feeds the stock frame.
twitter_coll_reference = db.twitter
iex_coll_reference = db.iex

In [11]:
######################################## Build Twitter Pandas Frame #######################
# Load every document of the twitter collection into a single pandas frame.
twitter_data = pd.DataFrame(list(twitter_coll_reference.find()))

# Index the frame by the parsed tweet timestamp.
twitter_data.index = pd.to_datetime(twitter_data['created_at'])

# Explode the per-tweet company list: one record per (tweet, company) pair.
# itertuples() yields the index in slot 0 and the columns, in frame order, from slot 1 on.
# NOTE(review): slots 1/11/12/13/15 are presumably the company list, text,
# user_followers_count, user_name and user_statuses_count — confirm against the
# actual column order, since positional access breaks if the document schema changes.
delimited_twitter_data = [
    {
        'created_at': tweet[0],
        'company': ticker,
        'text': tweet[11],
        'user_followers_count': tweet[12],
        'user_name': tweet[13],
        'user_statuses_count': tweet[15],
    }
    for tweet in twitter_data.itertuples()
    for ticker in tweet[1]
]

delimited_twitter_df = pd.DataFrame(delimited_twitter_data)

In [12]:
# Convert the exploded pandas frame into a Spark DataFrame.
# No explicit schema is passed, so Spark infers the column types from the pandas data.
twitter_df = spark.createDataFrame(delimited_twitter_df)

In [13]:
# Print a preview of the exploded tweet frame (one row per tweet/company pair).
twitter_df.show()

+-------+-------------------+--------------------+--------------------+--------------------+-------------------+
|company|         created_at|                text|user_followers_count|           user_name|user_statuses_count|
+-------+-------------------+--------------------+--------------------+--------------------+-------------------+
|   TSLA|2018-03-12 18:07:08|$TSLA so nice so ...|                1703|TradeTherapAnalytics|              67528|
|   AAPL|2018-03-12 18:07:19|@JoKiddo But how ...|                2901|        Gilmo Report|              18524|
|   AAPL|2018-03-12 18:07:23|RT @StockTwits: T...|                5256|           Mark Hill|              13523|
|   GOOG|2018-03-12 18:07:23|RT @StockTwits: T...|                5256|           Mark Hill|              13523|
|  GOOGL|2018-03-12 18:07:23|RT @StockTwits: T...|                5256|           Mark Hill|              13523|
|   AAPL|2018-03-12 18:07:25|$AAPL may be work...|                 486|       William White|    

In [79]:
# Tweets per (day-of-year, company).
# The grouping keys are aliased up front and `count` is renamed directly, so nothing
# here depends on Spark's auto-generated column name 'dayofyear(created_at)'.
# Resulting columns: Day, Company, Number_of_tweets.
# NOTE(review): dayofyear() folds the same calendar day of different years together —
# confirm the collection only spans a single year.
twitter_daily_df = (
    twitter_df
    .groupBy(dayofyear("created_at").alias("Day"), col("Company").alias("Company"))
    .count()
    .withColumnRenamed("count", "Number_of_tweets")
    .orderBy("Day", "Company")
)

In [73]:
# Concatenate all tweet text per (day-of-year, company) into one space-separated string.
# The aggregate is aliased where it is built instead of being looked up afterwards by
# Spark's generated name "concat_ws( , collect_list(text))", which is brittle.
# Resulting columns: Day, Company, Text.
# NOTE(review): collect_list gives no ordering guarantee, so the order of tweets inside
# Text is not deterministic.
combined_text = (
    twitter_df
    .groupBy(dayofyear("created_at").alias("Day"), col("Company").alias("Company"))
    .agg(concat_ws(" ", collect_list("text")).alias("Text"))
    .orderBy("Day", "Company")
)


In [97]:
# Attach the concatenated daily text to the per-day tweet counts (join on the shared keys).
twitter_daily_df = (
    twitter_daily_df
    .join(combined_text, on=["Day", "Company"])
    .orderBy("Day", "Company")
)

In [99]:
# Number of distinct tweeting users per (day-of-year, company).
# The aggregate is aliased where it is built instead of being selected afterwards by
# Spark's generated name "count(DISTINCT user_name)", which is brittle.
# Resulting columns: Day, Company, Distinct_Users.
distinct_users = (
    twitter_df
    .groupBy(dayofyear("created_at").alias("Day"), col("Company").alias("Company"))
    .agg(countDistinct("user_name").alias("Distinct_Users"))
    .orderBy("Day", "Company")
)



In [101]:
# Attach the distinct-user counts to the daily frame (join on the shared keys).
twitter_daily_df = (
    twitter_daily_df
    .join(distinct_users, on=["Day", "Company"])
    .orderBy("Day", "Company")
)

In [107]:
# Print a preview of the joined daily frame.
# NOTE(review): the output saved with this cell is a Py4JNetworkError (connection to the
# Java gateway refused), i.e. the Python side could not reach the Spark JVM when this ran.
twitter_daily_df.show()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38411)
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:38411)

In [103]:
####################### Build Stock Data ###########################
# Load every document of the iex collection into a single pandas frame.
stock_data = pd.DataFrame(list(iex_coll_reference.find()))

# Parse the quote timestamp once and use it both as a column and as the index.
# NOTE(review): if latestUpdate is stored as an epoch integer, pd.to_datetime needs an
# explicit unit= — confirm against the collection.
stock_data['latestUpdate'] = pd.to_datetime(stock_data['latestUpdate'])
stock_data.index = stock_data['latestUpdate']

# Order the quotes chronologically so first()/last() below are the earliest and latest
# quote of each day. The sort is done on the index: 'latestUpdate' names both the index
# and a column, so sort_values('latestUpdate') is ambiguous and raises ValueError on
# current pandas. mergesort is stable, so quotes with equal timestamps keep their
# original relative order.
stock_sorted = stock_data.sort_index(kind='mergesort')

# One grouping (calendar day of the index x ticker) shared by every aggregate below.
daily_groups = stock_sorted.groupby([pd.Grouper(freq="D"), 'Ticker'])

# First and last quoted price of each day, per ticker.
stock_delimited_daily = daily_groups['latestPrice'].first().to_frame('First_Price')
stock_delimited_daily['Last_Price'] = daily_groups['latestPrice'].last()

# Intraday move from the first to the last quote, as a percentage of the first.
stock_delimited_daily['Price_Percent_Change'] = (
    (stock_delimited_daily['Last_Price'] - stock_delimited_daily['First_Price'])
    / stock_delimited_daily['First_Price']
) * 100

# Move from the (daily mean of the) reported open price to the last quote, as a percentage.
stock_delimited_daily['Open_Price'] = daily_groups['open'].mean()
stock_delimited_daily['Price_Percent_Open'] = (
    (stock_delimited_daily['Last_Price'] - stock_delimited_daily['Open_Price'])
    / stock_delimited_daily['Open_Price']
) * 100

# Mean of the reported volume over the day.
stock_delimited_daily['Mean_Volume'] = daily_groups['latestVolume'].mean()

# Classification labels: 1 when the change is >= 0, else 0 (a NaN change also yields 0).
stock_delimited_daily['Price_Change'] = np.where(stock_delimited_daily['Price_Percent_Change']>=0, 1, 0)
stock_delimited_daily['Open_Price_Change'] = np.where(stock_delimited_daily['Price_Percent_Open']>=0, 1, 0)

# Name the two index levels, then flatten them into ordinary 'Time' and 'Company' columns.
stock_delimited_daily.index.names = ['Time', 'Company']
stock_delimited_daily = stock_delimited_daily.reset_index()


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 45536)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 696, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/pyspark/serializers.py", line 685, in read_int
    raise EOFError
EOFError
----------------------------------------


In [106]:
# Convert the flattened daily stock frame into a Spark DataFrame (schema inferred from pandas).
# NOTE(review): the output saved with this cell is a Py4JError / Py4JNetworkError
# ("Answer from Java side is empty"), so this conversion did not complete when it ran.
stock_df = spark.createDataFrame(stock_delimited_daily)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1062, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 908, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1067, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling o29.get