# Preparing the environment

## Import dependencies

In [1]:
#    Spark
from pyspark import SparkContext
#    Spark Streaming
from pyspark.streaming import StreamingContext
#    Kafka
from pyspark.streaming.kafka import KafkaUtils
#    json parsing
import json

## Create Streaming Context

In [2]:
# `sc` is not defined anywhere in this notebook (presumably the PySpark kernel
# injects it). getOrCreate() returns the already-running context when there is
# one and builds a new one otherwise, so this cell also works on a fresh kernel.
sc = SparkContext.getOrCreate()
# Batch duration = 60 seconds: one micro-batch is produced per minute.
ssc = StreamingContext(sc, 60)

In [3]:
# Arguments after the streaming context, in KafkaUtils.createStream order:
# ZooKeeper quorum, consumer group id, and a {topic: count} mapping.
kafkaStream = KafkaUtils.createStream(
    ssc,
    'cdh57-01-node-01.moffatt.me:2181',
    'spark-streaming',
    {'twitter': 1},
)

# Message Processing

## Parse the inbound message as json

The inbound stream is a DStream, which supports various built-in transformations such as `map`, which is used here to parse the inbound messages from their native JSON format.
Note that this will fail if an inbound message isn't valid JSON, because `json.loads` raises an exception on malformed input.

In [4]:
# Element 1 of every inbound record holds the JSON text; decode it into Python objects.
parsed = kafkaStream.map(
    lambda record: json.loads(record[1])
)

In [5]:
# Count the messages of each batch, then render the number as a readable line.
batch_totals = parsed.count()
batch_totals.map(lambda total: 'Tweets in this batch: %s' % total).pprint()

## Extract Author name from each tweet

In [6]:
# Reduce every parsed tweet to the screen name stored under its 'user' entry.
authors_dstream = parsed.map(
    lambda status: status['user']['screen_name']
)

## Count the number of tweets per author

In [7]:
# Tally how often each author name occurs within a batch; the cells below index
# the result as pairs, with the name at [0] and the count at [1].
author_counts = authors_dstream.countByValue()
# Print a sample of the per-author counts for every batch.
author_counts.pprint()

## Sort the author count

In [8]:
# Order each batch by count, largest first (sorting on the negated count).
author_counts_sorted_dstream = author_counts.transform(
    lambda counts_rdd: counts_rdd.sortBy(lambda pair: -pair[1])
)

In [9]:
# Print the leading rows of the count-sorted authors for every batch.
author_counts_sorted_dstream.pprint()

## Get top 5 authors by tweet count

In [10]:
# take(5) pulls the first five rows of each (already sorted) batch back as a
# list; the list is parallelized again because transform() must return an RDD.
# The SparkContext is taken from the batch RDD itself instead of the
# notebook-global `sc`, so the function does not capture the context object.
top_five_authors = author_counts_sorted_dstream.transform(
    lambda rdd: rdd.context.parallelize(rdd.take(5))
)
top_five_authors.pprint()

## Get authors with more than one tweet, or whose username starts with 'rm'

In [13]:
# Keep an author when the count exceeds one, or when the lower-cased
# name begins with 'rm'.
filtered_authors = author_counts.filter(
    lambda pair: pair[1] > 1 or pair[0].lower().startswith('rm')
)

In [14]:
# Show the filtered authors of each batch, highest count first.
filtered_sorted = filtered_authors.transform(
    lambda batch: batch.sortBy(lambda pair: -pair[1])
)
filtered_sorted.pprint()

## List the most common words in the tweets

In [15]:
# Split every tweet's text on single spaces, count each distinct token per
# batch, and print the tokens ordered by descending count.
words = parsed.flatMap(lambda status: status['text'].split(" "))
word_counts = words.countByValue()
word_counts.transform(
    lambda batch: batch.sortBy(lambda pair: -pair[1])
).pprint()

## Start the streaming context

In [16]:
# Start the streaming computation defined by the cells above.
ssc.start()
# Block this cell for at most 180 (seconds, per the PySpark API).
# NOTE(review): the context is not stopped afterwards, so it presumably keeps
# producing batches once the wait ends -- confirm whether ssc.stop() is wanted.
ssc.awaitTermination(timeout=180)

-------------------------------------------
Time: 2017-06-14 19:57:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-14 19:57:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-14 19:57:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-14 19:57:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-14 19:57:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-14 19:57:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-14 19:58:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-14 19:58:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-14 19:58:00
----------

# Raw crawler

In [1]:
# %load utils.py
# CrisisLex
# Author: Alexandra Olteanu
# Check LICENSE for details about copyright.

import json

#receives a string in json format
#returns the textual content of a tweet
def extract_tweet_from_json(data):
    """Return the text of a tweet given its JSON-encoded representation.

    Args:
        data: A string holding one JSON document; surrounding whitespace is
            stripped before decoding.

    Returns:
        The value of the ``text`` field with newline characters removed, or
        ``None`` when the document is not a JSON object or has no ``text``
        field.

    Raises:
        SystemExit: If ``data`` cannot be decoded as JSON.
    """
    import sys  # local import: the surrounding cell only imports json

    try:
        json_tweet = json.loads(data.strip())
    except (AttributeError, TypeError, ValueError):
        # Only decoding failures are fatal here. A bare ``except`` would also
        # intercept KeyboardInterrupt and SystemExit.
        sys.exit("Not able to load json data")
    # Valid JSON may be a list, number or string rather than an object.
    if isinstance(json_tweet, dict) and 'text' in json_tweet:
        return json_tweet['text'].replace('\n', '')
    return None

# reads the terms to be tracked from a file
# expects one term per line
def get_query_terms(input_filename):
    """Collect the terms to track, one per input line.

    Despite its name, ``input_filename`` is iterated directly, so it must be
    an open file (or any other iterable of lines), not a path string.

    Returns:
        A list with every line stripped of surrounding whitespace, in input
        order. Blank lines are kept as empty strings.
    """
    return [raw_line.strip() for raw_line in input_filename]

In [2]:
# %load config.py
#Sets Twitter API access

#Please give the keys as 'strings'
# Credentials are read from the environment so that no secret is stored in the
# notebook. Export TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
# TWITTER_ACCESS_KEY and TWITTER_ACCESS_SECRET before starting the kernel.
# A variable that is not set yields an empty string.
# NOTE(review): the values that used to be hard-coded here were saved with the
# notebook and should be treated as compromised (revoke and regenerate them).
import os

CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')  # API key
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')  # API secret
ACCESS_KEY = os.environ.get('TWITTER_ACCESS_KEY', '')  # Access token
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', '')  # Access token secret

In [4]:
import sys
import os

import tweepy1 as t
from tweepy1.parsers import ModelParser
from tweepy1 import StreamListener
from tweepy1 import Stream


import utils1
#import config as c

class PrintListener(StreamListener):
    """Stream listener that echoes every raw message to stdout or to a file.

    Messages are printed to stdout until ``set_output`` installs a writable
    destination; from then on each message is written there, one per line.
    """

    # Destination for received messages; None means "print to stdout".
    output = None

    def on_data(self, data):
        """Emit one raw message and return True."""
        if self.output is None:
            print(data)
        else:
            # Same effect as ``print >>self.output, data.strip()`` but valid
            # on both Python 2 and Python 3.
            self.output.write(data.strip() + "\n")
        return True

    def on_error(self, status):
        """Print the error status, with an explanation for 420 and 401.

        Returns None.
        NOTE(review): tweepy is understood to disconnect only on an explicit
        False return -- confirm for the installed version.
        """
        if status == 420:
            print("%s %s" % (status, "Twitter API Error: Enhance your calm -- You are being rate limited"))
        elif status == 401:
            print("%s %s" % (status, "Twitter API Error: Unauthorized -- Authentication credentials were missing or incorrect. Please double check config.py"))
        else:
            print(status)

    def set_output(self, output_json):
        """Send all subsequent messages to ``output_json`` (needs ``write``)."""
        self.output = output_json

#authenticate
auth = t.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
api = t.API(auth_handler=auth, parser = ModelParser())      
        
print "Configuring query settings ...."  
    
filename = 'data/test3.txt'
lexicon = 'QuakeLex.txt'
pl = PrintListener()
pl.set_output(open(filename,"w"))

try:
        to_track = utils1.get_query_terms(open(lexicon,"r"))
except Exception as e:
        print "The file path is seems to be wrong. Check the error below or run the script with -h. Please revise and restart the script"
        print e
        exit(0)
        
#start tracking crisis-relevant tweets
stream = Stream(auth, pl)
try:
    print "Collecting tweets ...."
    #stream.filter(track=to_track[0:400],locations=[-6.38,49.87,1.77,55.81],languages=["en"])
    stream.filter(track=to_track[0:400],languages=["en"])
except Exception as e:
    print "The script have crashed with the following error: "
    print e
    print "\n Please check if your Twitter API keys are correct"

Configuring query settings ....
Collecting tweets ....


KeyboardInterrupt: 

In [8]:
import socket
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import json

In [9]:
class TweetsListener(StreamListener):
    """Forward the text of every received tweet to a connected client socket."""

    def __init__(self, csocket):
        """Remember the socket that tweet text is written to.

        Args:
            csocket: A connected socket; only ``sendall`` is called on it.
        """
        # Let the base listener initialise its own state as well.
        super(TweetsListener, self).__init__()
        self.client_socket = csocket

    def on_data(self, data):
        """Decode one raw message, print its text and send it to the client.

        Always returns True, including after a failure, which is only logged.
        """
        try:
            msg = json.loads(data)
            payload = msg['text'].encode('utf-8')
            print(payload)
            # sendall keeps writing until the whole payload is out, whereas
            # send() may transmit only part of it.
            # NOTE(review): no delimiter is appended between tweets; a
            # line-oriented consumer would need one -- confirm with the reader.
            self.client_socket.sendall(payload)
        except Exception as e:
            # Best effort: report and keep going. Exception rather than
            # BaseException, so KeyboardInterrupt and SystemExit propagate.
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        """Print the error status and return True."""
        print(status)
        return True

In [10]:
def sendData(c_socket, track=None):
    """Stream tweets matching ``track`` and forward them to ``c_socket``.

    Args:
        c_socket: Connected client socket, handed to ``TweetsListener``.
        track: List of terms to filter the stream on. Defaults to
            ``['trump']``, the value that was previously hard-coded.

    The call returns when ``Stream.filter`` returns.
    """
    if track is None:
        track = ['trump']
    auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
    twitter_stream = Stream(auth, TweetsListener(c_socket))
    twitter_stream.filter(track=track)

In [16]:
s = socket.socket()         # Listening socket
host = "83.212.118.171"      # Fixed address to bind to (not looked up from the local machine)
port = 8889                 # Port the service listens on
c = None                    # Client connection, set once accept() returns
try:
    s.bind((host, port))
    print("Listening on port: %s" % str(port))
    s.listen(5)             # Backlog of 5 pending connections
    c, addr = s.accept()    # Blocks until a client connects
    print( "Received request from: " + str( addr ) )
    sendData( c )
finally:
    # Release both sockets even when the cell is interrupted, so the port is
    # not left held by this process.
    if c is not None:
        c.close()
    s.close()

Listening on port: 8889


KeyboardInterrupt: 