# Preparing the environment

## Import dependencies

In [1]:
#    Spark
from pyspark import SparkContext
#    Spark Streaming
from pyspark.streaming import StreamingContext
#    Kafka
from pyspark.streaming.kafka import KafkaUtils
#    json parsing
import json
#    print function
from __future__ import print_function
#    os
import sys

## Create Streaming Context

In [2]:
zkQuorum = "localhost:2181"
topic = "twitter-stream"
seconds_to_run = 60
ssc = StreamingContext(sc, seconds_to_run)

tweets = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})

## Parse the inbound message as json

The inbound stream is a DStream, which supports various built-in transformations such as map which is used here to parse the inbound messages from their native JSON format.
Note that this will fail horribly if the inbound message isn't valid JSON.

In [3]:
parsed = tweets.map(lambda v: json.loads(v[1]))

In [4]:
parsed.count().map(lambda x:'Tweets in this batch: %s' % x).pprint()

## Extract Text from each tweet

In [5]:
text_dstream = parsed.map(lambda tweet: tweet['text'])
#text_dstream.pprint()

## Process the extracted text 

In [6]:
import nltk
import csv
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [7]:
def filter_out_unicode(tweet):
  
    try:
        clean_tweet = str(tweet)
    except UnicodeEncodeError:
        pass
    return clean_tweet

def expand_around_chars(text, characters):
    for char in characters:
        text = text.replace(char, ' ' + char + ' ')
    return text

def strip_quotations_newline(text):
    clean_tweet = ' '.join(text.split())
    clean_tweet = clean_tweet.encode('utf-8')
    clean_tweet = clean_tweet.replace('",\'','')
    clean_tweet = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', "", clean_tweet)
    clean_tweet = re.sub(r'''(@[A-Za-z0-9]+)''', "", clean_tweet)
    clean_tweet = re.sub("([0-9]+)", "", clean_tweet)
    clean_tweet = re.sub(r'[^\x00-\x7F]+','', clean_tweet)
    return clean_tweet

def split_text(text):
    text = strip_quotations_newline(text)
    text = expand_around_chars(text, '\/".,()&[]{}:;!-_\'')
    splitted_text = text.split(' ')
    cleaned_text = [x for x in splitted_text if len(x) > 2]
    text_lowercase = [x.lower() for x in cleaned_text]
    return text_lowercase

In [9]:
mess = text_dstream.map(lambda text: ' '.join(split_text(text)))
mess.pprint()

## Create clean tweet

In [12]:
tweet_df = mess.foreachRDD(lambda rdd: rdd.toDF("text"))

Py4JJavaError: An error occurred while calling z:org.apache.spark.streaming.api.python.PythonDStream.callForeachRDD.
: java.lang.IllegalStateException: Adding new inputs, transformations, and output operations after starting a context is not supported
	at org.apache.spark.streaming.dstream.DStream.validateAtInit(DStream.scala:223)
	at org.apache.spark.streaming.dstream.DStream.<init>(DStream.scala:65)
	at org.apache.spark.streaming.dstream.ForEachDStream.<init>(ForEachDStream.scala:39)
	at org.apache.spark.streaming.dstream.DStream.org$apache$spark$streaming$dstream$DStream$$foreachRDD(DStream.scala:653)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$2.apply$mcV$sp(DStream.scala:638)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$2.apply(DStream.scala:638)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$2.apply(DStream.scala:638)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.SparkContext.withScope(SparkContext.scala:679)
	at org.apache.spark.streaming.StreamingContext.withScope(StreamingContext.scala:264)
	at org.apache.spark.streaming.dstream.DStream.foreachRDD(DStream.scala:634)
	at org.apache.spark.streaming.api.python.PythonDStream$.callForeachRDD(PythonDStream.scala:179)
	at org.apache.spark.streaming.api.python.PythonDStream.callForeachRDD(PythonDStream.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


-------------------------------------------
Time: 2017-06-20 15:03:00
-------------------------------------------
Tweets in this batch: 49



In [13]:
tweet_df

-------------------------------------------
Time: 2017-06-20 15:03:00
-------------------------------------------
ing doyoung noticed yuta hands shaking out nervousness and held his hands calm his nerves down
~lerrrd shaking~ #fightmyway
last earthquake sat jun gmt location lat long magnitude
where were they radicalized
panther whales out here hittin licks seen all
all the trump kids supposedly went good schools but their grammar bad their father
imagine this dance king hit the stage already shaking
the #finsburypark attacker targeted worshippers attempt drive fear failed prayers were held the
jisoo duality has shaking oml
nature strikes back
...

-------------------------------------------
Time: 2017-06-20 15:04:00
-------------------------------------------
Tweets in this batch: 57

-------------------------------------------
Time: 2017-06-20 15:04:00
-------------------------------------------
kim taeyeon she vocal goddess she has such stable vocals shaking she eats cds for every me

## Convert data to be classified

## Start the streaming context

In [11]:
ssc.start()
ssc.awaitTermination(timeout=180)

-------------------------------------------
Time: 2017-06-20 14:49:00
-------------------------------------------
Tweets in this batch: 97

-------------------------------------------
Time: 2017-06-20 14:49:00
-------------------------------------------
stay the fuck outta their hood then
any teenation member beat quake champions will recieve till reach you lose will
panther whales out here hittin licks seen all
ara won shaking #whatsmynamestwin
global unrest event underway west coast usa canada hit earthquake
jonny amp thom play one off show macerata sferisterio august proceeds will help the restoration earthq
after you predicted crooked hilly landslide you now are saying maybe and maybe not? lol
blackpink shaking #krunk
scoffs shaking her head please outside often both know that
jay panorama reported people lived that tower survivors are really suggesting only are dead
...



Py4JJavaError: An error occurred while calling o33.awaitTerminationOrTimeout.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/usr/local/src/spark/spark-2.0.2-bin-hadoop2.7/python/pyspark/streaming/util.py", line 65, in call
    r = self.func(t, *rdds)
  File "/usr/local/src/spark/spark-2.0.2-bin-hadoop2.7/python/pyspark/streaming/dstream.py", line 159, in <lambda>
    func = lambda t, rdd: old_func(rdd)
  File "<ipython-input-10-977c4d20cde0>", line 1, in <lambda>
    tweet_df = mess.foreachRDD(lambda rdd: rdd.toDF("text"))
  File "/usr/local/src/spark/spark-2.0.2-bin-hadoop2.7/python/pyspark/sql/session.py", line 57, in toDF
    return sparkSession.createDataFrame(self, schema, sampleRatio)
  File "/usr/local/src/spark/spark-2.0.2-bin-hadoop2.7/python/pyspark/sql/session.py", line 492, in createDataFrame
    schema = _parse_datatype_string(schema)
  File "/usr/local/src/spark/spark-2.0.2-bin-hadoop2.7/python/pyspark/sql/types.py", line 845, in _parse_datatype_string
    return _parse_basic_datatype_string(s)
  File "/usr/local/src/spark/spark-2.0.2-bin-hadoop2.7/python/pyspark/sql/types.py", line 739, in _parse_basic_datatype_string
    raise ValueError("Could not parse datatype: %s" % s)
ValueError: Could not parse datatype: text

	at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
	at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
	at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
	at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:247)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:247)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:247)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:246)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:748)


-------------------------------------------
Time: 2017-06-20 14:50:00
-------------------------------------------
Tweets in this batch: 61

-------------------------------------------
Time: 2017-06-20 14:50:00
-------------------------------------------
the whales are fed with our shit
she shaking them titties girl yes
gangs aggressive killer whales are shaking down alaska fishing boats for their fish report
imagine this dance king hit the stage already shaking
aggressive shaking down
your area expertise sir
got shaking hands during #mmvas slow hands
ara won shaking #whatsmynamestwin
your polls have never been close being right they were hillary would won landslide fire your polling agent lol
residents flee after tsunami hit greenland following earthquake
...

-------------------------------------------
Time: 2017-06-20 14:51:00
-------------------------------------------
Tweets in this batch: 41

-------------------------------------------
Time: 2017-06-20 14:51:00
-------------------

## Sort the author count

In [8]:
author_counts_sorted_dstream = author_counts.transform(\
  (lambda foo:foo\
   .sortBy(lambda x:( -x[1]))))

In [9]:
author_counts_sorted_dstream.pprint()

## Get top 5 authors by tweet count

In [10]:
top_five_authors = author_counts_sorted_dstream.transform\
  (lambda rdd:sc.parallelize(rdd.take(5)))
top_five_authors.pprint()

## Get authors with more than one tweet, or whose username starts with 'a'

In [13]:
filtered_authors = author_counts.filter(lambda x:\
                                                x[1]>1 \
                                                or \
                                                x[0].lower().startswith('rm'))

In [14]:
filtered_authors.transform\
  (lambda rdd:rdd\
  .sortBy(lambda x:-x[1]))\
  .pprint()

## List the most common words in the tweets

In [15]:
parsed.\
    flatMap(lambda tweet:tweet['text'].split(" "))\
    .countByValue()\
    .transform\
      (lambda rdd:rdd.sortBy(lambda x:-x[1]))\
    .pprint()