# Preparing the environment

## Import dependencies

In [1]:
#    Spark
from pyspark import SparkContext
#    Spark Streaming
from pyspark.streaming import StreamingContext
#    Kafka
from pyspark.streaming.kafka import KafkaUtils
#    json parsing
import json
#    print function
from __future__ import print_function
#    os
import sys

## Create Streaming Context

In [2]:
zkQuorum = "localhost:2181"
topic = "twitter-stream"
seconds_to_run = 60
ssc = StreamingContext(sc, seconds_to_run)

tweets = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})

## Parse the inbound message as json

The inbound stream is a DStream, which supports various built-in transformations such as map which is used here to parse the inbound messages from their native JSON format.
Note that this will fail horribly if the inbound message isn't valid JSON.

In [3]:
parsed = tweets.map(lambda v: json.loads(v[1]))

In [4]:
parsed.count().map(lambda x:'Tweets in this batch: %s' % x).pprint()

## Extract Text from each tweet

In [5]:
text_dstream = parsed.map(lambda tweet: tweet['text'])
#text_dstream.pprint()

## Process the extracted text 

In [6]:
import nltk
import csv
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [7]:
def filter_out_unicode(tweet):
  
    try:
        clean_tweet = str(tweet)
    except UnicodeEncodeError:
        pass
    return clean_tweet

def expand_around_chars(text, characters):
    for char in characters:
        text = text.replace(char, ' ' + char + ' ')
    return text

def strip_quotations_newline(text):
    clean_tweet = ' '.join(text.split())
    clean_tweet = clean_tweet.encode('utf-8')
    clean_tweet = clean_tweet.replace('",\'','')
    clean_tweet = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', "", clean_tweet)
    clean_tweet = re.sub(r'''(@[A-Za-z0-9]+)''', "", clean_tweet)
    clean_tweet = re.sub("([0-9]+)", "", clean_tweet)
    clean_tweet = re.sub(r'[^\x00-\x7F]+','', clean_tweet)
    return clean_tweet

def split_text(text):
    text = strip_quotations_newline(text)
    text = expand_around_chars(text, '\/".,()&[]{}:;!-_\'')
    splitted_text = text.split(' ')
    cleaned_text = [x for x in splitted_text if len(x) > 2]
    text_lowercase = [x.lower() for x in cleaned_text]
    return text_lowercase

In [8]:
mess = text_dstream.map(lambda text: ' '.join(split_text(text)))
#mess.pprint()

## Load feature extraction pipeline

In [16]:
from pyspark.ml import PipelineModel
pModel = PipelineModel.load('target/tmp/pythonFeatExtractor')

In [14]:
from pyspark.sql import DataFrame,Row
mess_row = mess.map(lambda text: Row(tweet_text=text))
mess_df = mess_row.map(lambda row: row.toDF())

In [18]:
mess_feat = mess_df.map(lambda df: pModel.transform(df))

## Create clean tweet

## Convert data to be classified

In [10]:
from pyspark.sql import DataFrame,Row
def process(rdd):
    
    #try:
        
    # Convert RDD[String] to RDD[Row] to DataFrame
    rowRdd = rdd.map(lambda w: Row(tweet_text=w))
    #rowRdd.collect()
    #tweetDataFrame = spark.createDataFrame(rowRdd)
    #tweetDataFrame.show()
    # Creates a temporary view using the DataFrame.
    #tweetDataFrame.createOrReplaceTempView("tweets")
    #only_tweets = spark.sql("SELECT tweet_text FROM tweets")
    #featVectors = pModel.transform(only_tweets)
    #featVectors.select("features").show()
    
    #except:
        
        #pass        

In [11]:
mess.foreachRDD(process)

## Start the streaming context

In [19]:
ssc.start()
ssc.awaitTermination(timeout=180)

-------------------------------------------
Time: 2017-06-28 01:27:00
-------------------------------------------



Py4JJavaError: An error occurred while calling o33.awaitTerminationOrTimeout.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/usr/local/src/spark/spark-2.0.2-bin-hadoop2.7/python/pyspark/streaming/util.py", line 67, in call
    return r._jrdd
  File "/usr/local/src/spark/spark-2.0.2-bin-hadoop2.7/python/pyspark/sql/types.py", line 1502, in __getattr__
    raise AttributeError(item)
AttributeError: _jrdd

	at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
	at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
	at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
	at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:247)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:247)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:247)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:246)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:748)


-------------------------------------------
Time: 2017-06-28 01:27:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-28 01:27:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-28 01:27:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-28 01:28:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-28 01:28:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-28 01:28:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-28 01:28:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-28 01:29:00
-------------------------------------------

-------------------------------------------
Time: 2017-06-28 01:29:00
----------