In [1]:
import time
import re, ast
import numpy as np
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'



In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
conf = SparkConf() \
    .setAppName("Streaming test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "127.0.0.1") # "local[2]" to run locally with 2 cores
sc = SparkContext(conf=conf) 
sqlContext=SQLContext(sc)
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [3]:
ssc = StreamingContext(sc, 5) 
# 0.1 = batchDuration
# Main entry point for Spark Streaming functionality. A StreamingContext represents the
# connection to a Spark cluster, and can be used to create DStream various input sources.
# A Discretized Stream (DStream), the basic abstraction in Spark Streaming, 
# is a continuous sequence of RDDs (of the same type) representing a continuous stream of data
# DStreams can either be created from live data (such as, data from TCP sockets, Kafka, Flume, etc.) using a 
# StreamingContext or it can be generated by transforming existing DStreams using operations such as map and window.
# https://blog.jetoile.fr/2014/05/rdd-quest-ce-que-cest.html
ssc.checkpoint("checkpoint")
# Sets the context to periodically checkpoint the DStream operations for master fault-tolerance.
# Parameters:	directory – HDFS-compatible directory where the checkpoint data will be reliably stored
ssc.remember(1)
# Set each DStreams in this context to remember RDDs it generated in the last given duration. 

In [4]:
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'volatility': 1})
# createStream(ssc, zkQuorum, groupId, topics)
# Parameters:	
#				ssc – StreamingContext object
#				zkQuorum – Zookeeper quorum (hostname:port,hostname:port,..).
#				groupId – The group id for this consumer.
#				topics – Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread.
# Topic partitions in Kafka does not correlate to partitions of RDDs generated in Spark Streaming. 

In [5]:
def readInput(line):
    # Convert the input string into a pair of numbers
    print('now')
    vec= [float(x) for x in line.split()] 


    # The pair represents the inputs for two model (forgetting factor 1 and 0.99)
    return [('mod1',('mod1',np.array(vec))),('mod2',('mod2',np.array(vec)))]

In [6]:
def updateFunction(new_values, state): 
    ## RLS update function, formulas slide 51
    ## new_values = (key, [i, y, proxies])
    ## state = what is given by sc.parallelize in next cell on the first call
    ##       = what this function returned on the other calls
    if (len(new_values)>0 ):
        
        key=new_values[0][0]
        yx=new_values[0][1]
        i=yx[0]
        y=yx[1]
        x=yx[2:]
        n=len(x)
        
        beta=state[1]
        beta.shape=(n,1) # Transorms beta in a matrix of n rows and 1 column
        V=state[2]
        mu=state[3]
        sse=state[4]  ## sum of squared errors
        N=state[5]    ## number of treated samples
        x.shape=(1,n)
        err=y-x.dot(beta)        
        sse=sse+pow(err,2.0)
        V=1.0/mu*(V-V.dot(x.T).dot(x).dot(V)/(1.0+float(x.dot(V).dot(x.T)))) # dot = matrix multiplication
                                                                             # .T = Transpose
        gamma=V.dot(x.T)
        beta=beta+gamma*err
        proxyPrediction=x.dot(beta)
        
        errNaivePrec=y-x[0][-1]
        errNaiveMean=y-np.mean(x[0][1:])
        sseNaivePrec=state[7]
        sseNaiveMean=state[8]
        sseNaivePrec=sseNaivePrec+pow(errNaivePrec,2.0)
        sseNaiveMean=sseNaiveMean+pow(errNaiveMean,2.0)
        
        MSE_RLS=sse/(N+1.0)
        MSE_NaivePrec=sseNaivePrec/(N+1.0)
        MSE_NaiveMean=sseNaiveMean/(N+1.0)
        
        if (key=='mod1'):
            return (key,beta,V,mu,MSE_RLS,N+1, proxyPrediction, sseNaivePrec, sseNaiveMean, MSE_RLS/MSE_NaivePrec, MSE_RLS/MSE_NaiveMean)  ## update formula mod1
        else:
            return (key,beta,V,mu,MSE_RLS,N+1, proxyPrediction, sseNaivePrec, sseNaiveMean, MSE_RLS/MSE_NaivePrec, MSE_RLS/MSE_NaiveMean)  ## update formula mod1
        
    else:
        return state

In [7]:
import re, ast
n=3 # number of features counting t_0
beta1=np.zeros(n)  ## initial parameter vector slide 49
beta2=np.zeros(n)
V1=np.diag(np.zeros(n)+10) ## initial covariance matrix slide 49
V2=np.diag(np.zeros(n)+1)
mu1=1.0 # forgetting factor slide 50
mu2=0.99
data = kvs.map(lambda x: np.array(ast.literal_eval(x[1])))
# map: Returns a new DStream by applying a function to each element of DStream.
# literal_eval: This can be used for safely evaluating strings containing Python values (strings, numbers, tuples, lists, dicts, booleans, and None)
# from untrusted sources without the need to parse the values oneself.
data=data.flatMap(lambda x: [('mod1',('mod1',1.0*np.array(x))),
                            ('mod2',('mod2',1.0*np.array(x)))])
data.pprint() # prints received array
# Return a new DStream by applying a function to all elements of this DStream, and then flattening the results
#initialStateRDD = sc.parallelize([('k',([1,2,3]))])
initialStateRDD = sc.parallelize([(u'mod1', ('mod1',beta1,V1,mu1,0,0,0,0,0)),
                                  (u'mod2', ('mod2',beta2,V2,mu2,0,0,0,0,0))])
# The elements of the collection [] are copied to form a distributed dataset that can be operated on in parallel.
# ('mod1',beta1,V1,mu1,0,0,0)) == what updateFunction returns
data2=data.updateStateByKey(updateFunction,initialRDD=initialStateRDD)
# Return a new "state" DStream where the state for each key is updated by applying the given function on the previous
# state of the key and the new values for the key. This can be used to maintain arbitrary state data for each key.
#data.pprint()
data2.map(lambda x: [x[1][i] for i in [0,1,6,4,9,10]]).pprint() 
# prints key, beta, proxyPrediction, MSE_RLS, NMSE_prec, NMSE_mean

In [8]:
ssc.start()


-------------------------------------------
Time: 2018-04-08 22:52:20
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:52:25
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:52:30
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:52:35
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:52:40
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:52:45
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:52:50
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:52:55
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:53:00
----------

-------------------------------------------
Time: 2018-04-08 22:54:10
-------------------------------------------
['mod2', array([[-0.02056622],
       [ 0.57656916],
       [ 0.07305949]]), array([[ 0.32238696]]), array([[ 0.07979813]]), array([[ 0.10468428]]), array([[ 0.2088008]])]
['mod1', array([[-0.80941632],
       [ 1.34848973],
       [-0.21592313]]), array([[-0.02539729]]), array([[ 0.04355574]]), array([[ 0.05713921]]), array([[ 0.11396852]])]

-------------------------------------------
Time: 2018-04-08 22:54:15
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:54:15
-------------------------------------------
['mod2', array([[-0.02056622],
       [ 0.57656916],
       [ 0.07305949]]), array([[ 0.32238696]]), array([[ 0.07979813]]), array([[ 0.10468428]]), array([[ 0.2088008]])]
['mod1', array([[-0.80941632],
       [ 1.34848973],
       [-0.21592313]]), array([[-0.02539729]]), array([[ 0.04355574]]), array([[ 0.057

-------------------------------------------
Time: 2018-04-08 22:54:55
-------------------------------------------
['mod2', array([[-0.07399566],
       [ 0.3870372 ],
       [ 0.33467585]]), array([[-0.01127404]]), array([[ 0.00055776]]), array([[ 0.0013261]]), array([[ 0.00167444]])]
['mod1', array([[-0.11103986],
       [ 0.45709888],
       [ 0.31715138]]), array([[-0.04099659]]), array([[ 0.00120523]]), array([[ 0.0028655]]), array([[ 0.00361821]])]

-------------------------------------------
Time: 2018-04-08 22:55:00
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:55:00
-------------------------------------------
['mod2', array([[-0.07399566],
       [ 0.3870372 ],
       [ 0.33467585]]), array([[-0.01127404]]), array([[ 0.00055776]]), array([[ 0.0013261]]), array([[ 0.00167444]])]
['mod1', array([[-0.11103986],
       [ 0.45709888],
       [ 0.31715138]]), array([[-0.04099659]]), array([[ 0.00120523]]), array([[ 0.0028

-------------------------------------------
Time: 2018-04-08 22:55:40
-------------------------------------------
['mod2', array([[-0.25058975],
       [ 0.42109194],
       [ 0.50851692]]), array([[-1.31723698]]), array([[ 0.00134114]]), array([[ 0.00413977]]), array([[ 0.00428421]])]
['mod1', array([[-0.26351501],
       [ 0.47162441],
       [ 0.48224186]]), array([[-1.33635285]]), array([[ 0.00132631]]), array([[ 0.00409399]]), array([[ 0.00423684]])]

-------------------------------------------
Time: 2018-04-08 22:55:45
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:55:45
-------------------------------------------
['mod2', array([[-0.25058975],
       [ 0.42109194],
       [ 0.50851692]]), array([[-1.31723698]]), array([[ 0.00134114]]), array([[ 0.00413977]]), array([[ 0.00428421]])]
['mod1', array([[-0.26351501],
       [ 0.47162441],
       [ 0.48224186]]), array([[-1.33635285]]), array([[ 0.00132631]]), array([[ 0.0

-------------------------------------------
Time: 2018-04-08 22:56:25
-------------------------------------------
['mod2', array([[-0.24815223],
       [ 0.30039506],
       [ 0.60155031]]), array([[-2.04835248]]), array([[ 0.00018712]]), array([[ 0.00066132]]), array([[ 0.00061735]])]
['mod1', array([[-0.24589198],
       [ 0.32995004],
       [ 0.58697991]]), array([[-2.07605976]]), array([[ 0.00040459]]), array([[ 0.00142986]]), array([[ 0.0013348]])]

-------------------------------------------
Time: 2018-04-08 22:56:30
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:56:30
-------------------------------------------
['mod2', array([[-0.24815223],
       [ 0.30039506],
       [ 0.60155031]]), array([[-2.04835248]]), array([[ 0.00018712]]), array([[ 0.00066132]]), array([[ 0.00061735]])]
['mod1', array([[-0.24589198],
       [ 0.32995004],
       [ 0.58697991]]), array([[-2.07605976]]), array([[ 0.00040459]]), array([[ 0.00

-------------------------------------------
Time: 2018-04-08 22:57:10
-------------------------------------------
['mod2', array([[-0.23519428],
       [ 0.28146652],
       [ 0.59993545]]), array([[-1.41474562]]), array([[ 0.00023744]]), array([[ 0.00102558]]), array([[ 0.00095736]])]
['mod1', array([[-0.23038836],
       [ 0.30688892],
       [ 0.58715094]]), array([[-1.42693542]]), array([[ 0.00031062]]), array([[ 0.00134169]]), array([[ 0.00125245]])]

-------------------------------------------
Time: 2018-04-08 22:57:15
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:57:15
-------------------------------------------
['mod2', array([[-0.23519428],
       [ 0.28146652],
       [ 0.59993545]]), array([[-1.41474562]]), array([[ 0.00023744]]), array([[ 0.00102558]]), array([[ 0.00095736]])]
['mod1', array([[-0.23038836],
       [ 0.30688892],
       [ 0.58715094]]), array([[-1.42693542]]), array([[ 0.00031062]]), array([[ 0.0

-------------------------------------------
Time: 2018-04-08 22:57:55
-------------------------------------------
['mod2', array([[-0.16106349],
       [ 0.20548797],
       [ 0.69986228]]), array([[-0.04639601]]), array([[ 0.00019174]]), array([[ 0.00096309]]), array([[ 0.00086758]])]
['mod1', array([[-0.16590084],
       [ 0.22946217],
       [ 0.68210329]]), array([[-0.04259792]]), array([[ 0.00017482]]), array([[ 0.00087811]]), array([[ 0.00079103]])]

-------------------------------------------
Time: 2018-04-08 22:58:00
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:58:00
-------------------------------------------
['mod2', array([[-0.16106349],
       [ 0.20548797],
       [ 0.69986228]]), array([[-0.04639601]]), array([[ 0.00019174]]), array([[ 0.00096309]]), array([[ 0.00086758]])]
['mod1', array([[-0.16590084],
       [ 0.22946217],
       [ 0.68210329]]), array([[-0.04259792]]), array([[ 0.00017482]]), array([[ 0.0

-------------------------------------------
Time: 2018-04-08 22:58:40
-------------------------------------------
('mod1', ('mod1', array([ 122.        ,    0.14961683,    1.        ,    0.18938342,
          0.16890643])))
('mod2', ('mod2', array([ 122.        ,    0.14961683,    1.        ,    0.18938342,
          0.16890643])))
('mod1', ('mod1', array([ 123.        ,    0.14036135,    1.        ,    0.17874783,
          0.15962526])))
('mod2', ('mod2', array([ 123.        ,    0.14036135,    1.        ,    0.17874783,
          0.15962526])))
('mod1', ('mod1', array([ 124.        ,    0.14113026,    1.        ,    0.16945043,
          0.15035466])))
('mod2', ('mod2', array([ 124.        ,    0.14113026,    1.        ,    0.16945043,
          0.15035466])))
('mod1', ('mod1', array([  1.25000000e+02,  -1.22844618e-01,   1.00000000e+00,
         1.60164448e-01,   1.51082285e-01])))
('mod2', ('mod2', array([  1.25000000e+02,  -1.22844618e-01,   1.00000000e+00,
         1.60164448e-0

In [9]:
ssc.stop(stopSparkContext=False,stopGraceFully=True)

-------------------------------------------
Time: 2018-04-08 22:59:05
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:59:05
-------------------------------------------
['mod2', array([[-0.13258745],
       [ 0.14427761],
       [ 0.77113328]]), array([[-1.38407685]]), array([[  2.74415364e-05]]), array([[ 0.00016327]]), array([[ 0.00014083]])]
['mod1', array([[-0.13965728],
       [ 0.17300249],
       [ 0.74703377]]), array([[-1.3901014]]), array([[  2.19944964e-05]]), array([[ 0.00013086]]), array([[ 0.00011287]])]

-------------------------------------------
Time: 2018-04-08 22:59:10
-------------------------------------------

-------------------------------------------
Time: 2018-04-08 22:59:10
-------------------------------------------
['mod2', array([[-0.13258745],
       [ 0.14427761],
       [ 0.77113328]]), array([[-1.38407685]]), array([[  2.74415364e-05]]), array([[ 0.00016327]]), array([[ 0.00014083]])]
['mod1',

In [None]:
a=[1,2,3,4]


In [None]:
[a[i] for i in [1,0,2,3]]
