In [3]:
import time
import re, ast
import numpy as np
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'



In [4]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
conf = SparkConf() \
    .setAppName("Streaming test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "127.0.0.1") # "local[2]" to run locally with 2 cores
sc = SparkContext(conf=conf) 
sqlContext=SQLContext(sc)
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [5]:
ssc = StreamingContext(sc, 60) 
# 0.1 = batchDuration
# Main entry point for Spark Streaming functionality. A StreamingContext represents the
# connection to a Spark cluster, and can be used to create DStream various input sources.
# A Discretized Stream (DStream), the basic abstraction in Spark Streaming, 
# is a continuous sequence of RDDs (of the same type) representing a continuous stream of data
# DStreams can either be created from live data (such as, data from TCP sockets, Kafka, Flume, etc.) using a 
# StreamingContext or it can be generated by transforming existing DStreams using operations such as map and window.
# https://blog.jetoile.fr/2014/05/rdd-quest-ce-que-cest.html
ssc.checkpoint("checkpoint")
# Sets the context to periodically checkpoint the DStream operations for master fault-tolerance.
# Parameters:	directory – HDFS-compatible directory where the checkpoint data will be reliably stored
ssc.remember(1)
# Set each DStreams in this context to remember RDDs it generated in the last given duration. 

In [6]:
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test2': 1})
# createStream(ssc, zkQuorum, groupId, topics)
# Parameters:	
#				ssc – StreamingContext object
#				zkQuorum – Zookeeper quorum (hostname:port,hostname:port,..).
#				groupId – The group id for this consumer.
#				topics – Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread.
# Topic partitions in Kafka does not correlate to partitions of RDDs generated in Spark Streaming. 

In [7]:
def readInput(line):
    # Convert the input string into a pair of numbers
    print('now')
    vec= [float(x) for x in line.split()] 


    # The pair represents the inputs for two model (forgetting factor 1 and 0.99)
    return [('mod1',('mod1',np.array(vec))),('mod2',('mod2',np.array(vec)))]

In [8]:
def updateFunction(new_values, state): 
    ## RLS update function, formulas slide 51
    if (len(new_values)>0 ):
        
        key=new_values[0][0]
        yx=new_values[0][1]
        i=yx[0]
        y=yx[1]
        x=yx[2:]
        n=len(x)
        
        beta=state[1]
        beta.shape=(n,1) # Transorms beta in a matrix of n rows and 1 column
        V=state[2]
        mu=state[3]
        sse=state[4]  ## sum of squared errors
        N=state[5]    ## number of treated samples
        x.shape=(1,n)
        err=y-x.dot(beta)
        sse=sse+pow(err,2.0)
        V=1.0/mu*(V-V.dot(x.T).dot(x).dot(V)/(1.0+float(x.dot(V).dot(x.T)))) # dot = matrix multiplication
                                                                             # .T = Transpose
        gamma=V.dot(x.T)
        beta=beta+gamma*err
        if (key=='mod1'):          
            return (key,beta,V,mu,sse/(N+1.0),N+1,i)  ## update formula mod1
        else:
            return (key,beta,V,mu,sse/(N+1.0),N+1,i)  ## update formula mod2
        
    else:
        return state

In [9]:
import re, ast
n=10 # number of features
beta1=np.zeros(n)  ## initial parameter vector slide 49
beta2=np.zeros(n)
V1=np.diag(np.zeros(n)+10) ## initial covariance matrix slide 49
V2=np.diag(np.zeros(n)+1)
mu1=1.0 # forgetting factor slide 50
mu2=0.99
data = kvs.map(lambda x: np.array(ast.literal_eval(x[1])))
# map: Returns a new DStream by applying a function to each element of DStream.
# literal_eval: This can be used for safely evaluating strings containing Python values (strings, numbers, tuples, lists, dicts, booleans, and None)
# from untrusted sources without the need to parse the values oneself.
data=data.flatMap(lambda x: [('mod1',('mod1',1.0*np.array(x))),
                            ('mod2',('mod2',1.0*np.array(x)))])
data.pprint()
# Return a new DStream by applying a function to all elements of this DStream, and then flattening the results
#initialStateRDD = sc.parallelize([('k',([1,2,3]))])
initialStateRDD = sc.parallelize([(u'mod1', ('mod1',beta1,V1,mu1,0,0,0)),
                                  (u'mod2', ('mod2',beta2,V2,mu2,0,0,0))])
# The elements of the collection [] are copied to form a distributed dataset that can be operated on in parallel.
# ('mod1',beta1,V1,mu1,0,0,0)) == what updateFunction returns
data2=data.updateStateByKey(updateFunction,initialRDD=initialStateRDD)
# Return a new "state" DStream where the state for each key is updated by applying the given function on the previous
# state of the key and the new values for the key. This can be used to maintain arbitrary state data for each key.
#data.pprint()
data2.map(lambda x: [x[1][i] for i in [0,1,4]]).pprint()

In [10]:
ssc.start()


-------------------------------------------
Time: 2017-11-06 17:21:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 17:21:00
-------------------------------------------
['mod2', array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), 0]
['mod1', array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), 0]

-------------------------------------------
Time: 2017-11-06 17:22:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 17:22:00
-------------------------------------------
['mod2', array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), 0]
['mod1', array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), 0]

-------------------------------------------
Time: 2017-11-06 17:23:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 17:23:00
-------------------------------------------
['mod2', array([ 0.,  0.,  

-------------------------------------------
Time: 2017-11-06 17:36:00
-------------------------------------------
['mod2', array([[ 0.00011466],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192]]), array([[ 0.00467453]])]
['mod1', array([[ 0.00113481],
       [ 0.00120669],
       [ 0.00120669],
       [ 0.00120669],
       [ 0.00120669],
       [ 0.00120669],
       [ 0.00120669],
       [ 0.00120669],
       [ 0.00120669],
       [ 0.00120669]]), array([[ 0.00467453]])]

-------------------------------------------
Time: 2017-11-06 17:37:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 17:37:00
-------------------------------------------
['mod2', array([[ 0.00011466],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
       [ 0.00012192],
     

-------------------------------------------
Time: 2017-11-06 17:47:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 17:47:00
-------------------------------------------
['mod2', array([[ 0.00031418],
       [ 0.00032145],
       [ 0.00032145],
       [ 0.00032145],
       [ 0.00032547],
       [ 0.00032547],
       [ 0.00032547],
       [ 0.00032547],
       [ 0.00032547],
       [ 0.00032547]]), array([[ 0.00743006]])]
['mod1', array([[ 0.00308875],
       [ 0.00316059],
       [ 0.00316059],
       [ 0.00316059],
       [ 0.00319998],
       [ 0.00319998],
       [ 0.00319998],
       [ 0.00319998],
       [ 0.00319998],
       [ 0.00319998]]), array([[ 0.00742793]])]

-------------------------------------------
Time: 2017-11-06 17:48:00
-------------------------------------------
('mod1', ('mod1', array([  2.00000000e+00,   8.98569463e-02,   1.97683250e-03,
         1.97683250e-03,   1.97683250e-03,   1.97683250e-03,
      

-------------------------------------------
Time: 2017-11-06 17:57:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 17:57:00
-------------------------------------------
['mod2', array([[ 0.0006863 ],
       [ 0.00069357],
       [ 0.00069357],
       [ 0.00069357],
       [ 0.00069759],
       [ 0.00069759],
       [ 0.00069759],
       [ 0.00069759],
       [ 0.00069759],
       [ 0.00069759]]), array([[ 0.0034031]])]
['mod1', array([[ 0.00667418],
       [ 0.00674595],
       [ 0.00674595],
       [ 0.00674595],
       [ 0.00678532],
       [ 0.00678532],
       [ 0.00678532],
       [ 0.00678532],
       [ 0.00678532],
       [ 0.00678532]]), array([[ 0.00339804]])]

-------------------------------------------
Time: 2017-11-06 17:58:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 17:58:00
-------------------------------------------
['mod2', array([[ 0.0006863 ],
 

-------------------------------------------
Time: 2017-11-06 18:08:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:08:00
-------------------------------------------
['mod2', array([[ 0.00072603],
       [ 0.00073329],
       [ 0.00073329],
       [ 0.00073329],
       [ 0.00073732],
       [ 0.00073732],
       [ 0.00073732],
       [ 0.00073732],
       [ 0.00073732],
       [ 0.00073732]]), array([[ 0.0007537]])]
['mod1', array([[ 0.007049  ],
       [ 0.00712077],
       [ 0.00712077],
       [ 0.00712077],
       [ 0.00716014],
       [ 0.00716014],
       [ 0.00716014],
       [ 0.00716014],
       [ 0.00716014],
       [ 0.00716014]]), array([[ 0.00075178]])]

-------------------------------------------
Time: 2017-11-06 18:09:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:09:00
-------------------------------------------
['mod2', array([[ 0.00072603],
 

-------------------------------------------
Time: 2017-11-06 18:18:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:18:00
-------------------------------------------
['mod2', array([[ 0.00084481],
       [ 0.00085207],
       [ 0.00085207],
       [ 0.00085207],
       [ 0.00085609],
       [ 0.00085609],
       [ 0.00085609],
       [ 0.00085609],
       [ 0.00085609],
       [ 0.00085609]]), array([[  8.35740931e-05]])]
['mod1', array([[ 0.00815836],
       [ 0.00823011],
       [ 0.00823011],
       [ 0.00823011],
       [ 0.00826947],
       [ 0.00826947],
       [ 0.00826947],
       [ 0.00826947],
       [ 0.00826947],
       [ 0.00826947]]), array([[  8.29647798e-05]])]

-------------------------------------------
Time: 2017-11-06 18:19:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:19:00
-------------------------------------------
['mod2', array([[ 0.0

-------------------------------------------
Time: 2017-11-06 18:29:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:29:00
-------------------------------------------
['mod2', array([[ 0.00102378],
       [ 0.00103104],
       [ 0.00103104],
       [ 0.00103104],
       [ 0.00103507],
       [ 0.00103507],
       [ 0.00103507],
       [ 0.00103507],
       [ 0.00103507],
       [ 0.00103507]]), array([[ 0.00088338]])]
['mod1', array([[ 0.0098025 ],
       [ 0.00987422],
       [ 0.00987422],
       [ 0.00987422],
       [ 0.00991356],
       [ 0.00991356],
       [ 0.00991356],
       [ 0.00991356],
       [ 0.00991356],
       [ 0.00991356]]), array([[ 0.00088026]])]

-------------------------------------------
Time: 2017-11-06 18:30:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:30:00
-------------------------------------------
['mod2', array([[ 0.00102378],


-------------------------------------------
Time: 2017-11-06 18:40:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:40:00
-------------------------------------------
['mod2', array([[ 0.00112498],
       [ 0.00113224],
       [ 0.00113224],
       [ 0.00113224],
       [ 0.00113626],
       [ 0.00113626],
       [ 0.00113626],
       [ 0.00113626],
       [ 0.00113626],
       [ 0.00113626]]), array([[ 0.00034132]])]
['mod1', array([[ 0.01072069],
       [ 0.01079239],
       [ 0.01079239],
       [ 0.01079239],
       [ 0.01083173],
       [ 0.01083173],
       [ 0.01083173],
       [ 0.01083173],
       [ 0.01083173],
       [ 0.01083173]]), array([[ 0.00033915]])]

-------------------------------------------
Time: 2017-11-06 18:41:00
-------------------------------------------
('mod1', ('mod1', array([  9.00000000e+00,   1.80727044e-02,   1.97683250e-03,
         1.97683250e-03,   1.97683250e-03,   1.97683250e-03,
      

-------------------------------------------
Time: 2017-11-06 18:50:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:50:00
-------------------------------------------
['mod2', array([[ 0.00126153],
       [ 0.00126879],
       [ 0.00126879],
       [ 0.00126879],
       [ 0.00127281],
       [ 0.00127281],
       [ 0.00127281],
       [ 0.00127281],
       [ 0.00127281],
       [ 0.00127281]]), array([[ 0.00018207]])]
['mod1', array([[ 0.01193471],
       [ 0.01200639],
       [ 0.01200639],
       [ 0.01200639],
       [ 0.01204572],
       [ 0.01204572],
       [ 0.01204572],
       [ 0.01204572],
       [ 0.01204572],
       [ 0.01204572]]), array([[ 0.00018041]])]

-------------------------------------------
Time: 2017-11-06 18:51:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 18:51:00
-------------------------------------------
['mod2', array([[ 0.00126153],


-------------------------------------------
Time: 2017-11-06 19:01:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:01:00
-------------------------------------------
['mod2', array([[ 0.00139939],
       [ 0.0014067 ],
       [ 0.0014067 ],
       [ 0.00140784],
       [ 0.00141285],
       [ 0.00141285],
       [ 0.00141338],
       [ 0.00142264],
       [ 0.0014496 ],
       [ 0.0014496 ]]), array([[ 0.00033335]])]
['mod1', array([[ 0.0131468 ],
       [ 0.01321892],
       [ 0.01321892],
       [ 0.013229  ],
       [ 0.01327703],
       [ 0.01327703],
       [ 0.0132817 ],
       [ 0.01336349],
       [ 0.0136015 ],
       [ 0.0136015 ]]), array([[ 0.00033087]])]

-------------------------------------------
Time: 2017-11-06 19:02:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:02:00
-------------------------------------------
['mod2', array([[ 0.00139939],


-------------------------------------------
Time: 2017-11-06 19:12:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:12:00
-------------------------------------------
['mod2', array([[ 0.00167363],
       [ 0.00168095],
       [ 0.00168095],
       [ 0.00168209],
       [ 0.00168709],
       [ 0.00171228],
       [ 0.00171315],
       [ 0.00172241],
       [ 0.00174937],
       [ 0.00174937]]), array([[ 0.00071855]])]
['mod1', array([[ 0.01553358],
       [ 0.01560566],
       [ 0.01560566],
       [ 0.01561573],
       [ 0.01566373],
       [ 0.01588387],
       [ 0.01589147],
       [ 0.01597319],
       [ 0.016211  ],
       [ 0.016211  ]]), array([[ 0.00071375]])]

-------------------------------------------
Time: 2017-11-06 19:13:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:13:00
-------------------------------------------
['mod2', array([[ 0.00167363],


-------------------------------------------
Time: 2017-11-06 19:23:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:23:00
-------------------------------------------
['mod2', array([[ 0.00198494],
       [ 0.00199225],
       [ 0.00199225],
       [ 0.00199339],
       [ 0.0019984 ],
       [ 0.00202358],
       [ 0.00202445],
       [ 0.00203371],
       [ 0.00206067],
       [ 0.00206067]]), array([[ 0.00073153]])]
['mod1', array([[ 0.01821271],
       [ 0.01828474],
       [ 0.01828474],
       [ 0.0182948 ],
       [ 0.01834277],
       [ 0.01856275],
       [ 0.01857034],
       [ 0.01865198],
       [ 0.01888957],
       [ 0.01888957]]), array([[ 0.00072574]])]

-------------------------------------------
Time: 2017-11-06 19:24:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:24:00
-------------------------------------------
['mod2', array([[ 0.00198494],


-------------------------------------------
Time: 2017-11-06 19:34:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:34:00
-------------------------------------------
['mod2', array([[ 0.00210918],
       [ 0.00211672],
       [ 0.00211685],
       [ 0.00211799],
       [ 0.002123  ],
       [ 0.00214818],
       [ 0.00214905],
       [ 0.00215831],
       [ 0.00218527],
       [ 0.00218527]]), array([[ 0.00014791]])]
['mod1', array([[ 0.01926224],
       [ 0.01933612],
       [ 0.01933726],
       [ 0.01934732],
       [ 0.01939528],
       [ 0.0196152 ],
       [ 0.01962279],
       [ 0.0197044 ],
       [ 0.0199419 ],
       [ 0.0199419 ]]), array([[ 0.00014518]])]

-------------------------------------------
Time: 2017-11-06 19:35:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:35:00
-------------------------------------------
['mod2', array([[ 0.00210918],


-------------------------------------------
Time: 2017-11-06 19:45:00
-------------------------------------------
('mod1', ('mod1', array([  1.60000000e+01,   8.18766569e-02,   2.79931203e-03,
         2.79931203e-03,   2.79931203e-03,   2.79931203e-03,
         2.79931203e-03,   2.79931203e-03,   2.79931203e-03,
         2.79931203e-03,   2.79931203e-03,   2.79931203e-03])))
('mod2', ('mod2', array([  1.60000000e+01,   8.18766569e-02,   2.79931203e-03,
         2.79931203e-03,   2.79931203e-03,   2.79931203e-03,
         2.79931203e-03,   2.79931203e-03,   2.79931203e-03,
         2.79931203e-03,   2.79931203e-03,   2.79931203e-03])))

-------------------------------------------
Time: 2017-11-06 19:45:00
-------------------------------------------
['mod2', array([[ 0.00250098],
       [ 0.00250851],
       [ 0.00250865],
       [ 0.00250979],
       [ 0.0025148 ],
       [ 0.00253998],
       [ 0.00254084],
       [ 0.00255089],
       [ 0.00257784],
       [ 0.00257784]]), array([[ 0

-------------------------------------------
Time: 2017-11-06 19:55:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:55:00
-------------------------------------------
['mod2', array([[ 0.00284092],
       [ 0.00284845],
       [ 0.00284858],
       [ 0.00284972],
       [ 0.00285473],
       [ 0.00287991],
       [ 0.00288077],
       [ 0.00289082],
       [ 0.00291777],
       [ 0.00291777]]), array([[ 0.00059388]])]
['mod1', array([[ 0.02532184],
       [ 0.0253956 ],
       [ 0.02539674],
       [ 0.02540678],
       [ 0.02545468],
       [ 0.02567421],
       [ 0.02568179],
       [ 0.02576983],
       [ 0.02600683],
       [ 0.02600683]]), array([[ 0.00058722]])]

-------------------------------------------
Time: 2017-11-06 19:56:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 19:56:00
-------------------------------------------
['mod2', array([[ 0.00284092],


In [11]:
ssc.stop(stopSparkContext=False,stopGraceFully=True)

-------------------------------------------
Time: 2017-11-06 20:02:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 20:02:00
-------------------------------------------
['mod2', array([[ 0.00308583],
       [ 0.00309336],
       [ 0.0030935 ],
       [ 0.00309464],
       [ 0.00309965],
       [ 0.00312482],
       [ 0.00312569],
       [ 0.00313574],
       [ 0.00316268],
       [ 0.00316268]]), array([[ 0.00030686]])]
['mod1', array([[ 0.02730992],
       [ 0.02738364],
       [ 0.02738479],
       [ 0.02739481],
       [ 0.02744269],
       [ 0.02766211],
       [ 0.02766968],
       [ 0.02775765],
       [ 0.02799449],
       [ 0.02799449]]), array([[ 0.00030168]])]

-------------------------------------------
Time: 2017-11-06 20:03:00
-------------------------------------------

-------------------------------------------
Time: 2017-11-06 20:03:00
-------------------------------------------
['mod2', array([[ 0.00308583],


In [None]:
a=[1,2,3,4]


In [None]:
[a[i] for i in [1,0,2,3]]
