In [1]:
# Tunable parameters of the volatility stream

numOfRendements = 10    # how many returns are aggregated into one proxy (there are 4221 prices in the batch)
numOfColumnsX = 3       # width of one sample row, intercept column included
timeBetweenPrices = 3   # spacing between two consecutive prices on the resampling grid

In [2]:
# Kafka and Spark setup

import os
import numpy as np

# Arguments for the pyspark launcher: the Spark UI is moved to port 4041 and
# the Kafka 0.9.0.0 artifacts are requested through --packages.
# NOTE(review): presumably this has to be set before the SparkContext is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--conf spark.ui.port=4041 '
    '--packages org.apache.kafka:kafka_2.11:0.9.0.0,org.apache.kafka:kafka-clients:0.9.0.0'
    '  pyspark-shell'
)

In [3]:
import time

from kafka import KafkaProducer
from pyspark import SparkContext

# Spark context for this notebook: master "local[1]", application name "KafkaSendStream".
sc = SparkContext(master="local[1]", appName="KafkaSendStream")

In [None]:
import json
from satori.rtm.client import make_client, SubscriptionMode
from math import log

# Streams Bitstamp BTC/USD prices from Satori, resamples them on a regular
# time grid, turns the log-returns into volatility proxies and publishes the
# standardised samples on the Kafka topic 'volatility'.
# Relies on names defined in earlier cells: np, time, KafkaProducer,
# numOfRendements, numOfColumnsX and timeBetweenPrices.
# The main loop never terminates by itself; interrupt the kernel to stop it.

# Connection to Satori

endpoint = "wss://open-data.api.satori.com"
appkey = "dC6c33Fbb5ECdAC1Ef2aB77dcBfBB0B0"
channel = "cryptocurrency-market-data"

with make_client(endpoint=endpoint, appkey=appkey) as client:
    print('Connected to Satori RTM!')
    messagebox=[]  # Messages received from the subscription, oldest first

    class SubscriptionObserver(object):
        """Subscription callback object that queues every received message in `messagebox`."""

        def on_subscription_data(self, data):
            for message in data['messages']:
                messagebox.append(message)

    subscription_observer = SubscriptionObserver()
    client.subscribe(
        channel,
        SubscriptionMode.SIMPLE,
        subscription_observer,
        {'filter':'select * from `cryptocurrency-market-data` where exchange = "Bitstamp" and basecurrency = "USD" and cryptocurrency="BTC"'})
        # Stream SQL filter
    producer = KafkaProducer(bootstrap_servers='localhost:9092')
    try:
        i=0                                          # Number of samples seen so far
        sumOfProxies=np.zeros(numOfColumnsX)         # Running sum of each column
        sumOfSquaredProxies=np.zeros(numOfColumnsX)  # Running sum of squares of each column
        while not messagebox: # Waiting for a first price to arrive
            time.sleep(1)
        # Messages are consumed oldest first (pop(0)) so that prices are
        # processed in arrival order; pop() would take the newest one and the
        # older ones would later be discarded as being in the past.
        firstMessage=messagebox.pop(0)
        lastPrice=float(firstMessage['price']) # Get that first price
        lastTime=int(firstMessage['timestamp'])
        rendements=np.empty(0)   # Sliding window of the latest log-returns
        proxies=np.empty(0)      # Sliding window of the latest proxies
        interpolatedTimes=[]     # Grid times still to be consumed
        interpolatedPrices=[]    # Prices interpolated at those grid times
        while True:

            # Computation of the proxies

            proxies=proxies[1:]
            while proxies.size<numOfColumnsX: # Not -1 because the last proxy will be used as y
                if not interpolatedTimes:
                    if not messagebox:
                        time.sleep(1) # Nothing to process yet: wait for the next price
                        continue
                    newMessage=messagebox.pop(0)
                    newMessagePrice=float(newMessage['price'])
                    newMessageTime=int(newMessage['timestamp'])
                    # The grid starts one step AFTER the last consumed point.
                    # Starting at lastTime itself would re-emit the last price
                    # and add a spurious zero return for every message.
                    # A message that is not at least one step ahead yields an
                    # empty grid and is skipped.
                    interpolatedTimes=list(range(lastTime+timeBetweenPrices, newMessageTime, timeBetweenPrices))
                    interpolatedPrices=list(np.interp(interpolatedTimes, [lastTime, newMessageTime], [lastPrice, newMessagePrice]))
                while interpolatedTimes and proxies.size<numOfColumnsX:
                    newPrice=interpolatedPrices.pop(0)
                    rendement=np.log(newPrice/lastPrice) # Log-return between two grid points
                    lastPrice=newPrice
                    lastTime=interpolatedTimes.pop(0)
                    rendements=np.append(rendements, rendement)
                    if rendements.size==numOfRendements:
                        # Proxy = mean of the squared returns over the window
                        proxies=np.append(proxies,np.sum(np.square(rendements))/numOfRendements)
                        rendements=rendements[1:]

            # Normalisation of the proxies

            sumOfProxies+=proxies
            sumOfSquaredProxies+=proxies**2
            mean=sumOfProxies/(i+1)                        # Update the mean of each column
            # Rounding can make E[x^2]-E[x]^2 slightly negative; clamp it so
            # that the square root never returns NaN.
            variance=np.maximum(sumOfSquaredProxies/(i+1)-mean**2, 0.0)
            std=np.sqrt(variance)                          # Update the standard deviation of each column
            # The very first sample and any sample whose column has zero
            # spread cannot be standardised; they only feed the running sums.
            if i!=0 and np.all(std>0):
                y=(proxies[-1]-mean[-1])/std[-1]
                # Standardised features with the intercept column t_0 (=1) prepended
                features=np.insert((proxies[:numOfColumnsX-1]-mean[:numOfColumnsX-1])/std[:numOfColumnsX-1],0,1)
                message=np.array2string(np.append([i,y], features),separator=",")
                # KafkaProducer expects bytes when no value_serializer is configured
                producer.send('volatility', message.encode('utf-8'))
                print(message)
            i=i+1
    finally:
        # Release the Kafka connection when the loop is interrupted or fails
        producer.close()