In [1]:
# innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
 
# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
 
# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
 
# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"
 

# install findspark using pip
!pip install -q findspark

In [2]:
import numpy as np
import pandas as pd

In [3]:
bitfinex = np.load('/content/bitfinex_order_array.npy', allow_pickle=True)
bitfinex

array([{'orders': [[0.02701, 4, 19.63549352], [0.027, 4, 17.11867786], [0.02699, 3, 20.36402743], [0.02698, 2, 21.7853], [0.02697, 4, 43.29962648], [0.02696, 2, 7.45266557], [0.02695, 2, 4.90705179], [0.02694, 1, 5.6], [0.02693, 2, 42.37741964], [0.02692, 9, 13.4953], [0.02691, 11, 5.12], [0.0269, 12, 35.05647142], [0.02689, 12, 212.75016655], [0.02688, 14, 77.89796794], [0.02687, 13, 70.1598365], [0.02686, 12, 7.3613234], [0.02685, 13, 85.37315135], [0.02684, 11, 0.18682799], [0.02683, 13, 38.6210026], [0.02682, 11, 0.13620594], [0.02681, 11, 0.55056785], [0.0268, 12, 72.507], [0.02679, 11, 19.40053639], [0.02678, 12, 182.70556509], [0.02677, 10, 0.12], [0.02704, 6, -0.036], [0.02705, 11, -1.366348], [0.02706, 15, -12.38197914], [0.02707, 15, -37.09897034], [0.02708, 13, -13.14305331], [0.02709, 12, -27.79927846], [0.0271, 12, -22.84846225], [0.02711, 11, -25.06], [0.02712, 13, -60.23560576], [0.02713, 12, -34.309], [0.02714, 10, -0.06], [0.02715, 13, -53.70194438], [0.02716, 13, -69.

In [4]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.ml.linalg import Vectors
import pyspark
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

In [5]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [6]:
spark = SparkSession.builder.getOrCreate()

In [7]:
sc = spark.sparkContext
ssc = StreamingContext(sc, 10)
rdd = ssc.sparkContext.parallelize(bitfinex)

In [8]:
rdd = rdd.map(lambda x: (x['orders'], x['time']))

In [9]:
rdd.take(1)

[([[0.02701, 4, 19.63549352],
   [0.027, 4, 17.11867786],
   [0.02699, 3, 20.36402743],
   [0.02698, 2, 21.7853],
   [0.02697, 4, 43.29962648],
   [0.02696, 2, 7.45266557],
   [0.02695, 2, 4.90705179],
   [0.02694, 1, 5.6],
   [0.02693, 2, 42.37741964],
   [0.02692, 9, 13.4953],
   [0.02691, 11, 5.12],
   [0.0269, 12, 35.05647142],
   [0.02689, 12, 212.75016655],
   [0.02688, 14, 77.89796794],
   [0.02687, 13, 70.1598365],
   [0.02686, 12, 7.3613234],
   [0.02685, 13, 85.37315135],
   [0.02684, 11, 0.18682799],
   [0.02683, 13, 38.6210026],
   [0.02682, 11, 0.13620594],
   [0.02681, 11, 0.55056785],
   [0.0268, 12, 72.507],
   [0.02679, 11, 19.40053639],
   [0.02678, 12, 182.70556509],
   [0.02677, 10, 0.12],
   [0.02704, 6, -0.036],
   [0.02705, 11, -1.366348],
   [0.02706, 15, -12.38197914],
   [0.02707, 15, -37.09897034],
   [0.02708, 13, -13.14305331],
   [0.02709, 12, -27.79927846],
   [0.0271, 12, -22.84846225],
   [0.02711, 11, -25.06],
   [0.02712, 13, -60.23560576],
   [0.0271

In [10]:
def min_max(iterator):

    max_b = []
    min_a = []

    for x in iterator:

        bids = []
        asks = []


        for i in range(25):

            bids.append(x[0][i][0])
            asks.append(x[0][i+25][0])
          
        max_b.append([x[1],max(bids),min(asks)] )

    return max_b

In [11]:
final = rdd.mapPartitions(min_max)

In [12]:
final.take(25)

[[1609195953321, 0.02701, 0.02704],
 [1609195954768, 0.02701, 0.02704],
 [1609195956195, 0.02702, 0.02704],
 [1609195957662, 0.02702, 0.02704],
 [1609195959134, 0.02703, 0.02704],
 [1609195960598, 0.02703, 0.02704],
 [1609195962124, 0.02706, 0.02709],
 [1609195963546, 0.02706, 0.02709],
 [1609195964951, 0.02707, 0.02709],
 [1609195966350, 0.02706, 0.02709],
 [1609195967728, 0.02706, 0.02709],
 [1609195969395, 0.02708, 0.02711],
 [1609195970821, 0.02708, 0.02711],
 [1609195972257, 0.02708, 0.02711],
 [1609195973669, 0.02708, 0.02711],
 [1609195975085, 0.02708, 0.02711],
 [1609195976495, 0.02707, 0.02711],
 [1609195978018, 0.02707, 0.02711],
 [1609195979445, 0.02706, 0.0271],
 [1609195980984, 0.02706, 0.0271],
 [1609195982389, 0.02706, 0.0271],
 [1609195983838, 0.02706, 0.0271],
 [1609195985286, 0.02706, 0.0271],
 [1609195986803, 0.02706, 0.0271],
 [1609195988331, 0.02704, 0.02708]]

In [13]:
binance = np.load('/content/binance_order_array.npy',allow_pickle=True)

In [14]:
binance = ssc.sparkContext.parallelize (
    binance
)

In [15]:
bin_times = (binance.map(lambda x: (str(x['time'])[:-3],0)))\
.leftOuterJoin(
    final.map(lambda x: ((str(x[0])[:-3]), (str(x[1]), str(x[2]))))
    )

In [16]:
bin_times = bin_times.map(lambda x: [x[0],x[1][1]])

In [17]:
def removeNone(iterator):

    sv = 0

    out = []

    for x in iterator:
        if x[1] is None:
            x[1] = sv
        else:
            sv = x[1]

        out.append([x[0],x[1]])
        
    return out 

In [18]:
bitf_order = bin_times.mapPartitions(removeNone)

In [19]:
bitf_order.take(5)

[['1609195964', ('0.02707', '0.02709')],
 ['1609195968', ('0.02707', '0.02709')],
 ['1609195970', ('0.02708', '0.02711')],
 ['1609195987', ('0.02708', '0.02711')],
 ['1609195990', ('0.02708', '0.02711')]]

In [20]:
!pip install influxdb



In [21]:
!sudo apt-get update && sudo apt-get install influxdb

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/comp

In [22]:
!sudo service influxdb start

 * Starting database influxd
   ...done.


In [23]:
from influxdb import InfluxDBClient
client = InfluxDBClient(host='localhost', port=8086)

In [24]:
client.create_database('bitfinex_order')

In [25]:
def influxForm(partitions):
    result = []
    for i in range(np.array(partitions).shape[0] - 1):
      if partitions[i][1]!=0:
        result.append('bitfinex_order date='+partitions[i][0]+','+'bids='+partitions[i][1][0]+','+'asks='+ partitions[i][1][1])
    return result

In [26]:
bitfinex_or = bitf_order.mapPartitions(influxForm)  

In [27]:
bitfinex_or.count()

10045

In [28]:
client.write_points(bitfinex_or.collect(), database='bitfinex_order', time_precision='ms', batch_size=1, protocol='line')

True

In [29]:
client.switch_database('bitfinex_order')
q='select * from bitfinex_order.autogen.bitfinex_order'
bf_order = pd.DataFrame(client.query(q).get_points())

In [30]:
bf_order

Unnamed: 0,time,asks,bids,date
0,2021-01-03T20:06:53.166Z,0.02709,0.02707,1.609196e+09
1,2021-01-03T20:06:53.181Z,0.02709,0.02707,1.609196e+09
2,2021-01-03T20:06:53.189Z,0.02711,0.02708,1.609196e+09
3,2021-01-03T20:06:53.196Z,0.02711,0.02708,1.609196e+09
4,2021-01-03T20:06:53.203Z,0.02711,0.02708,1.609196e+09
...,...,...,...,...
10040,2021-01-03T20:07:42.782Z,0.02731,0.02730,1.609239e+09
10041,2021-01-03T20:07:42.787Z,0.02731,0.02730,1.609239e+09
10042,2021-01-03T20:07:42.792Z,0.02731,0.02730,1.609239e+09
10043,2021-01-03T20:07:42.797Z,0.02731,0.02730,1.609239e+09
