In [67]:
# innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
 
# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
 
# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
 
# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"
 

# install findspark using pip
!pip install -q findspark

In [69]:
!pip install pyspark



In [70]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.ml.linalg import Vectors
import pyspark
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark import SQLContext

In [71]:
import numpy as np
import pandas as pd

In [72]:
coinbase = np.load('/content/coinbase_order_array.npy', allow_pickle=True)
coinbase

array([{'bids': [['0.02706', '0.3941605', 2]], 'asks': [['0.02707', '17.115', 2]], 'sequence': 2517907125, 'time': 1609195957834},
       {'bids': [['0.02706', '0.1191605', 1]], 'asks': [['0.02707', '3.075', 3]], 'sequence': 2517907249, 'time': 1609195959133},
       {'bids': [['0.02707', '2.8', 1]], 'asks': [['0.02708', '24.03321021', 4]], 'sequence': 2517907434, 'time': 1609195960437},
       ...,
       {'bids': [['0.0273', '11.82427217', 3]], 'asks': [['0.02731', '5.87095917', 2]], 'sequence': 2519953418, 'time': 1609239371302},
       {'bids': [['0.02729', '24.01996707', 7]], 'asks': [['0.0273', '5.87095917', 2]], 'sequence': 2519953656, 'time': 1609239373100},
       {'bids': [['0.02729', '29.53056707', 7]], 'asks': [['0.0273', '5.87095917', 2]], 'sequence': 2519953695, 'time': 1609239374755}],
      dtype=object)

In [73]:
spark = SparkSession.builder.getOrCreate()

In [74]:
sc = spark.sparkContext
ssc = StreamingContext(sc, 10)
rdd = ssc.sparkContext.parallelize(coinbase)

In [75]:
rdd = rdd.map(lambda x: (x['time'], x['bids'], x['asks']))

In [76]:
rdd.take(5)

[(1609195957834, [['0.02706', '0.3941605', 2]], [['0.02707', '17.115', 2]]),
 (1609195959133, [['0.02706', '0.1191605', 1]], [['0.02707', '3.075', 3]]),
 (1609195960437, [['0.02707', '2.8', 1]], [['0.02708', '24.03321021', 4]]),
 (1609195962189,
  [['0.02708', '4.84406447', 3]],
  [['0.02709', '22.27755749', 2]]),
 (1609195963505,
  [['0.02708', '4.84406447', 3]],
  [['0.02709', '22.27755749', 2]])]

In [77]:
def min_max(iterator):

   
    asks=[]
    for x in iterator:

      asks.append([x[0],x[1][0][0], x[2][0][0]])

    return asks

In [78]:
final = rdd.mapPartitions(min_max)

In [79]:
final.take(10)

[[1609195957834, '0.02706', '0.02707'],
 [1609195959133, '0.02706', '0.02707'],
 [1609195960437, '0.02707', '0.02708'],
 [1609195962189, '0.02708', '0.02709'],
 [1609195963505, '0.02708', '0.02709'],
 [1609195964789, '0.02709', '0.0271'],
 [1609195966189, '0.02709', '0.02711'],
 [1609195967508, '0.02709', '0.02711'],
 [1609195969246, '0.02709', '0.02711'],
 [1609195970621, '0.0271', '0.02711']]

In [80]:
bin_order = np.load('/content/binance_order_array.npy',allow_pickle=True)

In [81]:
binance_order = ssc.sparkContext.parallelize (
    bin_order
)

In [82]:
merged = (binance_order.map(lambda x: (str(x['time'])[:-3],0)))\
.leftOuterJoin(
    final.map(lambda x: ((str(x[0])[:-3]), (x[1], x[2])))
    )

In [83]:
merged.take(100)

[('1609195964', (0, ('0.02709', '0.0271'))),
 ('1609195968', (0, None)),
 ('1609195970', (0, ('0.0271', '0.02711'))),
 ('1609195987', (0, ('0.02709', '0.0271'))),
 ('1609195990', (0, ('0.02706', '0.02708'))),
 ('1609195992', (0, None)),
 ('1609195993', (0, ('0.02706', '0.02709'))),
 ('1609195995', (0, ('0.02706', '0.02708'))),
 ('1609195996', (0, None)),
 ('1609196000', (0, None)),
 ('1609196001', (0, ('0.02706', '0.02707'))),
 ('1609196017', (0, ('0.02705', '0.02707'))),
 ('1609196020', (0, ('0.02706', '0.02708'))),
 ('1609196021', (0, ('0.02705', '0.02706'))),
 ('1609196023', (0, ('0.02705', '0.02706'))),
 ('1609196032', (0, ('0.02706', '0.02707'))),
 ('1609196037', (0, None)),
 ('1609196046', (0, ('0.02704', '0.02706'))),
 ('1609196061', (0, ('0.02702', '0.02705'))),
 ('1609196069', (0, ('0.02705', '0.02708'))),
 ('1609196071', (0, ('0.02705', '0.02708'))),
 ('1609196077', (0, None)),
 ('1609196082', (0, ('0.02706', '0.02708'))),
 ('1609196085', (0, ('0.02706', '0.02708'))),
 ('1609

In [84]:
merged = merged.map(lambda x: [x[0],x[1][1]])

In [85]:
merged.take(5)

[['1609195964', ('0.02709', '0.0271')],
 ['1609195968', None],
 ['1609195970', ('0.0271', '0.02711')],
 ['1609195987', ('0.02709', '0.0271')],
 ['1609195990', ('0.02706', '0.02708')]]

In [86]:
def arrangeNone(iterator):

    sv = 0

    out = []

    for x in iterator:
        if x[1] is None:
            x[1] = sv
        else:
            sv = x[1]

        out.append([x[0],x[1]])
        
    return out 

In [87]:
cb_order = merged.mapPartitions(arrangeNone)

In [88]:
cb_order.take(5)

[['1609195964', ('0.02709', '0.0271')],
 ['1609195968', ('0.02709', '0.0271')],
 ['1609195970', ('0.0271', '0.02711')],
 ['1609195987', ('0.02709', '0.0271')],
 ['1609195990', ('0.02706', '0.02708')]]

In [89]:
!pip install influxdb



In [90]:
!sudo apt-get update && sudo apt-get install influxdb

Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Fetched 252 kB in 1s (226 kB/s)

In [91]:
!sudo service influxdb start

 * Starting database influxd
   ...done.


In [92]:
from influxdb import InfluxDBClient
client = InfluxDBClient(host='localhost', port=8086)

In [106]:
client.create_database('coinbase_or')

In [94]:
def inf(partitions):
    result = []
    print(np.array(partitions).shape[0] - 1)
    for i in range(np.array(partitions).shape[0] - 1):
      print(i)
      if partitions[i][1]!=0:
        result.append('coinbase_order date='+str(partitions[i][0])+',bids='+str(partitions[i][1][0])+',asks='+ str(partitions[i][1][1]))
    return result

In [95]:
cb_order_last=cb_order.mapPartitions(inf)  

In [101]:
cb_order_last.count()

10042

In [107]:
client.write_points(cb_order_last.collect(), database='coinbase_or', time_precision='ms', batch_size=1, protocol='line')

True

In [108]:
client.switch_database('coinbase_or')

In [112]:
q='select * from coinbase_or.autogen.coinbase_order'
cb_order = pd.DataFrame(client.query(q).get_points())

In [113]:
cb_order

Unnamed: 0,time,asks,bids,date
0,2021-01-03T19:55:02.53Z,0.02710,0.02709,1.609196e+09
1,2021-01-03T19:55:02.554Z,0.02710,0.02709,1.609196e+09
2,2021-01-03T19:55:02.565Z,0.02711,0.02710,1.609196e+09
3,2021-01-03T19:55:02.573Z,0.02710,0.02709,1.609196e+09
4,2021-01-03T19:55:02.587Z,0.02708,0.02706,1.609196e+09
...,...,...,...,...
10037,2021-01-03T19:56:11.914Z,0.02732,0.02731,1.609239e+09
10038,2021-01-03T19:56:11.921Z,0.02732,0.02731,1.609239e+09
10039,2021-01-03T19:56:11.929Z,0.02732,0.02731,1.609239e+09
10040,2021-01-03T19:56:11.936Z,0.02731,0.02730,1.609239e+09
