In [1]:
import os

from pyspark.sql import SparkSession, DataFrame

In [2]:
# Initializes the Spark session with the MongoDB connector jars attached.
#
# The jar directory defaults to the original local location, but it can be
# overridden with the SPARK_JARS_PATH environment variable so the notebook is
# not tied to a single machine's filesystem layout.
jar_files_path = os.environ.get("SPARK_JARS_PATH") or "file:///C:/Users/llucp/spark_jars/"
if not jar_files_path.endswith("/"):
    # Jar names are appended by plain string concatenation below, so the
    # directory must end with a separator.
    jar_files_path += "/"

# Connector jar plus the driver/BSON jars passed alongside it in "spark.jars".
jar_files = [
    "mongo-spark-connector_2.12-10.1.1.jar",
    "mongodb-driver-core-4.10.1.jar",
    "mongodb-driver-sync-4.10.1.jar",
    "bson-4.10.1.jar"
]

# MongoDB connection settings shared by the read and write configuration.
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "tfg"

# MongoDB names used by the notebook; FORMATTING_LOB is the collection read by
# load_lob(). NOTE(review): the other two are presumably collection names as
# well — confirm against the cells that use them.
FORMATTING_TRADE = "formatting_trade"
FORMATTING_LOB = "formatting_lob"
EXPLOITATION = "exploitation"

spark = (
    SparkSession.builder
    .appName("FormattedZone")
    # "spark.jars" takes a comma-separated list of jar locations.
    .config("spark.jars", ",".join(jar_files_path + jar for jar in jar_files))
    .config("spark.mongodb.read.connection.uri", MONGO_URI)
    .config("spark.mongodb.write.connection.uri", MONGO_URI)
    .config("spark.mongodb.read.database", DB_NAME)
    .config("spark.mongodb.write.database", DB_NAME)
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Keep Spark's own logging down to errors so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

In [3]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
import seaborn as sns
import pandas as pd
import numpy as np
import re

In [4]:
def load_lob() -> DataFrame:
    """
    Reads the ``FORMATTING_LOB`` collection of the ``DB_NAME`` database
    through the "mongodb" Spark data source and returns it as a Spark
    DataFrame.
    """
    # Configure the reader step by step, then load.
    reader = spark.read.format("mongodb")
    reader = reader.option("database", DB_NAME)
    reader = reader.option("collection", FORMATTING_LOB)
    return reader.load()

In [5]:
# Load the LOB collection as a Spark DataFrame and list the columns it exposes,
# so the selection made in the following cell can be checked against them.
lob = load_lob()
print(lob.columns)

['_id', 'asks', 'bids', 'depth_all', 'depth_imbalance_all', 'liquidity_concentration_all', 'liquidity_spread_all', 'lob_slope_all', 'log_return', 'microprice', 'mid_price', 'midprice_vol', 'ofi_all', 'price_impact_proxy_all', 'spread', 'timestamp', 'variance_proxy']


In [6]:
# Feature columns carried into pandas. Compared with the full column list of
# `lob`, this leaves out `_id`, `timestamp`, `asks`, `bids` and `log_return`.
cols = [
    'depth_all',
    'depth_imbalance_all',
    'liquidity_concentration_all',
    'liquidity_spread_all',
    'lob_slope_all',
    'microprice',
    'mid_price',
    'midprice_vol',
    'ofi_all',
    'price_impact_proxy_all',
    'spread',
    'variance_proxy',
]

# Project the selected columns, drop rows containing nulls on the Spark side,
# and only then collect the result to the driver as a pandas DataFrame.
df = (
    lob.select(cols)
    .dropna()
    .toPandas()
)
df.head()

Unnamed: 0,depth_all,depth_imbalance_all,liquidity_concentration_all,liquidity_spread_all,lob_slope_all,microprice,mid_price,midprice_vol,ofi_all,price_impact_proxy_all,spread,variance_proxy
0,257547100.0,0.180678,1.0,15883.079263,0.153569,109574.589811,109574.55,0.000192,-257209100.0,0.012429,0.1,3.671029e-08
1,257436600.0,0.174917,1.0,15877.665866,27.039736,109545.402314,109545.45,0.000219,146482000.0,0.012434,0.1,4.774919e-08
2,258128000.0,0.180251,1.0,16011.436708,0.471475,109584.074005,109584.05,0.000198,-29303900.0,0.012401,0.1,3.915831e-08
3,255477800.0,0.179928,1.0,15885.511553,7.261002,109619.304513,109619.35,0.000186,256500900.0,0.01253,0.1,3.474445e-08
4,257448700.0,0.179035,1.0,15704.559336,0.156296,109660.494,109660.45,0.000189,-257462500.0,0.012434,0.1,3.558875e-08
