# 2. Baseline across configurations

Let's import the libraries.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import gzip
import re
from scipy import signal
import numpy as np

import pyspark.sql.functions as F

from utils import start_spark_session, get_s3, list_s3

In [None]:
# Start the Spark session through the helper imported from `utils`; the
# resulting `spark` object is what every later cell uses to read data and
# run SQL.
spark = start_spark_session()

In [None]:
# Bare expression: evaluating the session as the cell's only statement lets
# the notebook display it.
spark

We can now analyze all configurations at once. The only change is an extra `*` wildcard in the S3 path, which matches every configuration directory. Everything else is the same as before.

In [None]:
# Read the gzipped batch logs of every configuration directory as raw text.
# The wildcard after 'configs/' is what widens the read to all configurations.
# Each input line arrives in the single string column 'value'.
base_df = spark.read.text('s3a://enginestream/configs/*/batch_*.log.gz')

# Target columns in field order, paired with the Spark SQL type each field is
# cast to. Field i of the split line becomes column i of logs_df.
# NOTE(review): 'float' is Spark's 32-bit type. If `time` can hold large
# values (e.g. epoch timestamps), the MAX(time) - MIN(time) durations computed
# later may lose precision -- confirm, and consider 'double' if so.
log_columns = [
    ('time', 'float'),
    ('cycle', 'long'),
    ('conf', 'long'),
    ('run', 'long'),
    ('x', 'float'),
    ('y', 'float'),
    ('z', 'float'),
]

# Split every line on ':' or ',' and project the first seven fields into
# typed, named columns. The second select keeps only those seven columns, so
# the intermediate 'cols' array is already gone and needs no explicit drop.
logs_df = (base_df
           .select(F.split('value', '[:,]').alias('cols'))
           .select([F.expr('cols[{}]'.format(i)).cast(dtype).alias(name)
                    for i, (name, dtype) in enumerate(log_columns)]))

# Expose the parsed logs to spark.sql() under the table name "logs".
logs_df.createOrReplaceTempView("logs")

Let's see how many log records we have for each configuration.

In [None]:
# Number of parsed log rows per configuration id, largest count first.
conf_counts_sql = ("SELECT conf, COUNT(1) as cnt "
                   "FROM logs "
                   "GROUP BY conf "
                   "ORDER BY cnt DESC")

# Run the aggregation on the cluster and collect the result to pandas.
pdf = spark.sql(conf_counts_sql).toPandas()
pdf

The configurations have names too. Let's load them from `names.csv`.

In [None]:
# Wildcard import kept as-is: it supplies the schema types used in this cell
# and also FloatType, which a later cell's schema relies on.
from pyspark.sql.types import *

# Explicit schema for names.csv: configuration id plus its readable name.
name_fields = [StructField('conf', LongType()),
               StructField('name', StringType())]
schema = StructType(name_fields)

# header=True: the first line of the CSV is a header row, not data.
names_df = spark.read.csv('names.csv', schema=schema, header=True)

# Make the lookup table available to spark.sql() as "names".
names_df.createOrReplaceTempView("names")

# Also collect it to pandas; the plotting cells look names up in names_pdf.
names_pdf = names_df.toPandas()
names_pdf

The following query will take a couple of minutes. Explore the Spark UI in the meantime. Can you see the DAG? What is Spark doing?

In [None]:
# Row count per configuration, keyed by the readable name instead of the id.
# The WHERE clause joins each log row to `names` on the configuration id.
named_counts_sql = ("SELECT names.name, COUNT(1) as cnt "
                    "FROM logs, names "
                    "WHERE logs.conf = names.conf "
                    "GROUP BY names.name "
                    "ORDER BY cnt DESC")

# Run the join and aggregation on the cluster and collect the result to pandas.
pdf = spark.sql(named_counts_sql).toPandas()
pdf

Let's extract the durations for all of those too, so that we can plot their histograms. This will also take a while (about 4 minutes). Use the terminal to run `top` while it executes. You didn't write any multithreading or multiprocessing code, yet you get concurrency while writing clean code.

In [None]:
# One output row per (conf, run, cycle) group: the span between the smallest
# and largest `time` in that group. `cycle` is grouped on but not selected.
duration_sql = ("SELECT conf, run, MAX(time) - MIN(time) AS duration "
                "FROM logs "
                "GROUP BY conf, run, cycle")

# Mark the result for caching so that further actions on dur_df can reuse it.
dur_df = spark.sql(duration_sql).cache()

# Collect all durations to pandas for plotting and quantiles.
pdf = dur_df.toPandas()
pdf

OK, let's plot those as histograms, one per configuration.

In [None]:
# 9x2 grid of axes on a 15x30 inch figure: one panel per configuration.
f, a = plt.subplots(9, 2, figsize=(15, 30))
a = a.ravel()

# The position of each axes in the flattened grid doubles as the
# configuration id it shows.
for conf, ax in enumerate(a):
    # Title: the name of the first names_pdf row with this id.
    name = names_pdf.loc[names_pdf['conf'] == conf, 'name'].iloc[0]
    # Histogram of all (unfiltered) durations recorded for this configuration.
    conf_pdf = pdf.loc[pdf['conf'] == conf]
    conf_pdf['duration'].hist(ax=ax)
    ax.set_title(name)

We can see that there's noise in some cases: durations greater than a couple of seconds or less than 0.05. Let's filter those outliers out for every configuration, keeping only durations between 0.05 and 100.

In [None]:
# Same 9x2 grid as before, this time restricted to plausible durations.
f, a = plt.subplots(9, 2, figsize=(15, 30))
a = a.ravel()

# Durations outside the open interval (0.05, 100) are treated as noise.
# The mask does not depend on the configuration, so build it once.
in_range = (pdf['duration'] > 0.05) & (pdf['duration'] < 100)

for conf, ax in enumerate(a):
    # Title: the name of the first names_pdf row with this id.
    name = names_pdf.loc[names_pdf['conf'] == conf, 'name'].iloc[0]
    # Rows of this configuration whose duration passes the noise filter.
    conf_pdf = pdf[(pdf['conf'] == conf) & in_range]
    conf_pdf['duration'].hist(ax=ax)
    ax.set_title(name)

We can see that most of them look fine within that range. Let's build a Spark DataFrame with the 0.99 quantile of the duration for each configuration.

In [None]:
# Durations outside the open interval (0.05, 100) are treated as noise.
# This is the same filter as in the plotting cell, applied once up front.
valid_pdf = pdf[(pdf['duration'] > 0.05) & (pdf['duration'] < 100)]

# One [conf, name, q99] row for each configuration id 0..17.
rows = []
for conf in range(18):
    # Name of the first names_pdf row with this id.
    name = names_pdf.loc[names_pdf['conf'] == conf, 'name'].iloc[0]
    conf_pdf = valid_pdf[valid_pdf['conf'] == conf]
    # 0.99 quantile of the filtered durations, as a plain Python float.
    q99 = float(conf_pdf['duration'].quantile(0.99))
    rows.append([conf, name, q99])

# Explicit schema so the Spark column types are fixed rather than inferred.
quantile_fields = [StructField('conf', LongType()),
                   StructField('name', StringType()),
                   StructField('q99', FloatType())]
schema = StructType(quantile_fields)

q_df = spark.createDataFrame(rows, schema)
q_df.toPandas()

In [None]:
# Persist the per-configuration quantiles as Parquet. coalesce(1) reduces the
# data to one partition so the output holds a single part file.
# mode('overwrite') makes this cell safe to re-run: with the default save
# mode the write raises if 'q99.parquet' already exists from an earlier run.
q_df.coalesce(1).write.mode('overwrite').parquet('q99.parquet')

Parquet files are easy to read, are column-based (fast for many queries), and include their own schema.

In [None]:
# Read the Parquet output back. No schema is passed to the reader here, so
# the column names and types printed below come from the files themselves.
q_load_df = spark.read.parquet('q99.parquet')
q_load_df.printSchema()