# Spark Set-up

In [1]:
from typing import Callable, List
import numpy as np
import pandas as pd

import pyspark.sql
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Assemble the session builder step by step, then create (or reuse) the
# Hive-enabled session named "pacdb".
_spark_builder = SparkSession.builder.appName("pacdb")
_spark_builder = _spark_builder.config("spark.executor.memory", "512M")
_spark_builder = _spark_builder.config("spark.sql.warehouse.dir", ".spark")
spark: SparkSession = _spark_builder.enableHiveSupport().getOrCreate()

# Runtime SQL setting: turn on the Arrow path for PySpark conversions.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

import matplotlib as mpl
import matplotlib.pyplot as plt

# Font set-up: LaTeX-rendered serif "Times" when LATEX is on, otherwise
# "Times New Roman" with the "stix" mathtext font set.
LATEX = False
if LATEX:
    mpl.rcParams.update({
        "text.usetex": True,
        "font.family": "serif",
        "font.serif": "Times",
    })
else:
    mpl.rcParams.update({
        "text.usetex": False,
        "font.family": "Times New Roman",
        "mathtext.fontset": "stix",
    })

# SVG font handling and the resolution used when saving figures.
mpl.rcParams.update({"svg.fonttype": "none", "savefig.dpi": 300})

# Ask the inline backend to emit figures as SVG.
import matplotlib_inline.backend_inline  # type: ignore
matplotlib_inline.backend_inline.set_matplotlib_formats("svg")

# Applied after the inline-format call, as in the original ordering.
mpl.rcParams.update({"axes.titleweight": "bold"})

24/08/23 16:04:52 WARN Utils: Your hostname, Chaitanyasumas-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.139 instead (on interface en0)
24/08/23 16:04:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/23 16:04:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Problem Set-up - Dataset, True Query

In [2]:
# Read the two student-performance CSV files. Both are ';'-separated with a
# header row, and Spark is asked to infer the column types. The first five
# rows of each frame are shown as a quick sanity check.
math_df: pyspark.sql.DataFrame = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("data/student_performance/student-mat.csv")
)
math_df.show(5)

portugese_df: pyspark.sql.DataFrame = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("data/student_performance/student-por.csv")
)
portugese_df.show(5)

24/08/23 16:04:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home| teacher|course|  mother|         2|        2|       0|      yes|    no|  no|        no|    yes|   yes|      no|      no|     4|       3|    4|   1|   1|     3|       6|  5|  6|  6|
|    GP|  F| 17|      U|    GT3|      T|

### Query 1: 

Filter: for students with absences > 20
Join: None
Group By: Guardian
Agg: Avg, Max absences

In [3]:
# True (non-private) Query 1 on the maths dataset: keep rows with
# absences > 20, group by guardian, and report the maximum and mean absences.
#
# The aggregates are reached through the `F` alias imported at the top of the
# notebook. A bare `from pyspark.sql.functions import max, avg` here would
# rebind Python's built-in `max` for every later cell.
result = (
    math_df.filter(F.col("absences") > 20)
           .groupBy("guardian")
           .agg(
               F.max("absences").alias("max_absences"),
               F.avg("absences").alias("avg_absences"),
           )
)
result.show()

+--------+------------+------------------+
|guardian|max_absences|      avg_absences|
+--------+------------+------------------+
|  father|          21|              21.0|
|  mother|          75|              35.0|
|   other|          40|33.333333333333336|
+--------+------------+------------------+



# PAC Set-up

In [4]:
# PAC experiment parameters.
# NOTE(review): none of these names are referenced by the cells visible in
# this notebook; confirm their intended meaning before relying on them.
budget_list: List[float] = [2.0 ** exponent for exponent in range(-6, 3)]  # 1/64, 1/32, ..., 2, 4
sampling_rate: float = 1 / 2
m: int = 10
c: float = 1e-6
mi: float = 0.25

In [5]:
# initialise the worker and set all/any 4 query parameters
from pac_db_re import PACWorker
from pac_db_re import AggregationType
from pac_db_re import FilterTypeEnum
import time
import datetime

# Comparison operator handed to the PAC worker for its row filter.
filter_type = FilterTypeEnum.GREATER_THAN


def query(df):
    """Return the per-guardian mean and maximum of ``absences`` for ``df``.

    Groups ``df`` by the ``guardian`` column and aggregates ``absences``
    with ``avg`` and ``max``; the result columns keep Spark's default names.
    """
    per_guardian = df.groupBy(F.col("guardian"))
    mean_absences = F.avg("absences")
    peak_absences = F.max("absences")
    return per_guardian.agg(mean_absences, peak_absences)


# Worker configured with the filter (column, value, operator), the grouping
# column, and the aggregation function defined above.
# NOTE(review): filter_value is passed as the string '20' while later cells
# pass threshold_value=20 as an int — confirm which type PACWorker expects.
pac_worker = PACWorker(
    filter_col='absences',
    filter_value='20',
    filter_type=filter_type,
    group_by_col='guardian',
    query_function=query,
)



In [6]:
# Time the two PAC stages separately with wall-clock timestamps; the
# differences are computed and printed in the next cell.
# NOTE: `result` is rebound here — it previously held the Query 1 DataFrame.
start_time = datetime.datetime.now()
# Stage 1: noise estimation on the maths dataset with the v1 flag set.
# The call returns a pair, unpacked into `result` and `groups_list`.
result, groups_list = pac_worker.estimate_noise(math_df, v1=True)
intermediate_time = datetime.datetime.now()

# Stage 2: release, fed with the estimate and group list from stage 1.
# NOTE(review): threshold_value=20 presumably mirrors the worker's
# filter_value='20' — confirm against PACWorker.
pac_worker.release_pac_value(math_df, noise=result, groups_list=groups_list, threshold_value=20)
end_time = datetime.datetime.now()

  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


+--------+-------------+-------------+
|guardian|avg(absences)|max(absences)|
+--------+-------------+-------------+
|  father|          0.0|          0.0|
|  mother|          0.0|          0.0|
|   other|          0.0|          0.0|
+--------+-------------+-------------+



  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
24/08/23 16:05:07 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

Sample: [0. 6. 2.] + Noise = Noised: [-166.06335111349205, -5.336288302651644, 82.84371124388684]


  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)
24/08/23 16:16:55 WARN AttachDistributedSequenceExec: clean up cached RDD(78861) in AttachDistributedSequenceExec(847067)


+--------+-------------------+
|guardian|              count|
+--------+-------------------+
|  father|-166.06335111349205|
|  mother| -5.336288302651644|
|   other|  82.84371124388684|
+--------+-------------------+

Sample: [ 0.          0.         34.83333333 56.         30.         38.        ] + Noise = Noised: [53.96623604976048, -247.60871752304345, 59.99983813691367, 393.61858155490916, -72.2916119334684, 151.97055257122628]


  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


+--------+-----------------+-------------------+
|guardian|    avg(absences)|      max(absences)|
+--------+-----------------+-------------------+
|  father|53.96623604976048|-247.60871752304345|
|  mother|59.99983813691367| 393.61858155490916|
|   other|-72.2916119334684| 151.97055257122628|
+--------+-----------------+-------------------+

+--------+-----------------+-------------------+
|guardian|    avg(absences)|      max(absences)|
+--------+-----------------+-------------------+
|  father|53.96623604976048|-247.60871752304345|
|  mother|              0.0|                0.0|
|   other|-72.2916119334684| 151.97055257122628|
+--------+-----------------+-------------------+



24/08/23 16:16:56 WARN AttachDistributedSequenceExec: clean up cached RDD(78953) in AttachDistributedSequenceExec(848035)
24/08/23 16:16:56 WARN AttachDistributedSequenceExec: clean up cached RDD(78973) in AttachDistributedSequenceExec(848344)


In [7]:
# Durations of the two stages of the v1 run: estimation first, then release.
time_to_estimate_noise = intermediate_time - start_time
time_to_release_pac_value = end_time - intermediate_time

# One value per line, estimation time first.
print(time_to_estimate_noise, time_to_release_pac_value, sep="\n")

0:11:55.878801
0:00:01.800918


In [8]:
# Same two-stage timing as the v1 run, but with the v2 flag set on the
# estimator. Trailing-underscore names keep the v1 results available.
start_time_ = datetime.datetime.now()
# Stage 1: noise estimation with v2=True; returns a pair, unpacked below.
result_, groups_list_ = pac_worker.estimate_noise(math_df, v2=True)
intermediate_time_ = datetime.datetime.now()

# Stage 2: release, fed with the v2 estimate and group list.
# NOTE(review): threshold_value=20 presumably mirrors the worker's
# filter_value='20' — confirm against PACWorker.
pac_worker.release_pac_value(math_df, noise=result_, groups_list=groups_list_, threshold_value=20)
end_time_ = datetime.datetime.now()

  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


+--------+-------------+-------------+
|guardian|avg(absences)|max(absences)|
+--------+-------------+-------------+
|  father|          0.0|          0.0|
|  mother|          0.0|          0.0|
|   other|          0.0|          0.0|
+--------+-------------+-------------+



  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
                                                                                

Sample: [0. 4. 2.] + Noise = Noised: [-161.49424534268823, 10.341500735851202, 203.11546120594923]


  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)
24/08/23 16:49:58 WARN AttachDistributedSequenceExec: clean up cached RDD(229904) in AttachDistributedSequenceExec(2501084)


+--------+-------------------+
|guardian|              count|
+--------+-------------------+
|  father|-161.49424534268823|
|  mother| 10.341500735851202|
|   other| 203.11546120594923|
+--------+-------------------+

Sample: [ 0.    0.   39.25 56.   31.   40.  ] + Noise = Noised: [-73.0973684428683, -153.7521251574019, 41.12384837094093, 133.60927974921583, -75.35582685223348, -137.90544191488195]


  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


+--------+------------------+-------------------+
|guardian|     avg(absences)|      max(absences)|
+--------+------------------+-------------------+
|  father| -73.0973684428683| -153.7521251574019|
|  mother| 41.12384837094093| 133.60927974921583|
|   other|-75.35582685223348|-137.90544191488195|
+--------+------------------+-------------------+

+--------+------------------+-------------------+
|guardian|     avg(absences)|      max(absences)|
+--------+------------------+-------------------+
|  father| -73.0973684428683| -153.7521251574019|
|  mother|               0.0|                0.0|
|   other|-75.35582685223348|-137.90544191488195|
+--------+------------------+-------------------+



24/08/23 16:49:59 WARN AttachDistributedSequenceExec: clean up cached RDD(229996) in AttachDistributedSequenceExec(2502052)
24/08/23 16:49:59 WARN AttachDistributedSequenceExec: clean up cached RDD(230016) in AttachDistributedSequenceExec(2502361)


In [9]:
# Durations of the two stages of the v2 run: estimation first, then release.
time_to_estimate_noise_ = intermediate_time_ - start_time_
time_to_release_pac_value_ = end_time_ - intermediate_time_

# One value per line, estimation time first.
print(time_to_estimate_noise_, time_to_release_pac_value_, sep="\n")

0:20:58.300983
0:00:01.718860
