# Spark Set-up

In [1]:
from typing import Callable, List
import numpy as np
import pandas as pd

import pyspark.sql
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Describe the session first, then create it (or reuse one that is already running).
_session_builder = (
    SparkSession.builder
    .appName("pacdb")
    .config("spark.executor.memory", "512M")
    .config("spark.sql.warehouse.dir", ".spark")  # warehouse dir, relative to the working directory
    .enableHiveSupport()
)
spark: SparkSession = _session_builder.getOrCreate()

# Switch on Arrow for conversions between Spark and pandas data.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

import matplotlib as mpl
import matplotlib.pyplot as plt

# Figure typography: with LATEX on, text is typeset through LaTeX in a serif/Times face;
# with it off, matplotlib's own text engine uses Times New Roman and STIX math glyphs.
LATEX = False
mpl.rcParams.update(
    {'text.usetex': True, "font.family": "serif", "font.serif": "Times"}
    if LATEX
    else {'text.usetex': False, "font.family": "Times New Roman", "mathtext.fontset": "stix"}
)

# Export/appearance settings (plt.rcParams and mpl.rcParams are the same object).
mpl.rcParams.update({
    'svg.fonttype': 'none',       # keep text as text in SVG output instead of paths
    'savefig.dpi': 300,
    'axes.titleweight': 'bold',
})

# Have the inline backend render figures as SVG.
import matplotlib_inline.backend_inline  # type: ignore
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

24/08/22 21:56:18 WARN Utils: Your hostname, Chaitanyasumas-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.139 instead (on interface en0)
24/08/22 21:56:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/22 21:56:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Problem Set-up - Dataset, True Query

In [2]:
# Both student-performance files share one layout: header row, ';' separator,
# column types inferred by Spark.
_STUDENT_CSV_OPTIONS = dict(header=True, inferSchema=True, sep=";")

math_df: pyspark.sql.DataFrame = spark.read.csv(
    "data/student_performance/student-mat.csv",
    **_STUDENT_CSV_OPTIONS,
)
math_df.show(5)

# NOTE(review): "portugese" is a misspelling of "portuguese"; the name is kept as-is
# because other cells may refer to it.
portugese_df: pyspark.sql.DataFrame = spark.read.csv(
    "data/student_performance/student-por.csv",
    **_STUDENT_CSV_OPTIONS,
)
portugese_df.show(5)

24/08/22 21:56:24 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home| teacher|course|  mother|         2|        2|       0|      yes|    no|  no|        no|    yes|   yes|      no|      no|     4|       3|    4|   1|   1|     3|       6|  5|  6|  6|
|    GP|  F| 17|      U|    GT3|      T|

### Query 1: 

- **Filter:** students with `absences > 10`
- **Join:** none
- **Group by:** `guardian`
- **Aggregation:** average and maximum of `absences`

In [3]:
# Reference (non-private) answer to Query 1: among students with more than 10 absences,
# the maximum and the average number of absences per guardian.
#
# The aggregates come from the `F` namespace imported at the top of the notebook.
# A bare `from pyspark.sql.functions import max, avg` here would rebind Python's builtin
# `max` for every later cell and scatter imports through the notebook.
result = (
    math_df.filter(F.col("absences") > 10)
           .groupBy("guardian")
           .agg(
               F.max("absences").alias("max_absences"),
               F.avg("absences").alias("avg_absences"),
           )
)
result.show()

+--------+------------+------------------+
|guardian|max_absences|      avg_absences|
+--------+------------+------------------+
|  father|          21|14.818181818181818|
|  mother|          75|19.727272727272727|
|   other|          40|20.181818181818183|
+--------+------------+------------------+



# PAC Set-up

In [4]:
# Parameters for the PAC experiments.
# NOTE(review): apart from `sampling_rate` (passed to `pac_worker.sample_df` further down),
# none of these is used in the cells visible here; the meanings below are assumptions to confirm.

# Powers of two from 1/64 up to 4 — presumably the budgets to sweep over.
budget_list: List[float] = [2.0 ** exponent for exponent in range(-6, 3)]
sampling_rate: float = 0.5
m: int = 10         # presumably a sample/trial count — TODO confirm
c: float = 1e-6     # presumably a small numerical constant — TODO confirm
mi: float = 0.25    # presumably a mutual-information bound — TODO confirm

In [5]:
# The PAC worker and the enums used to describe its filter and aggregation.
from pac_db_re import AggregationType, FilterTypeEnum, PACWorker

# Query parameters handed to the worker below: an average aggregation and a
# "greater than" filter.
agg_type = AggregationType.AVG
filter_type = FilterTypeEnum.GREATER_THAN

def query(df):
    """Aggregate ``absences`` per ``guardian`` for the given Spark DataFrame.

    Returns a DataFrame with one row per guardian holding the average and the
    maximum of ``absences``. This function is handed to ``PACWorker`` as its
    ``query_function``.
    """
    per_guardian = df.groupBy(F.col("guardian"))
    return per_guardian.agg(F.avg("absences"), F.max("absences"))

# Configure a PACWorker with the Query 1 parameters (filter on `absences`, grouped by
# `guardian`, aggregating `absences`) plus the `query` function defined above.
# NOTE(review): `filter_value` is passed as the string '10' while the reference query
# compares against the integer 10 — confirm which type PACWorker expects.
# NOTE(review): `agg_type` is AVG, yet `query` computes both avg and max — confirm how
# PACWorker combines `agg_type` with `query_function`.
pac_worker = PACWorker(
                filter_col='absences',
                filter_value='10',
                filter_type=filter_type,
                group_by_col='guardian',
                agg_type=agg_type,
                agg_col='absences',
                query_function=query
            )

# NOTE(review): this rebinds `result`, which previously held the Query 1 reference answer.
# The recorded run of this cell printed a table in which every aggregate is 0.0, which
# looks suspicious next to the reference values — verify.
result = pac_worker.estimate_noise(math_df)

# NOTE(review): the recorded run of this cell ended with
# "TypeError: _produce_one_sampled_output() missing 1 required positional argument: 'df'".
# Which of the calls in this cell raised it is not visible here — check against pac_db_re.
pac_worker.release_pac_value()

24/08/22 21:56:38 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


+--------+-------------+-------------+
|guardian|avg(absences)|max(absences)|
+--------+-------------+-------------+
|  father|          0.0|          0.0|
|  mother|          0.0|          0.0|
|   other|          0.0|          0.0|
+--------+-------------+-------------+



TypeError: _produce_one_sampled_output() missing 1 required positional argument: 'df'

In [None]:
# Drive the worker step by step: sample with `sampling_rate`, calculate the noise,
# then release the PAC value.
# NOTE(review): this cell has no recorded execution, and what each method does (including
# how many samples `sample_df` draws) is defined in pac_db_re — confirm there.
pac_worker.sample_df(sampling_rate)
pac_worker.calculate_noise()
pac_worker.release_pac_value()