In [1]:
from typing import Callable, List
import numpy as np
import pandas as pd

import pyspark.sql
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's SparkSession: application name "pacdb",
# 512M of executor memory, warehouse directory ".spark", Hive support enabled.
spark: SparkSession = (SparkSession.builder.appName("pacdb")
         .config("spark.executor.memory", "512M")
         .config("spark.sql.warehouse.dir", ".spark")
         .enableHiveSupport()
         .getOrCreate())

# Switch on the Arrow setting for PySpark; the recorded warnings further down
# show it falling back to the non-Arrow path on this machine.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

import matplotlib as mpl
import matplotlib.pyplot as plt

# Typography: LaTeX text rendering with the Times serif face when LATEX is on,
# otherwise Times New Roman with the STIX math font set.
LATEX = False
mpl.rcParams.update(
    {
        'text.usetex': True,
        "font.family": "serif",
        "font.serif": "Times",
    }
    if LATEX
    else {
        'text.usetex': False,
        "font.family": "Times New Roman",
        "mathtext.fontset": "stix",
    }
)

# SVG font handling and the resolution used by savefig.
mpl.rcParams.update({'svg.fonttype': 'none', 'savefig.dpi': 300})

# Inline figures are emitted as SVG.
import matplotlib_inline.backend_inline  # type: ignore
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

mpl.rcParams.update({'axes.titleweight': 'bold'})

24/08/09 00:14:10 WARN Utils: Your hostname, Chaitanyasumas-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.0.0.105 instead (on interface en0)
24/08/09 00:14:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/09 00:14:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the student-performance (math) CSV: first row is the header, fields are
# separated by ";" and column types are inferred by Spark.
math_df: pyspark.sql.DataFrame = spark.read.csv("./data/student_performance/student-mat.csv", header=True, inferSchema=True, sep=";")

In [3]:
# Preview the first five rows to check that the schema was parsed as expected.
math_df.show(5)

24/08/09 00:14:15 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home| teacher|course|  mother|         2|        2|       0|      yes|    no|  no|        no|    yes|   yes|      no|      no|     4|       3|    4|   1|   1|     3|       6|  5|  6|  6|
|    GP|  F| 17|      U|    GT3|      T|

In [4]:
from pacdb import PACDataFrame, PACOptions, SamplerOptions



In [5]:
# Source frame handed to PACDataFrame.fromDataFrame in the cells below.
df = math_df

query_name: str = "count"  # not referenced elsewhere in the visible notebook
# Same values as the MI lists iterated in the sweep cells further down.
budget_list: List[float] = [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1., 2., 4.]
sample_size: int = 3  # not referenced elsewhere in the visible notebook
sampling_rate: float = 0.5  # passed as SamplerOptions(fraction=...)
m: int = 10  # passed as PACOptions(trials=...)
c: float = 1e-6  # passed as PACOptions(c=...)
mi: float = 1./4  # passed as PACOptions(max_mi=...); reassigned by several later cells

In [6]:
# Bare grouping expression: no aggregation is applied, the cell only displays
# the GroupedData repr.
math_df.groupBy("guardian")

GroupedData[grouping expressions: [guardian], value: [school: string, sex: string ... 31 more fields], type: GroupBy]

In [7]:
# Keep only students with more than 10 absences and display them.
intm_math_df = math_df.filter(F.col("absences") > 10)
intm_math_df.show()

+------+---+---+-------+-------+-------+----+----+--------+--------+----------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|    Mjob|    Fjob|    reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+--------+--------+----------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  M| 17|      U|    GT3|      T|   3|   2|services|services|    course|  mother|         1|        1|       3|       no|   yes|  no|       yes|    yes|   yes|     yes|      no|     5|       5|    5|   2|   4|     5|      16|  6|  5|  5|
|    GP|  F| 16|    

In [8]:
# Number of students with more than 10 absences.
intm_math_df.count()

66

In [9]:
# Exact (non-private) number of students per guardian type.
grouped_count_df = math_df.groupBy("guardian").count()
grouped_count_df.show()

+--------+-----+
|guardian|count|
+--------+-----+
|  father|   90|
|  mother|  273|
|   other|   32|
+--------+-----+



24/08/09 00:14:24 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Threshold Testing

#### A. Without Threshold

#### Single Aggregation - Avg

#### True Values

In [42]:
# Exact per-guardian mean of `absences`, restricted to students with more than
# 10 absences. The notebook-wide `F` alias is used instead of
# `from pyspark.sql.functions import max, avg`, which would shadow Python's
# builtin `max` for every cell executed afterwards.
result = (
    math_df.filter(F.col("absences") > 10)
           .groupBy("guardian")
           .agg(F.avg("absences").alias("avg_absences"))
)
result.show()

+--------+------------------+
|guardian|      avg_absences|
+--------+------------------+
|  father|14.818181818181818|
|  mother|19.727272727272727|
|   other|20.181818181818183|
+--------+------------------+



In [46]:
# Exact per-guardian mean of `absences` over all students. The notebook-wide
# `F` alias is used instead of `from pyspark.sql.functions import max, avg`,
# which would shadow Python's builtin `max` for every cell executed afterwards.
result = (
    math_df
           .groupBy("guardian")
           .agg(F.avg("absences").alias("avg_absences"))
)
result.show()

+--------+-----------------+
|guardian|     avg_absences|
+--------+-----------------+
|  father|3.977777777777778|
|  mother|5.835164835164835|
|   other|              9.5|
+--------+-----------------+



#### PAC Values

In [47]:
def query(df):
    """Per-guardian mean of ``absences``; this is the query released below."""
    return df.groupBy(F.col("guardian")).agg(F.avg("absences"))

# Overrides the global `mi` from the configuration cell (also for later cells).
mi = 1
# Wrap the source frame with the PAC options (trials, max_mi, c), the sampling
# fraction and the query. The lambda resolves `query` when it is called, so it
# picks up whichever `query` is defined at that moment.
pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))

# The recorded output shows the sampled values, the noised values and the
# resulting table being printed during this call.
noise = pac_df.releaseValue()

Found output format of query: 


  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


Hybrid Noise: Using the identity matrix as the projection matrix.



0it [17:20, ?it/s]
0it [00:10, ?it/s]
  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


Sample: [3.63043478 6.12413793 5.47058824] + Noise = Noised: [3.427957486106539, 5.506936399576095, 6.8172503379030855]
Inserting to dataframe:




+--------+------------------+
|guardian|     avg(absences)|
+--------+------------------+
|  father| 3.427957486106539|
|  mother| 5.506936399576095|
|   other|6.8172503379030855|
+--------+------------------+



#### Multiple Aggregation 

#### True Values

In [7]:
# Exact per-guardian maximum and mean of `absences`. The notebook-wide `F`
# alias is used instead of `from pyspark.sql.functions import max, avg`, which
# would shadow Python's builtin `max` for every cell executed afterwards.
result = math_df.groupBy("guardian").agg(F.max("absences").alias("max_absences"), F.avg("absences").alias("avg_absences"))
result.show()

+--------+------------+-----------------+
|guardian|max_absences|     avg_absences|
+--------+------------+-----------------+
|  father|          21|3.977777777777778|
|  mother|          75|5.835164835164835|
|   other|          40|              9.5|
+--------+------------+-----------------+



#### PAC Values

In [8]:
def query(df):
    """Per-guardian mean and maximum of ``absences`` (two aggregates per group)."""
    return df.groupBy(F.col("guardian")).agg(F.avg("absences"), F.max("absences"))

# Overrides the global `mi` from the configuration cell (also for later cells).
mi = 1
# Same PAC setup as the single-aggregation case, now releasing two aggregates.
pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))

# In the recorded output the noised vector lists the three averages first,
# followed by the three maxima.
noise = pac_df.releaseValue()

Found output format of query: 


  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


Hybrid Noise: Using the identity matrix as the projection matrix.


0it [01:16, ?it/s]
  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


Sample: [ 3.88888889  4.796875    7.47368421 21.         30.         20.        ] + Noise = Noised: [11.234769083145151, 11.314013701544233, -11.833881064202387, 81.89639744954465, 10.784551298126619, 165.59680231801315]
Inserting to dataframe:




+--------+-------------------+------------------+
|guardian|      avg(absences)|     max(absences)|
+--------+-------------------+------------------+
|  father| 11.234769083145151| 81.89639744954465|
|  mother| 11.314013701544233|10.784551298126619|
|   other|-11.833881064202387|165.59680231801315|
+--------+-------------------+------------------+



#### B. With Threshold

#### Single Aggregation - Avg

In [10]:
def query(df):
    """Per-guardian mean of ``absences``; this is the query released below."""
    return df.groupBy(F.col("guardian")).agg(F.avg("absences"))

# Overrides the global `mi` from the configuration cell (also for later cells).
mi = 1
pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))


# Tell the PAC frame which column is grouped on and which aggregate is used;
# presumably needed by the threshold path - TODO confirm against pacdb.
pac_df.setGroupBy('guardian')
pac_df.setAgg('avg')

# NOTE(review): the recorded run of this cell ends in
# "TypeError: applyGroupBy() takes 2 positional arguments but 3 were given",
# presumably raised inside releaseValue - confirm and fix in pacdb.
noise = pac_df.releaseValue(threshold=True, threshold_value=91)

  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


Hybrid Noise: Using the identity matrix as the projection matrix.


0it [00:10, ?it/s]


DataFrame[school: string, sex: string, age: int, address: string, famsize: string, Pstatus: string, Medu: int, Fedu: int, Mjob: string, Fjob: string, reason: string, guardian: string, traveltime: int, studytime: int, failures: int, schoolsup: string, famsup: string, paid: string, activities: string, nursery: string, higher: string, internet: string, romantic: string, famrel: int, freetime: int, goout: int, Dalc: int, Walc: int, health: int, absences: int, G1: int, G2: int, G3: int]
guardian


TypeError: applyGroupBy() takes 2 positional arguments but 3 were given

#### Multiple Aggregation 

In [None]:
def query(df):
    """Per-guardian mean and maximum of ``absences`` (two aggregates per group)."""
    return df.groupBy(F.col("guardian")).agg(F.avg("absences"), F.max("absences"))

# Overrides the global `mi` from the configuration cell (also for later cells).
mi = 1
pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))

# Threshold release without an explicit threshold_value and without the
# setGroupBy/setAgg calls used in the single-aggregation threshold cell.
# NOTE(review): this cell has no recorded execution - verify that it runs.
noise = pac_df.releaseValue(threshold=True)

### Threshold Testing

In [6]:
# Release the current `query` once per MI budget. The budgets come from
# `budget_list` in the configuration cell instead of a repeated literal list.
# The loop variable is the global `mi`, so after the loop `mi` holds the last
# budget; later cells that pass `max_mi = mi` without setting it see that value.
for mi in budget_list:
    pac_df = (PACDataFrame.fromDataFrame(df)
                        .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                        .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                        .withQuery(lambda x: query(x)))

    noise = pac_df.releaseValue()
    print(f"For MI = {mi}, Noise is: {noise}")

24/07/11 23:46:17 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


Found output format of query: 


  sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


+--------+-----------------+-------------+
|guardian|    avg(absences)|max(absences)|
+--------+-----------------+-------------+
|  father|4.386363636363637|           21|
|  mother|6.977777777777778|           75|
|   other|7.733333333333333|           40|
+--------+-----------------+-------------+

chai_debug: The identity matrix dimensions are : 2
max_mi: 0.015625, eta: 0.5, dimensions: 2
Hybrid Noise: Using the identity matrix as the projection matrix.


0it [00:00, ?it/s]ERROR:tornado.general:SEND Error: Host unreachable


AssertionError: 

## Sample Release

In [None]:
def query(df):
    """Number of students with at least 5 absences (a single global count)."""
    return df.filter(df["absences"] >= 5).agg(F.count("*"))

# NOTE(review): `mi` is not set in this cell; it is whatever an earlier cell
# (an assignment or a sweep loop) left in the global namespace.
pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))

pac_df.releaseValue()

In [None]:
def query(df):
    """Number of students per guardian type."""
    return df.groupBy(F.col("guardian")).agg(F.count("*"))

# NOTE(review): `mi` is not set in this cell; it is whatever an earlier cell
# (an assignment or a sweep loop) left in the global namespace.
pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))

pac_df.releaseValue()

## Hybrid Anisotropic Experimentation

In [None]:
# Release the current `query` once per MI budget and collect the results in
# `all_noise` (one entry per budget, in `budget_list` order). The budgets come
# from the configuration cell instead of a repeated literal list.
# The loop variable is the global `mi`, so it holds the last budget afterwards.
all_noise = []
for mi in budget_list:
    pac_df = (PACDataFrame.fromDataFrame(df)
                        .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                        .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                        .withQuery(lambda x: query(x)))

    noise = pac_df.releaseValue()
    all_noise.append(noise)
    print(f"For MI = {mi}, Noise is: {noise}")

In [None]:
# Second sweep over the MI budgets, now driven by `budget_list` from the
# configuration cell instead of a repeated literal list.
# NOTE(review): this cell resets and refills `all_noise`, discarding the
# results of the previous sweep; the next sweep stores into `all_noise_3`, so
# a separate name may have been intended here - confirm before renaming, since
# the plotting cell reads `all_noise`.
all_noise = []
for mi in budget_list:
    pac_df = (PACDataFrame.fromDataFrame(df)
                        .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                        .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                        .withQuery(lambda x: query(x)))

    noise = pac_df.releaseValue()
    all_noise.append(noise)
    print(f"For MI = {mi}, Noise is: {noise}")

In [None]:
# Third sweep over the MI budgets; results are kept separately in
# `all_noise_3`. The budgets come from `budget_list` in the configuration cell
# instead of a repeated literal list.
# The loop variable is the global `mi`, so it holds the last budget afterwards.
all_noise_3 = []
for mi in budget_list:
    pac_df = (PACDataFrame.fromDataFrame(df)
                        .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                        .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                        .withQuery(lambda x: query(x)))

    noise = pac_df.releaseValue()
    all_noise_3.append(noise)
    print(f"For MI = {mi}, Noise is: {noise}")

In [None]:
import matplotlib.pyplot as plt

# MI budgets on the x-axis.
x_values = [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1, 2, 4]

etas = [0.025, 0.05, 0.1]

# Define categories
categories = {
    'Category 1': 'Avg Father',
    'Category 2': 'Avg Mother',
    'Category 3': 'Avg Other',
    'Category 4': 'Max Father',
    'Category 5': 'Max Mother',
    'Category 6': 'Max Other'
}

# Palette: lines are coloured by eta (first len(etas) entries); the category
# annotations cycle through the whole palette.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

plt.figure(figsize=(10, 6))

# NOTE(review): `y_values` is not defined before this cell. The indexing below
# expects a mapping from each category key to one series per eta, each series
# aligned with `x_values` - confirm where it is supposed to be built.
for i, (category, label) in enumerate(categories.items()):
    for j, eta in enumerate(etas):
        color = colors[j]  # Select color for current eta
        # Label each eta only once so the legend has one entry per eta instead
        # of one per (category, eta) pair.
        plt.plot(x_values, y_values[category][j], label=f'eta={eta}' if i == 0 else None, color=color)

    # Add category annotation above the first line in the category.
    # The colour index wraps around the palette: the previous
    # `colors[i*len(etas)]` ran past the 6-entry list from the third category on.
    plt.text(x_values[0], y_values[category][0][0], label, fontsize=10, color=colors[i % len(colors)], weight='bold', va='bottom', ha='left')

plt.legend()

plt.xlabel('X values (log scale)')
plt.ylabel('Y values')
plt.title('Y values for different categories and eta values')
plt.xscale('log')
plt.grid(True)
plt.show()



In [7]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.lines import Line2D

def plot_for_max_mi(y_values, fig_number, x_values = [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1., 2., 4.]):
    """
    Plot one noise curve per column of ``y_values`` against the MI budgets and save the figure.

    Args:
        y_values: 2-D array indexed as ``y_values[:, i]``; one row per entry of
            ``x_values`` and one column per plotted series.
        fig_number: Suffix of the output file ``figs/test_hybrid_{fig_number}.png``.
        x_values: MI budgets for the x-axis (log scale, base 2). The default
            list is only read, never mutated.

    The figure is written to the ``figs`` directory (created if missing) and
    then shown.
    """
    import os  # local import so the function does not rely on another cell

    colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown']
    color_labels = ['Avg - Father', 'Avg - Mother', 'Avg - Other', 'Max - Father', 'Max - Mother', 'Max - Other']

    fig, ax = plt.subplots(figsize=(10, 6))

    for i in range(y_values.shape[1]):
        # Fall back to a generic label and wrap the palette if there are more
        # series than predefined names/colours, instead of raising IndexError.
        series_label = color_labels[i] if i < len(color_labels) else f'Series {i}'
        # `colors` used to be defined but never passed to plot().
        ax.plot(x_values, y_values[:, i], label=series_label, color=colors[i % len(colors)])

    ax.set_xlabel('Mutual Information (Log Scale)')
    ax.set_ylabel('Noise Estimate')
    ax.set_title('Hybrid Anisotropic')
    ax.set_xscale('log', base=2)
    ax.set_xticks(x_values)
    # Budgets below 1 are shown as fractions (1/64, 1/32, ...), others as-is.
    ax.set_xticklabels([f'MI = 1/{int(1/x_val)}' if x_val < 1 else f'MI = {x_val}' for x_val in x_values])
    ax.legend()

    ax.grid(True)
    # savefig does not create missing directories.
    os.makedirs("figs", exist_ok=True)
    fig.savefig(f"figs/test_hybrid_{fig_number}.png")

    plt.show()


In [None]:
# Stack the sweep results into a 2-D array: one row per release in
# `all_noise`, taking entries 0..5 of each release as the columns.
# assumes every release is indexable by 0..5 (3 guardians x avg/max) - TODO confirm
y_values = np.array([[y_dict[i] for i in range(6)] for y_dict in all_noise])
plot_for_max_mi(y_values, fig_number=0)

# Hybrid Anisotropic with Latest Changes by Mayuri

In [None]:
def query(df):
    """Per-guardian mean and maximum of ``absences`` (two aggregates per group)."""
    return df.groupBy(F.col("guardian")).agg(F.avg("absences"), F.max("absences"))

# Release the query once per MI budget and collect the results in
# `all_noise_new` (one entry per budget, in `budget_list` order). The budgets
# come from the configuration cell instead of a repeated literal list.
# The loop variable is the global `mi`, so it holds the last budget afterwards.
all_noise_new = []
for mi in budget_list:
    pac_df = (PACDataFrame.fromDataFrame(df)
                        .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                        .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                        .withQuery(lambda x: query(x)))

    noise = pac_df.releaseValue()
    all_noise_new.append(noise)
    print(f"For MI = {mi}, Noise is: {noise}")

In [None]:
# Stack the sweep results into a 2-D array: one row per release in
# `all_noise_new`, taking entries 0..5 of each release as the columns.
# assumes every release is indexable by 0..5 (3 guardians x avg/max) - TODO confirm
y_values_new = np.array([[y_dict[i] for i in range(6)] for y_dict in all_noise_new])
print(y_values_new)
# Saved as figs/test_hybrid_2.png by plot_for_max_mi.
plot_for_max_mi(y_values_new, fig_number=2)

## Full Anisotropy

Conversion between Spark DataFrames and NumPy vectors.

In [25]:
# Exact per-guardian maximum and mean of `absences`, shown as the reference
# for the conversion helpers below. The notebook-wide `F` alias is used instead
# of `from pyspark.sql.functions import max, avg`, which would shadow Python's
# builtin `max` for every cell executed afterwards.
result = math_df.groupBy("guardian").agg(F.max("absences").alias("max_absences"), F.avg("absences").alias("avg_absences"))
result.show()

+--------+------------+-----------------+
|guardian|max_absences|     avg_absences|
+--------+------------+-----------------+
|  father|          21|3.977777777777778|
|  mother|          75|5.835164835164835|
|   other|          40|              9.5|
+--------+------------+-----------------+



In [11]:
from typing import Optional
import pyspark.sql.dataframe
import pyspark.sql.types as T
import pyspark.pandas as ps

df = math_df

def _unwrapDataFrame(df: pyspark.sql.DataFrame) -> np.ndarray:
    """
    Flatten the numeric columns of a PySpark DataFrame into a 1-D numpy vector.

    Columns whose type is not a ``NumericType`` are dropped. The collected rows
    are flattened in column-major (``order="F"``) order, so all values of the
    first numeric column come first, then all values of the second, and so on.
    """
    numeric_names: List[str] = []
    for field in df.schema.fields:
        if isinstance(field.dataType, T.NumericType):
            numeric_names.append(field.name)

    # collect() brings every selected row back to the driver.
    rows = df.select(*numeric_names).collect()
    return np.array(rows).flatten(order="F")

def _updateDataFrame(vec: np.ndarray, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    """
    Write the values of a flat numpy vector back into the numeric columns of a PySpark DataFrame.

    ``vec`` is read in column-major (``order="F"``) order and reshaped to
    (rows, numeric columns) of ``df``; those columns are then overwritten while
    the non-numeric columns are carried over from ``df``.

    Args:
        vec: Flat vector whose size must equal rows * numeric columns of ``df``
            (otherwise ``reshape`` raises).
        df: DataFrame supplying the schema, the row count and the non-numeric columns.

    Returns:
        A new PySpark DataFrame created from the updated pandas-on-Spark frame.
    """

    numeric_columns: List[str] = [f.name for f in df.schema.fields if isinstance(f.dataType, T.NumericType)]
    df_numeric: pyspark.sql.DataFrame = df.select(*numeric_columns)  # select only numeric columns
    # NOTE(review): the rows are collected here only to learn the target shape.
    shape = np.array(df_numeric.collect()).shape

    np_array = vec.reshape(shape, order="F")
    new_pandas: ps.DataFrame = ps.DataFrame(np_array, columns=numeric_columns)
    # NOTE(review): looks like leftover debug output - confirm before removing.
    print(new_pandas)

    old_pandas = df.pandas_api()
    # update() matches rows by index; presumably both frames carry the same
    # default index in the same row order - TODO confirm for multi-partition data.
    # NOTE(review): in the recorded output max(absences) comes back as 21.0
    # instead of 21, i.e. integer columns appear to return as floats.
    old_pandas.update(new_pandas)

    return old_pandas.to_spark()


# Flatten three query results of increasing shape: a single count, a
# per-guardian count, and a per-guardian (avg, max) pair.
u1 = _unwrapDataFrame(df.filter(df["absences"] >= 5).agg(F.count("*")))
u2 = _unwrapDataFrame(df.groupBy(F.col("guardian")).agg(F.count("*")))
u3 = _unwrapDataFrame(df.groupBy(F.col("guardian")).agg(F.avg("absences"), F.max("absences")))

print(u1)
print(u2)
print(u3)

# Round trip: write each vector back into a freshly built copy of the same
# query result; the tables shown should match the original aggregates.
r1 = _updateDataFrame(u1, df.filter(df["absences"] >= 5).agg(F.count("*")))
r2 = _updateDataFrame(u2, df.groupBy(F.col("guardian")).agg(F.count("*")))
r3 = _updateDataFrame(u3, df.groupBy(F.col("guardian")).agg(F.avg("absences"), F.max("absences")))

r1.show()
r2.show()
r3.show()



[151]
[ 90 273  32]
[ 3.97777778  5.83516484  9.5        21.         75.         40.        ]


  if is_categorical_dtype(series.dtype):


   count(1)
0       151


  if is_categorical_dtype(series.dtype):


   count(1)
0        90
1       273
2        32


  if is_categorical_dtype(series.dtype):


   avg(absences)  max(absences)
0       3.977778           21.0
1       5.835165           75.0
2       9.500000           40.0




+--------+
|count(1)|
+--------+
|     151|
+--------+

+--------+--------+
|guardian|count(1)|
+--------+--------+
|  father|      90|
|  mother|     273|
|   other|      32|
+--------+--------+

+--------+-----------------+-------------+
|guardian|    avg(absences)|max(absences)|
+--------+-----------------+-------------+
|  father|3.977777777777778|         21.0|
|  mother|5.835164835164835|         75.0|
|   other|              9.5|         40.0|
+--------+-----------------+-------------+

