In [6]:
from typing import Callable, List
import numpy as np
import pandas as pd

import pyspark.sql
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

spark: SparkSession = (SparkSession.builder.appName("pacdb")
         .config("spark.executor.memory", "512M")
         .config("spark.sql.warehouse.dir", ".spark")
         .enableHiveSupport()
         .getOrCreate())

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

import matplotlib as mpl
import matplotlib.pyplot as plt

# set font to Times New Roman
LATEX = False
if LATEX:
    mpl.rcParams['text.usetex'] = True
    mpl.rcParams["font.family"] = "serif"
    mpl.rcParams["font.serif"] = "Times"
else:
    mpl.rcParams['text.usetex'] = False
    mpl.rcParams["font.family"] = "Times New Roman"
    mpl.rcParams["mathtext.fontset"] = "stix"
    
plt.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['savefig.dpi'] = 300

import matplotlib_inline.backend_inline  # type: ignore
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

mpl.rcParams['axes.titleweight'] = 'bold'

24/04/09 13:52:30 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
math_df: pyspark.sql.DataFrame = spark.read.csv("./data/student_performance/student-mat.csv", header=True, inferSchema=True, sep=";")
#portuguese_df = spark.read.csv("./data/student_performance/student-por.csv", header=True, inferSchema=True, sep=";")

In [8]:
from pacdb import PACDataFrame, PACOptions, SamplerOptions



In [4]:
df = math_df

query_name: str = "count"
budget_list: List[float] = [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1., 2., 4.]
sample_size: int = 3
sampling_rate: float = 0.5
m: int = 10
c: float = 1e-6
mi: float = 1./4

In [5]:
def query(df):
    return df.filter(df["absences"] >= 5).agg(F.count("*"))

pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))

pac_df.releaseValue()

Found output format of query: 


  if is_categorical_dtype(series.dtype):


+--------+
|count(1)|
+--------+
|     0.0|
+--------+

max_mi: 0.25, eta: 0.05, dimensions: 1
Using the identity matrix as the projection matrix.


0it [00:00, ?it/s]

Converged after 291 trials
Final variance estimates: [36.90320142653015]
sqrt total var is 6.074800525657625
Computed noise (variances) is [73.8064028530603]
Sample: [90] + Noise: [73.8064028530603] = Noised: [75.40049736497639]


  if is_categorical_dtype(series.dtype):


Inserting to dataframe:
+-----------------+
|         count(1)|
+-----------------+
|75.40049736497639|
+-----------------+



DataFrame[count(1): double]

In [6]:
def query(df):
    return df.groupBy(F.col("guardian")).agg(F.count("*"))

pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))

pac_df.releaseValue()

Found output format of query: 


  if is_categorical_dtype(series.dtype):


+--------+--------+
|guardian|count(1)|
+--------+--------+
|  father|     0.0|
|  mother|     0.0|
|   other|     0.0|
+--------+--------+

max_mi: 0.25, eta: 0.05, dimensions: 3
Using the identity matrix as the projection matrix.


0it [00:00, ?it/s]

Converged after 1301 trials
Final variance estimates: [21.487457469303163, 70.25039687439629, 7.937531645083514]
sqrt total var is 9.983756106234916
Computed noise (variances) is [92.55853536374374, 167.35872160506773, 56.25572079431135]
Sample: [ 49 136  17] + Noise: [92.55853536374374, 167.35872160506773, 56.25572079431135] = Noised: [48.825404333419996, 100.16134904466116, 19.554682773846373]


  if is_categorical_dtype(series.dtype):


Inserting to dataframe:
+--------+------------------+
|guardian|          count(1)|
+--------+------------------+
|  father|48.825404333419996|
|  mother|100.16134904466116|
|   other|19.554682773846373|
+--------+------------------+



DataFrame[guardian: string, count(1): double]

In [9]:
query(df).show()

+--------+--------+
|guardian|count(1)|
+--------+--------+
|  father|      90|
|  mother|     273|
|   other|      32|
+--------+--------+



In [10]:
def query(df):
    return df.groupBy(F.col("guardian")).agg(F.avg("absences"), F.max("absences"))

pac_df = (PACDataFrame.fromDataFrame(df)
                    .withOptions(PACOptions(trials = m, max_mi = mi, c = c))
                    .withSamplerOptions(SamplerOptions(fraction=sampling_rate))
                    .withQuery(lambda x: query(x)))

pac_df.releaseValue()

Found output format of query: 


  if is_categorical_dtype(series.dtype):


+--------+-------------+-------------+
|guardian|avg(absences)|max(absences)|
+--------+-------------+-------------+
|  father|          0.0|          0.0|
|  mother|          0.0|          0.0|
|   other|          0.0|          0.0|
+--------+-------------+-------------+

max_mi: 0.25, eta: 0.05, dimensions: 6
Using the identity matrix as the projection matrix.


0it [00:00, ?it/s]

Converged after 2031 trials
Final variance estimates: [0.2673946669479816, 0.2610972296632544, 3.4483907699257403, 6.138180215522039, 243.33233986939516, 65.47905155951779]
sqrt total var is 17.858512096783763
Computed noise (variances) is [18.46935115605126, 18.250568614365882, 66.32595393452247, 88.49018142275902, 557.1539118546636, 289.01904259423685]
Sample: [ 4.825       6.0738255   8.88888889 21.         75.         40.        ] + Noise: [18.46935115605126, 18.250568614365882, 66.32595393452247, 88.49018142275902, 557.1539118546636, 289.01904259423685] = Noised: [13.888134009570617, -14.747491301075355, 11.092936084403693, -93.90990460306088, -299.8511359518356, 396.8721673352766]
Inserting to dataframe:


  if is_categorical_dtype(series.dtype):


+--------+-------------------+------------------+
|guardian|      avg(absences)|     max(absences)|
+--------+-------------------+------------------+
|  father| 13.888134009570617|-93.90990460306088|
|  mother|-14.747491301075355|-299.8511359518356|
|   other| 11.092936084403693| 396.8721673352766|
+--------+-------------------+------------------+



DataFrame[guardian: string, avg(absences): double, max(absences): double]

Conversion between dataframes and numpy vectors

In [11]:
from typing import Optional
import pyspark.sql.dataframe
import pyspark.sql.types as T
import pyspark.pandas as ps

df = math_df

def _unwrapDataFrame(df: pyspark.sql.DataFrame) -> np.ndarray:
    """
    Convert a PySpark DataFrame into a numpy vector.
    """
    
    numeric_columns: List[str] = [f.name for f in df.schema.fields if isinstance(f.dataType, T.NumericType)]
    df_numeric: pyspark.sql.DataFrame = df.select(*numeric_columns)  # select only numeric columns
    np_array: np.ndarray = np.array(df_numeric.collect())

    flat: np.ndarray = np_array.flatten(order="F")

    return flat

def _updateDataFrame(vec: np.ndarray, df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    """
    Use the values of the numpy vector to update the PySpark DataFrame.
    """

    numeric_columns: List[str] = [f.name for f in df.schema.fields if isinstance(f.dataType, T.NumericType)]
    df_numeric: pyspark.sql.DataFrame = df.select(*numeric_columns)  # select only numeric columns
    shape = np.array(df_numeric.collect()).shape

    np_array = vec.reshape(shape, order="F")
    new_pandas: ps.DataFrame = ps.DataFrame(np_array, columns=numeric_columns)
    print(new_pandas)

    old_pandas = df.pandas_api()
    old_pandas.update(new_pandas)

    return old_pandas.to_spark()


u1 = _unwrapDataFrame(df.filter(df["absences"] >= 5).agg(F.count("*")))
u2 = _unwrapDataFrame(df.groupBy(F.col("guardian")).agg(F.count("*")))
u3 = _unwrapDataFrame(df.groupBy(F.col("guardian")).agg(F.avg("absences"), F.max("absences")))

print(u1)
print(u2)
print(u3)

r1 = _updateDataFrame(u1, df.filter(df["absences"] >= 5).agg(F.count("*")))
r2 = _updateDataFrame(u2, df.groupBy(F.col("guardian")).agg(F.count("*")))
r3 = _updateDataFrame(u3, df.groupBy(F.col("guardian")).agg(F.avg("absences"), F.max("absences")))

r1.show()
r2.show()
r3.show()



[151]
[ 90 273  32]
[ 3.97777778  5.83516484  9.5        21.         75.         40.        ]


  if is_categorical_dtype(series.dtype):


   count(1)
0       151


  if is_categorical_dtype(series.dtype):


   count(1)
0        90
1       273
2        32


  if is_categorical_dtype(series.dtype):


   avg(absences)  max(absences)
0       3.977778           21.0
1       5.835165           75.0
2       9.500000           40.0




+--------+
|count(1)|
+--------+
|     151|
+--------+

+--------+--------+
|guardian|count(1)|
+--------+--------+
|  father|      90|
|  mother|     273|
|   other|      32|
+--------+--------+

+--------+-----------------+-------------+
|guardian|    avg(absences)|max(absences)|
+--------+-----------------+-------------+
|  father|3.977777777777778|         21.0|
|  mother|5.835164835164835|         75.0|
|   other|              9.5|         40.0|
+--------+-----------------+-------------+

