### Multiple Tables

In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import lower, col, count, concat_ws
from pyspark.sql.types import Row
from pyspark import RDD
from typing import List, Tuple, Callable, Dict, Optional, Any, NamedTuple
import numpy as np
import scipy.stats as stats
from tqdm import tqdm

from pacdb import PACDataFrame, Sampler, DataFrameSampler, SamplerOptions, minimal_permutation_distance

# Build (or reuse) the Hive-enabled Spark session named "pacdb".
session_builder = SparkSession.builder.appName("pacdb")
for conf_key, conf_value in (
    ("spark.executor.memory", "512M"),
    ("spark.sql.warehouse.dir", ".spark"),
):
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.enableHiveSupport().getOrCreate()

# Switch on the Arrow setting for PySpark on the live session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.lines as mlines
import matplotlib.patches as mpatches

# Font setup: a Times-style serif face either through LaTeX or through matplotlib's own text engine.
LATEX = False
font_settings = (
    {"text.usetex": True, "font.family": "serif", "font.serif": "Times"}
    if LATEX
    else {"text.usetex": False, "font.family": "Times New Roman", "mathtext.fontset": "stix"}
)
mpl.rcParams.update(font_settings)

# Export settings: SVG font handling mode 'none' and 300 dpi for saved figures.
mpl.rcParams.update({"svg.fonttype": "none", "savefig.dpi": 300})

# Render inline figures as SVG.
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

mpl.rcParams.update({"axes.titleweight": "bold"})

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/14 12:15:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/14 12:15:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Load the unsplit lung dataset from a Parquet file (path is relative to the working directory).
lung_df = spark.read.parquet("./data/lung.parquet")
# Disabled: would also register the unsplit data as a "lung" table for SQL queries.
#lung_df.write.saveAsTable("lung", mode="overwrite") # for SQL queries

                                                                                

In [3]:
# Split the lung table into three saved tables; each keeps "Patient Id" so they can be re-joined.
IDENTITY_COLUMNS = ["Name", "index", "Patient Id", "Age", "Gender"]
SYMPTOM_COLUMNS = [
    "Patient Id", "Chest Pain", "Coughing of Blood", "Fatigue", "Weight Loss",
    "Shortness of Breath", "Wheezing", "Swallowing Difficulty",
    "Clubbing of Finger Nails", "Frequent Cold", "Dry Cough", "Snoring",
]
RISK_FACTOR_COLUMNS = [
    "Patient Id", "Air Pollution", "Alcohol use", "Dust Allergy",
    "Occupational Hazards", "Genetic Risk", "Chronic Lung Disease",
    "Balanced Diet", "Obesity", "Smoking", "Passive Smoker",
]

identity_df: DataFrame = lung_df.select(*IDENTITY_COLUMNS)
identity_df.write.saveAsTable("identity", mode="overwrite")

symptoms_df: DataFrame = lung_df.select(*SYMPTOM_COLUMNS)
symptoms_df.write.saveAsTable("symptoms", mode="overwrite")

risk_factors_df: DataFrame = lung_df.select(*RISK_FACTOR_COLUMNS)
risk_factors_df.write.saveAsTable("risk_factors", mode="overwrite")

24/03/14 12:15:28 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/03/14 12:15:28 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/03/14 12:15:31 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/03/14 12:15:31 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore michael@10.0.0.4
24/03/14 12:15:32 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
24/03/14 12:15:34 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/03/14 12:15:36 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
24/03/14 12:15:37 WARN HiveConf: 

In [4]:
# Reference (non-private) answer: per decade-wide age group, count patients with
# `Chest Pain` > 1 and Smoking >= 3, joining the three tables on `Patient Id`.
age_group_counts_sql = """
        SELECT FLOOR(identity.Age / 10) * 10 AS Age_Group, COUNT(*) AS Count
        FROM identity
        JOIN symptoms ON identity.`Patient Id` = symptoms.`Patient Id`
        JOIN risk_factors ON identity.`Patient Id` = risk_factors.`Patient Id`
        WHERE risk_factors.Smoking >= 3
          AND symptoms.`Chest Pain` > 1
        GROUP BY FLOOR(identity.Age / 10) * 10
        ORDER BY Age_Group
        """
spark.sql(age_group_counts_sql).toPandas()

Unnamed: 0,Age_Group,Count
0,10,28
1,20,106
2,30,222
3,40,120
4,50,30
5,60,31
6,70,10


### Do all joins and filters first

In [5]:
# 0. Apply every filter and join up front so a single Spark DataFrame holds the rows of interest,
#    then wrap that frame in a PACDataFrame.
chest_pain_df = symptoms_df.filter(symptoms_df["Chest Pain"] > 1)
smokers_df = risk_factors_df.filter(risk_factors_df["Smoking"] >= 3)
spark_df: DataFrame = (
    identity_df
    .join(chest_pain_df, "Patient Id")
    .join(smokers_df, "Patient Id")
)
pac_df: PACDataFrame = PACDataFrame(spark_df)
               
# Define your query as a function that takes a DataFrame and returns the aggregated DataFrame
def A(x: DataFrame) -> DataFrame:
    """Function to make private: count rows per decade-wide age group.

    Args:
        x: DataFrame with an "Age" column.

    Returns:
        A DataFrame with the columns "Age_Group" (Age / 10, cast to int, times 10)
        and "count" (rows in that group), ordered by "Age_Group".
    """
    age_group = (x["Age"] / 10).cast("int") * 10
    return (x.withColumn("Age_Group", age_group)
             .groupBy("Age_Group")
             .count()
             .orderBy("Age_Group"))

# Attach the query function A to the PACDataFrame; pac_df is rebound to whatever withQuery returns.
pac_df = pac_df.withQuery(A)

In [6]:
# 1. Sampling
# Configure the PACDataFrame: sampler options, number of trials, mutual-information bound.
sampler_options = SamplerOptions(withReplacement=False, fraction=0.5)
pac_df = pac_df.withSamplerOptions(sampler_options)
pac_df = pac_df.setNumberOfTrials(200)
pac_df = pac_df.setMutualInformationBound(1./8)

pac_df

<pacdb.main.PACDataFrame at 0x29ea374d0>

In [8]:
# NOTE(review): this calls a private pacdb method directly; presumably it fills pac_df.X
# with the subsampled DataFrames — confirm against the pacdb source.
pac_df._subsample()
X = pac_df.X
# Display the first entry of pac_df.X.
pac_df.X[0]

Subsample: 100%|██████████| 400/400 [00:01<00:00, 285.39it/s]


[DataFrame[Patient Id: string, Name: string, index: bigint, Age: bigint, Gender: bigint, Chest Pain: bigint, Coughing of Blood: bigint, Fatigue: bigint, Weight Loss: bigint, Shortness of Breath: bigint, Wheezing: bigint, Swallowing Difficulty: bigint, Clubbing of Finger Nails: bigint, Frequent Cold: bigint, Dry Cough: bigint, Snoring: bigint, Air Pollution: bigint, Alcohol use: bigint, Dust Allergy: bigint, Occupational Hazards: bigint, Genetic Risk: bigint, Chronic Lung Disease: bigint, Balanced Diet: bigint, Obesity: bigint, Smoking: bigint, Passive Smoker: bigint],
 DataFrame[Patient Id: string, Name: string, index: bigint, Age: bigint, Gender: bigint, Chest Pain: bigint, Coughing of Blood: bigint, Fatigue: bigint, Weight Loss: bigint, Shortness of Breath: bigint, Wheezing: bigint, Swallowing Difficulty: bigint, Clubbing of Finger Nails: bigint, Frequent Cold: bigint, Dry Cough: bigint, Snoring: bigint, Air Pollution: bigint, Alcohol use: bigint, Dust Allergy: bigint, Occupational H

In [9]:
# 2. Measure Stability

# Eventually this should become something like PACDataFrame.analyze().
# The open problem is that sampling changes the outcome of the predicate, and this has to be
# handled for every predicate type: e.g. count and sum must be multiplied to cancel out the sampling rate.

# NOTE(review): this calls a private pacdb method directly; presumably it fills pac_df.Y and
# pac_df.Y_pairs with the query outputs — confirm against the pacdb source.
pac_df._measure_stability()
Y = pac_df.Y
Y_pairs = pac_df.Y_pairs
# Display the first entry of Y.
Y[0]

Measure Stability: 100%|██████████| 400/400 [00:16<00:00, 23.67it/s]


[DataFrame[Age_Group: int, count: bigint],
 DataFrame[Age_Group: int, count: bigint],
 DataFrame[Age_Group: int, count: bigint]]

In [11]:
# Convert the first DataFrame of the first entry of Y to pandas to inspect one query output.
Y[0][0].toPandas()

Unnamed: 0,Age_Group,count
0,10,12
1,20,49
2,30,103
3,40,59
4,50,21
5,60,15
6,70,4


In [None]:
# 3. Estimate Noise
c = 0.001
max_mi = 1/8  # 2 * v

import pandas as pd

# One trial per entry of Y_pairs. Deriving the count here (instead of reading a
# `trials` name that no cell defines) keeps the cell runnable on a fresh kernel
# and guarantees the loop below never indexes past the end of Y_pairs.
trials = len(Y_pairs)
if trials == 0:
    raise ValueError("Y_pairs is empty; run pac_df._measure_stability() first")


def _count_series(result: DataFrame) -> "pd.Series":
    """Return the "count" column of one query output, indexed by "Age_Group"."""
    return result.toPandas().set_index("Age_Group")["count"]


# Collect every pair once (one toPandas() call per DataFrame).
pair_counts = [
    (_count_series(Y_pairs[trial][0]), _count_series(Y_pairs[trial][1]))
    for trial in range(trials)
]

# A subsample can miss an age group entirely, in which case its result has fewer
# rows. Comparing by position would then misalign (or overrun) the vectors, so
# align every vector on the union of observed groups, counting absent groups as 0.
age_groups = sorted({group for pair in pair_counts for counts in pair for group in counts.index})
tau = len(age_groups)  # dimension of the output vector

# Average, over trials, of the mean per-coordinate squared difference of each pair.
avg_dist = 0
for counts_1, counts_2 in pair_counts:
    y1: np.ndarray = counts_1.reindex(age_groups, fill_value=0).to_numpy()
    y2: np.ndarray = counts_2.reindex(age_groups, fill_value=0).to_numpy()
    dist = np.sum((y1 - y2) ** 2) / tau if tau else 0.
    avg_dist += dist

avg_dist /= trials

In [None]:
# Display the trial-averaged squared distance between paired query outputs.
avg_dist

In [None]:
import paclib
# At different levels of MI, compute noise to add.
# Each row of noise_params is [max_mi, mean, variance] of the returned noise object.
noise_params = []
# NOTE: the loop variable is named `max_mi`, so any earlier value bound to that name is overwritten.
for max_mi in [1/64, 1/32, 1/16, 1/8, 1/4, 1/2, 1., 2., 4.]:
    # Evaluate once per MI level and reuse the result for printing and recording.
    # NOTE(review): assumes noise_to_add is deterministic for fixed arguments — confirm.
    noise = paclib.noise_to_add(avg_dist, c, max_mi)
    print(f"avg_dist: {avg_dist}, c: {c}, max_mi: {max_mi:8} => {noise}")
    noise_params.append([max_mi, noise.mean, noise.variance])

In [None]:
# 4. Noised Release

# obtain one more sample to use for noised release
# NOTE(review): _applyPredicate is a private pacdb method; presumably it runs the attached
# query on the fresh sample — confirm against the pacdb source.
Yj = pac_df._applyPredicate(pac_df.sample())
# Take the "count" column of the query output as a vector.
Yj_arr: np.ndarray = np.array(Yj.toPandas()["count"])
print(Yj_arr)

# Parameters passed to paclib.noise_to_add together with avg_dist.
c = 0.001
mi = 1/16

# Draw one noise sample; the result is random, so re-running this cell changes the output.
noise_to_add = paclib.noise_to_add(avg_dist, c, mi).sample()
print(noise_to_add)

# Noised release: the sampled query output plus the drawn noise.
noised_Yj = Yj_arr + noise_to_add
noised_Yj