# Using Session.Builder

In [None]:
import os
import tempfile
import shutil

from pyspark.sql import SparkSession

from tmlt.analytics.privacy_budget import RhoZCDPBudget
from tmlt.analytics.session import Session
from tmlt.analytics.query_builder import QueryBuilder


### Setup data

In [None]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# First private table: (X, P) rows, with several rows sharing an X value.
df1_rows = [("A", 4), ("B", 2), ("B", 1), ("C", 3), ("C", 2)]
df1 = spark.createDataFrame(df1_rows, schema=["X", "P"])

# Second private table: (X, Q) rows, one row per X value.
df2_rows = [("A", 5), ("B", 10), ("C", 8)]
df2 = spark.createDataFrame(df2_rows, schema=["X", "Q"])

# Raw text of the third private source: a header row followed by six
# (X, R) records.
csv1_content = """X,R
A,J
A,K
A,L
B,M
B,N
C,O"""

# Write the CSV into a fresh temporary directory; the directory is removed
# again by the clean-up cell at the end of the notebook.
data_dir = tempfile.mkdtemp()
csv1_path = os.path.join(data_dir, "csv1.csv")
# An explicit encoding keeps the file independent of the platform default.
# The `with` block flushes and closes the file on exit, so no manual flush
# is needed.
with open(csv1_path, "w", encoding="utf-8") as f:
    f.write(csv1_content)

### Building a Session with multiple private sources

In [None]:
# Assemble a Session with a total zCDP budget of rho = 10 and three private
# sources: two in-memory DataFrames and one CSV file on disk.
session = (
    Session.Builder()
    .with_privacy_budget(privacy_budget=RhoZCDPBudget(10))
    .with_private_dataframe(
        source_id="DF1",
        dataframe=df1,
        stability=1,
    )
    .with_private_dataframe(
        source_id="DF2",
        dataframe=df2,
        stability=1,
    )
    # The CSV source needs an explicit column schema.
    .with_private_csv(
        source_id="CSV1",
        path=csv1_path,
        schema={"X": "VARCHAR", "R": "VARCHAR"},
        stability=1,
    )
    .build()
)

### Construct and evaluate queries

In [None]:
# Sum of column P in DF1, grouped over the fixed X domain {A, B, C}.
# NOTE(review): low/high presumably bound (clamp) the values of P before
# summing -- confirm against the QueryBuilder documentation.
df1_sum_P = (
    QueryBuilder(source_id="DF1")
    .groupby_domains({"X": ["A", "B", "C"]})
    .sum(column="P", low=1, high=3)
)

# Average of column Q over all of DF2, with bounds 1 and 10.
df2_average_Q = QueryBuilder(source_id="DF2").average(column="Q", low=1, high=10)

# Total number of records in CSV1.
csv1_count_R = QueryBuilder(source_id="CSV1").count()

# Evaluate the queries in order, giving each one an equal third of the
# session's total budget of 10.
answers = [
    session.evaluate(query_expr=query, privacy_budget=RhoZCDPBudget(10 / 3))
    for query in [df1_sum_P, df2_average_Q, csv1_count_R]
]

In [None]:
# Print every query result, in the order the queries were evaluated.
for result in answers:
    result.show()

In [None]:
# Clean up: delete the temporary directory (and the CSV file inside it)
# that was created during setup.
shutil.rmtree(path=data_dir)