# Loading/validating input sources

In [None]:
import os
import tempfile

from tmlt.core.domains.base import OutOfDomainError

from tmlt.analytics.query_builder import QueryBuilder, ColumnType
from tmlt.analytics.privacy_budget import PureDPBudget
from tmlt.analytics.session import Session

In [None]:
# Raw CSV text for the two demo tables. The private table carries a free-text
# 'remark' column; the public table has a non-numeric 'area' entry and an
# empty trailing 'alt' field in its last row.
private_data = """id,lat,lon,zone,remark
1,2.1,3.2,001,xyz
2,3.1,2.2,011,abc
3,2.3,1.2,011,abc"""
public_data = """zone,name,area,alt
001,z1,12.3,1.0
010,z2,INVALID_VALUE,1230
011,z3,451.1,"""

# Write both tables into a fresh scratch directory so they can be loaded by
# path. Nothing in this cell removes the directory afterwards.
tmp_dir = tempfile.mkdtemp()
private_csv = os.path.join(tmp_dir, "private.csv")
public_csv = os.path.join(tmp_dir, "public.csv")
for csv_path, csv_text in ((private_csv, private_data), (public_csv, public_data)):
    with open(csv_path, "w") as f:
        f.write(csv_text)

### Define a schema for the datasets

In [None]:
# A schema may name only a subset of a dataset's columns: 'remark' is left out
# of the private schema, and 'area' and 'alt' are left out of the public one.
private_schema = dict(
    id=ColumnType.INTEGER,
    lat=ColumnType.DECIMAL,
    lon=ColumnType.DECIMAL,
    zone=ColumnType.VARCHAR,
)
# Both public columns used here are text.
public_schema = {column: ColumnType.VARCHAR for column in ("zone", "name")}

### Create a session and add a public source

In [None]:
# Build the session from the private CSV with an infinite pure-DP budget,
# then register the public CSV under the source id "public".
infinite_budget = PureDPBudget(float("inf"))
dp_session = Session.from_csv(
    privacy_budget=infinite_budget,
    source_id="private",
    path=private_csv,
    schema=private_schema,
)
dp_session.add_public_csv(source_id="public", path=public_csv, schema=public_schema)

In [None]:
# Print the schema the session reports for each registered source.
for label, source in (("PRIVATE SCHEMA:", "private"), ("PUBLIC SCHEMA:", "public")):
    print(label, dp_session.get_schema(source))

In [None]:
# Look up the dataframe the session holds for the "public" source and display it.
public_df = dp_session.public_source_dataframes["public"]
public_df.show()

In [None]:
# Query on the private source: join it with the public table, group on 'name'
# over the three listed values, and count.
qb = QueryBuilder("private")
zone_name_domain = {"name": ["z1", "z2", "z3"]}
count_by_zone_name = (
    qb.join_public("public")
    .groupby_domains(zone_name_domain)
    .count()
)

In [None]:
# Evaluate the count query with an infinite pure-DP budget and display the result.
answer = dp_session.evaluate(
    count_by_zone_name,
    privacy_budget=PureDPBudget(float("inf")),
)
answer.show()

### Errors with invalid inputs

In [None]:
# Bad schema: it names a column "ID", but the private CSV header spells it "id".
private_schema_invalid_column = dict(
    ID=ColumnType.INTEGER,  # deliberate typo in the column name
    lat=ColumnType.DECIMAL,
    lon=ColumnType.DECIMAL,
    zone=ColumnType.VARCHAR,
)
# Loading with this schema is expected to fail; the ValueError is printed
# instead of being allowed to propagate.
try:
    Session.from_csv(
        privacy_budget=PureDPBudget(float("inf")),
        source_id="private",
        path=private_csv,
        schema=private_schema_invalid_column,
    )
except ValueError as err:
    print(err)

In [None]:
# Type mismatch: 'remark' holds text in the private CSV but is declared INTEGER.
private_schema_mismatched_type = dict(
    id=ColumnType.INTEGER,
    lat=ColumnType.DECIMAL,
    lon=ColumnType.DECIMAL,
    zone=ColumnType.VARCHAR,
    remark=ColumnType.INTEGER,
)
# Loading with this schema is expected to fail; the OutOfDomainError is
# printed instead of being allowed to propagate.
try:
    Session.from_csv(
        privacy_budget=PureDPBudget(float("inf")),
        source_id="private",
        path=private_csv,
        schema=private_schema_mismatched_type,
    )
except OutOfDomainError as err:
    print(err)

In [None]:
# NULL in a declared column: the last row of the public CSV leaves 'alt' empty.
public_schema_null_values = {
    column: ColumnType.VARCHAR for column in ("zone", "name", "alt")
}
# Registering the file as a second public source ("public_2") with this schema
# is expected to fail; the OutOfDomainError is printed instead of propagating.
try:
    dp_session.add_public_csv(
        source_id="public_2",
        path=public_csv,
        schema=public_schema_null_values,
    )
except OutOfDomainError as err:
    print(err)