# Basic PUMS Analysis with OpenDP

This notebook will be a brief tutorial on doing data analysis within the OpenDP system.

We will start out by setting up our environment -- loading the necessary libraries and establishing the very basic
things we need to know before loading our data (the file path and variable names).

In [151]:
# load libraries
from opendp.trans import *
from opendp.meas import *
from opendp.core import *

# establish data information
data_path = os.path.join('.', 'data', 'PUMS_california_demographics_1000', 'data.csv')
var_names = ["age", "sex", "educ", "race", "income", "married", "pid"]

with open(data_path) as input_data:
    data = input_data.read()

### Properties

*TODO* OpenDP architeture description here

In [152]:
# Create a chained computation
preprocessor = (
    # Convert data into Vec<Vec<String>>
    make_split_dataframe(separator=",", col_names=[0, 1, 2]) >>
    # Selects a column of df, Vec<str>
    make_select_column(key=0, T=str) >>
    # Cast the column as Vec<Int>
    make_cast(TI=str, TO=int) >>
    # Impute missing values to 0
    make_impute_constant(0) >>
    # Clamp age values
    make_clamp(20, 50)
)
res = preprocessor(data)
print("Age column: ", res)

Age column:  [20, 50, 31, 36, 50, 39, 34, 50, 50, 40, 27, 50, 31, 50, 50, 39, 50, 32, 50, 24, 48, 50, 43, 29, 44, 50, 27, 50, 32, 50, 28, 50, 35, 36, 50, 21, 29, 44, 35, 43, 50, 50, 42, 32, 50, 20, 40, 42, 50, 50, 45, 39, 28, 46, 45, 32, 22, 50, 21, 50, 50, 38, 40, 34, 48, 50, 46, 40, 26, 37, 30, 50, 42, 24, 31, 20, 33, 47, 20, 33, 50, 23, 50, 23, 47, 48, 43, 31, 47, 50, 50, 50, 29, 33, 50, 28, 29, 38, 42, 50, 50, 37, 40, 30, 20, 50, 50, 22, 50, 48, 46, 45, 37, 50, 25, 40, 34, 50, 43, 42, 22, 45, 32, 23, 20, 50, 44, 43, 34, 32, 38, 40, 50, 31, 42, 47, 50, 30, 42, 50, 26, 50, 22, 28, 25, 30, 50, 33, 50, 23, 36, 50, 29, 29, 20, 44, 50, 50, 50, 33, 50, 35, 47, 50, 30, 31, 30, 50, 50, 22, 50, 49, 44, 20, 35, 38, 50, 32, 29, 36, 21, 43, 50, 45, 37, 41, 46, 29, 40, 24, 50, 50, 43, 50, 38, 50, 32, 50, 27, 50, 50, 50, 50, 50, 31, 50, 29, 20, 27, 50, 50, 27, 37, 50, 38, 50, 23, 34, 39, 50, 30, 39, 40, 43, 34, 50, 50, 50, 43, 31, 36, 50, 50, 38, 45, 32, 50, 50, 47, 42, 30, 36, 42, 50, 50, 50, 50

In [153]:
# set sample size
n = 1_000

# set ranges/feasible values
age_range = (0., 100.)
sex_vals = [0, 1]
educ_vals = [i for i in range(1, 17)]
race_vals = [i for i in range(1, 7)]
income_range = (0., 500_000.)
married_vals = [0, 1]


In [154]:
with open(data_path) as input_data:
    data = input_data.read()

In [155]:
# preprocessor = (
#     # Convert data into Vec<Vec<String>>
#     make_split_dataframe(separator=",", col_names=[0, 1, 2, 3, 4]) >>
#     # Selects a column of df, Vec<str>
#     make_select_column(key=4, T=str) >>
#     # Cast the column as Vec<Int>
#     make_cast(TI=str, TO=float) >>
#     # Impute missing values to 0
#     make_impute_constant(0) >>
#     # Clamp age values
#     make_clamp(1000, 1_000_000) >>
#     make_bounded_sum(lower=0, upper=1_0000)
#     # make_base_geometric(scale=1.0)
#
# )


# res = preprocessor(data)
# print("Income column: ", res)


In [156]:
query = make_bounded_mean(lower=0., upper=10., n=9)
print("Bounded mean: ", query([float(x) for x in range(0,10)]))

In [157]:
clamp = (
    # Impute missing values to 0
    # make_impute_constant(0.) >>
    # Clamp age values
    make_clamp(0., 5.) # >>
    # make_bounded_sum(lower=0., upper=1_0000.)
    # make_base_geometric(scale=1.0)

)

bounded_sum = (
    make_bounded_sum(lower=0., upper=5.)
)


res = clamp([float(x) for x in range(0,10)])
print("Clamped: ", res)
res = bounded_sum(res)
print("Sum: ", res)



Bounded mean:  5.0


Clamped:  [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0]
Sum:  35.0
