# BVM library examples: single-dataset assessment for all combinations of quasi-identifiers (QIDs)

In [1]:
import pandas
from bvmlib.bvm import BVM

In [2]:
import os
import multiprocessing
from itertools import chain, combinations
import tqdm
tqdm.tqdm.monitor_interval = 0

def powerset(iterable):
    """Return an iterator over every subset of ``iterable``, as tuples.

    Subsets come out grouped by size, starting with the empty tuple:
    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    # Materialize once so that one-shot iterables can be traversed for every size.
    items = list(iterable)
    subsets_by_size = (combinations(items, size) for size in range(len(items) + 1))
    return chain.from_iterable(subsets_by_size)

## INEP School Census for 2018

In [3]:
def load(source='/home/nunesgh/temp/inep-school-2018.csv'):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    source : str, optional
        Path of the CSV file to read. Defaults to the INEP School Census 2018
        extract used throughout this notebook, so ``load()`` keeps working
        unchanged; pass another path to run the notebook on a different machine
        or dataset.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``source``.
    """
    # low_memory=False makes pandas parse the file in one pass instead of in
    # chunks, which avoids mixed-dtype columns at the cost of more memory.
    return pandas.read_csv(source, low_memory=False)

In [4]:
# Load the dataset once; later cells read this module-level `df`.
df = load()
# Rich display of the frame (pandas shows a truncated preview for large frames).
display(df)

Unnamed: 0,NU_DIA,NU_MES,NU_ANO,TP_SEXO,TP_COR_RACA,TP_NACIONALIDADE,CO_PAIS_ORIGEM,CO_MUNICIPIO_NASC,CO_MUNICIPIO_END,IN_TRANSPORTE_PUBLICO,IN_NECESSIDADE_ESPECIAL,CO_ENTIDADE,TP_DEPENDENCIA
0,28,11,2006,2,1,1,76,2112407,2112407,0,0,21069131,3
1,7,7,2015,1,3,1,76,3170404,3116159,0,0,31337056,3
2,17,7,2015,1,0,1,76,3148103,3148103,0,0,31200212,4
3,25,1,2006,1,1,1,76,2611606,2611606,0,0,26122162,4
4,22,4,2007,1,3,1,76,1501303,1501303,1,0,15036774,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48176418,13,6,2003,2,3,1,76,5003702,5003504,0,0,50015591,2
48176419,5,8,2008,2,1,1,76,3550308,-1,0,0,35813000,4
48176420,12,8,2012,2,3,1,76,3521804,-1,1,0,35498973,3
48176421,2,2,2004,1,1,1,76,3550308,-1,0,0,35103548,4


Description of selected attributes.
- 'NU_DIA': day of birth
- 'NU_MES': month of birth
- 'NU_ANO': year of birth
- 'TP_SEXO': gender
- 'TP_COR_RACA': ethnicity
- 'TP_NACIONALIDADE': nationality
- 'CO_PAIS_ORIGEM': country of birth
- 'CO_MUNICIPIO_NASC': city of birth
- 'CO_MUNICIPIO_END': city of residency
- 'CO_ENTIDADE': school id code
- 'TP_DEPENDENCIA': school type
- 'IN_NECESSIDADE_ESPECIAL': disability status
- 'IN_TRANSPORTE_PUBLICO': public school transportation

In [5]:
# Module-level slots filled in by `init`, which is passed to the worker pool
# as its initializer.
data1 = None
data2 = None

def init(_data1, _data2):
    """Store the two given objects in the module-level ``data1`` and ``data2``."""
    global data1, data2
    data1, data2 = _data1, _data2

# Number of worker processes: one per CPU reported by the OS.
pool_size = multiprocessing.cpu_count() * 1

# Set this process's CPU affinity to every CPU. CPU ids are zero-based, so the
# last valid id is cpu_count() - 1 (the upper bound must not be the CPU count
# itself). The bound is taken from cpu_count() rather than pool_size so that it
# stays valid if the pool-size multiplier above is changed.
os.system('taskset -cp 0-%d %s' % (multiprocessing.cpu_count() - 1, os.getpid()))

# Columns treated as quasi-identifiers; every combination of them is assessed.
quasi_identifiers = ['NU_DIA','NU_MES','NU_ANO','TP_SEXO','TP_COR_RACA','TP_NACIONALIDADE','CO_PAIS_ORIGEM',
                     'CO_MUNICIPIO_NASC','CO_MUNICIPIO_END','CO_ENTIDADE','TP_DEPENDENCIA']
# Columns treated as sensitive attributes in every assessment.
sensitive_attributes = ['IN_NECESSIDADE_ESPECIAL','IN_TRANSPORTE_PUBLICO']

def job(quasi_identifiers):
    """Run one BVM assessment for a single combination of quasi-identifiers.

    Parameters
    ----------
    quasi_identifiers : iterable of str
        Column names to use as QIDs for this run. Note that this parameter
        shadows the module-level list of the same name inside the function.

    Returns
    -------
    The value returned by ``BVM.assess()``; a later cell indexes it with the
    keys 're_id' and 'att_inf'.
    """
    # Prefer the objects installed in this process by the pool initializer
    # `init`; fall back to the notebook-level globals when `init` has not run
    # (for example when `job` is called directly outside the pool).
    frame = data1 if data1 is not None else df
    sensitive = data2 if data2 is not None else sensitive_attributes

    inep18 = BVM(frame)
    inep18.qids(list(quasi_identifiers))
    inep18.sensitive(sensitive)
    return inep18.assess()

# Assess every subset of the quasi-identifiers in parallel. `imap` yields
# results in submission order, so pool_outputs[i] belongs to the i-th subset
# produced by powerset(). maxtasksperchild=1 gives each task a fresh worker
# process.
with multiprocessing.Pool(processes=pool_size, maxtasksperchild=1, initializer=init,
                          initargs=(df, sensitive_attributes)) as P:
    # A set of n elements has 2**n subsets, so the progress-bar total does not
    # require walking the powerset a second time.
    pool_outputs = list(tqdm.tqdm(P.imap(job, powerset(quasi_identifiers)),
                                  total=2 ** len(quasi_identifiers)))
    # Shut the workers down gracefully while the pool is still open: leaving
    # the `with` block calls terminate(), after which close() has no effect.
    P.close()
    P.join()

pid 40261's current affinity list: 0-39
pid 40261's new affinity list: 0-39


100%|██████████| 2048/2048 [8:53:29<00:00, 15.63s/it]   


In [6]:
# Merge the per-subset results into one table per assessment kind.
# pool_outputs[0] is skipped: it belongs to the empty QID combination, the
# first item yielded by powerset(). Each table is built with a single concat
# call; concatenating inside a loop would re-copy the accumulated rows on
# every iteration.
inep18 = {
    key: pandas.concat([output[key] for output in pool_outputs[1:]], ignore_index=True)
    for key in ('re_id', 'att_inf')
}

In [7]:
# Show both result tables in full (no row or column truncation), each indexed
# by its identifying column(s).
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    for table_key, index_columns in (('re_id', 'QID'), ('att_inf', ['QID', 'Sensitive'])):
        display(inep18[table_key].set_index(index_columns))

Unnamed: 0_level_0,dCR,pCR,Prior,Posterior,Histogram
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
['NU_DIA'],0.0,31,0.0,1e-06,"{'0': 1.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['NU_MES'],0.0,12,0.0,0.0,"{'0': 1.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['NU_ANO'],0.0,107,0.0,2e-06,"{'0': 0.9999851587154986, '1': 9.9218657225755..."
['TP_SEXO'],0.0,2,0.0,0.0,"{'0': 1.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['TP_COR_RACA'],0.0,6,0.0,0.0,"{'0': 1.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['TP_NACIONALIDADE'],0.0,3,0.0,0.0,"{'0': 1.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['CO_PAIS_ORIGEM'],0.0,203,0.0,4e-06,"{'0': 0.9999083576628344, '1': 4.8882832168756..."
['CO_MUNICIPIO_NASC'],0.0,5571,0.0,0.000116,"{'0': 0.9978074752457234, '1': 0.0017599895284..."
['CO_MUNICIPIO_END'],0.0,5564,0.0,0.000115,"{'0': 0.999524995867792, '1': 0.00033198811792..."
['CO_ENTIDADE'],2e-06,183706,0.0,0.003813,"{'0': 0.8191146320680553, '1': 0.1466345477745..."


Unnamed: 0_level_0,Unnamed: 1_level_0,dCA,pCA,Prior,Posterior,Histogram
QID,Sensitive,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
['NU_DIA'],IN_NECESSIDADE_ESPECIAL,0.0,1.0,0.975564,0.975564,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['NU_DIA'],IN_TRANSPORTE_PUBLICO,0.0,1.0,0.817506,0.817506,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['NU_MES'],IN_NECESSIDADE_ESPECIAL,0.0,1.0,0.975564,0.975564,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['NU_MES'],IN_TRANSPORTE_PUBLICO,0.0,1.0,0.817506,0.817506,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['NU_ANO'],IN_NECESSIDADE_ESPECIAL,1e-06,1.0,0.975564,0.975564,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['NU_ANO'],IN_TRANSPORTE_PUBLICO,0.0,1.0,0.817506,0.817506,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['TP_SEXO'],IN_NECESSIDADE_ESPECIAL,0.0,1.0,0.975564,0.975564,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['TP_SEXO'],IN_TRANSPORTE_PUBLICO,0.0,1.0,0.817506,0.817506,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['TP_COR_RACA'],IN_NECESSIDADE_ESPECIAL,0.0,1.0,0.975564,0.975564,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
['TP_COR_RACA'],IN_TRANSPORTE_PUBLICO,0.0,1.0,0.817506,0.817506,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


---