# BVM library examples: single-dataset

In [1]:
import pandas
from bvmlib.bvm import BVM

## Small example

In [2]:
# Ten-record toy table used by the small example: a record id, two
# demographic columns (age, gender) and two further columns (grade, disability).
df0 = pandas.DataFrame({
    'id': list(range(1, 11)),
    'age': [25] * 5 + [49] * 4 + [60],
    'gender': list('FFFMMFFFMM'),
    'grade': list('AACBBCCEDD'),
    'disability': [False, True, True, True, False,
                   True, True, False, False, False],
})

In [3]:
# Render the toy table so the reader can check the records by eye.
display(df0)

Unnamed: 0,id,age,gender,grade,disability
0,1,25,F,A,False
1,2,25,F,A,True
2,3,25,F,C,True
3,4,25,M,B,True
4,5,25,M,B,False
5,6,49,F,C,True
6,7,49,F,C,True
7,8,49,F,E,False
8,9,49,M,D,False
9,10,60,M,D,False


In [4]:
# Wrap the toy table in a BVM object and declare the role of each column.
T0 = BVM(df0)
# 'age' and 'gender' are the quasi-identifiers (reported under 'QID' in the results).
T0.qids(['age', 'gender'])
# 'grade' and 'disability' are the sensitive attributes (reported under 'Sensitive').
T0.sensitive(['grade', 'disability'])

In [5]:
# Assign a worth to each value of the two sensitive attributes.
# Note the keys are strings, even for the boolean 'disability' column
# ('True'/'False', not True/False).
# NOTE(review): presumably bvmlib matches on the string form of each value -- confirm in the bvmlib docs.
T0.worth('disability',{'True':10,'False':2})
T0.worth('grade',{'A':10,'B':6,'C':6,'D':6,'E':8})

In [6]:
# Run the assessment. The result is indexed by 're_id', 'att_inf' and
# 'information_worth' when it is displayed.
T0_results = T0.assess()

In [7]:
# Lift pandas' row/column display limits for this cell only, then show the
# three result tables in order.
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    for table in ('re_id', 'att_inf', 'information_worth'):
        display(T0_results[table])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,"['age', 'gender']",0.2,5,0.1,0.5,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,"['age', 'gender']",grade,0.4,2.666667,0.3,0.8,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
1,"['age', 'gender']",disability,0.2,1.4,0.5,0.7,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,Prior Worth,Posterior Worth
0,"['age', 'gender']",grade,2.0,5.6
1,"['age', 'gender']",disability,5.0,5.4


## [Adult dataset](https://archive.ics.uci.edu/ml/datasets/Adult)

In [8]:
def load1(source='https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'):
    """Load the selected columns of the UCI Adult dataset.

    No header row is read from the file: the 15 column names are supplied
    via ``names`` and only the columns listed in ``attributes`` are kept.

    Parameters
    ----------
    source : str, path or file-like, optional
        Anything ``pandas.read_csv`` accepts. Defaults to the UCI URL, so
        calling ``load1()`` with no argument downloads the file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the nine selected columns in file order.
    """
    header = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
              'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
              'classification']
    attributes = ['age', 'sex', 'race', 'native-country', 'marital-status', 'relationship', 'workclass',
                  'occupation', 'education-num']
    # adult.data separates fields with ', ' (comma + space). Without
    # skipinitialspace every string value would keep a leading blank
    # (' Male' instead of 'Male'), which silently breaks lookups by value.
    return pandas.read_csv(source, names=header, usecols=attributes,
                           skipinitialspace=True, low_memory=False)

In [9]:
# Fetch the Adult data (load1 reads from the UCI URL) and show it.
# pandas elides the middle rows of a large frame, so only the first and
# last few rows are rendered.
df1 = load1()
display(df1)

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,native-country
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...,...
32556,27,Private,12,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,40,Private,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,58,Private,9,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,22,Private,9,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [10]:
# Wrap the Adult table in a BVM object and declare the role of each column.
T1 = BVM(df1)
# Seven demographic/employment columns act as the quasi-identifiers.
T1.qids(['age', 'sex', 'race', 'native-country', 'marital-status', 'workclass', 'occupation'])
# 'relationship' and 'education-num' are the sensitive attributes.
T1.sensitive(['relationship', 'education-num'])

Description of selected sensitive attribute values.
- 'education-num'
    - 1: Preschool
    - 2: 1st-4th
    - 3: 5th-6th
    - 4: 7th-8th
    - 5: 9th
    - 6: 10th
    - 7: 11th
    - 8: 12th
    - 9: HS-grad
    - 10: Some-college
    - 11: Assoc-voc
    - 12: Assoc-acdm
    - 13: Bachelors
    - 14: Masters
    - 15: Prof-school
    - 16: Doctorate

In [11]:
# Give a worth to four education levels: 9 (HS-grad), 13 (Bachelors),
# 14 (Masters) and 16 (Doctorate). Keys are the numeric codes written as strings.
# 'relationship' gets no worth, and only 'education-num' is listed in the
# 'information_worth' table of the results.
# NOTE(review): how bvmlib treats education levels without an entry is not visible here -- confirm.
T1.worth('education-num',{'9':3,'13':9,'14':10,'16':12})

In [12]:
# Run the assessment. The result is indexed by 're_id', 'att_inf' and
# 'information_worth' when it is displayed.
T1_results = T1.assess()

In [13]:
# Lift pandas' row/column display limits for this cell only, then show the
# three result tables in order.
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    for table in ('re_id', 'att_inf', 'information_worth'):
        display(T1_results[table])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,"['age', 'sex', 'race', 'native-country', 'mari...",0.262738,12649,3.1e-05,0.388471,"{'0': 0.0, '1': 0.009182764657105125, '2': 0.0..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,"['age', 'sex', 'race', 'native-country', 'mari...",relationship,0.648014,2.176685,0.405178,0.881945,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
1,"['age', 'sex', 'race', 'native-country', 'mari...",education-num,0.302693,2.008571,0.322502,0.647769,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,Prior Worth,Posterior Worth
0,"['age', 'sex', 'race', 'native-country', 'mari...",education-num,1.480145,2.542275


## [US Census Data (1990) dataset](https://archive.ics.uci.edu/ml/datasets/US+Census+Data+%281990%29)

In [14]:
def load2(source='https://archive.ics.uci.edu/ml/machine-learning-databases/census1990-mld/USCensus1990.data.txt'):
    """Load the selected columns of the UCI US Census Data (1990) dataset.

    Column names are taken from the file's own header row (no ``names`` is
    passed to ``read_csv``), and only the columns listed in ``attributes``
    are kept; every other column, e.g. ``caseid``, is dropped.

    Parameters
    ----------
    source : str, path or file-like, optional
        Anything ``pandas.read_csv`` accepts. Defaults to the UCI URL, so
        calling ``load2()`` with no argument downloads the file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the selected columns in file order.
    """
    attributes = ['dAge', 'dAncstry1', 'dAncstry2', 'iCitizen', 'iClass', 'iDisabl1', 'iDisabl2', 'iEnglish',
                  'iFertil', 'dHour89', 'iImmigr', 'dIncome1', 'dIncome2', 'dIncome3', 'dIncome4', 'dIncome5',
                  'dIncome6', 'dIncome7', 'dIncome8', 'dIndustry', 'iKorean', 'iLang1', 'iMarital', 'iMeans',
                  'dOccup', 'dPOB', 'dPoverty', 'iRagechld', 'dRearning', 'iSchool', 'iSex',
                  'iVietnam', 'iWWII', 'iYearsch']
    return pandas.read_csv(source, usecols=attributes, low_memory=False)

Description of selected attributes.
- 'dAge': Age
- 'dAncstry1': Ancestry 1
- 'dAncstry2': Ancestry 2
- 'iCitizen': Citizenship
- 'iClass': Class of Worker
- 'iDisabl1': Work Limitation Status
- 'iDisabl2': Work Prevented Status
- 'iEnglish': Ability to Speak English
- 'iFertil': Number of Children Ever Born
- 'dHour89': Usual Hours Worked Per Week In 1989
- 'iImmigr': Year of Entry
- 'dIncome1': Wages or Salary Income In 1989
- 'dIncome2': Nonfarm Self Employment Income In 1989
- 'dIncome3': Farm Self Employment Income In 1989
- 'dIncome4': Interest, Dividends, and Net Rental Income In 1989
- 'dIncome5': Social Security Income In 1989
- 'dIncome6': Public Assistance Income In 1989
- 'dIncome7': Retirement Income In 1989
- 'dIncome8': All Other Income In 1989
- 'dIndustry': Industry
- 'iKorean': Served Korean Conflict
- 'iLang1': Language Other Than English At Home
- 'iMarital': Marital Status
- 'iMeans': Means of Transportation to Work
- 'dOccup': Occupation
- 'dPOB': Place of Birth
- 'dPoverty': Poverty Status
- 'iRagechld': Presence and Age of Own Children
- 'dRearning': Total Personal Earnings
- 'iSchool': School Enrollment
- 'iSex': Sex
- 'iVietnam': Served Vietnam Conflict
- 'iWWII': Served World War II Conflict
- 'iYearsch': Educational Attainment

In [15]:
# Fetch the 1990 census data (load2 reads from the UCI URL) and show it.
# The frame has roughly 2.46 million rows (see the index in the output),
# so expect this cell to take a while. pandas elides the middle rows and
# columns when rendering.
df2 = load2()
display(df2)

Unnamed: 0,dAge,dAncstry1,dAncstry2,iCitizen,iClass,iDisabl1,iDisabl2,iEnglish,iFertil,dHour89,...,dOccup,dPOB,dPoverty,iRagechld,dRearning,iSchool,iSex,iVietnam,iWWII,iYearsch
0,5,0,1,0,5,2,2,1,1,4,...,3,0,2,4,3,1,1,0,0,11
1,6,1,1,0,7,2,2,0,3,1,...,2,0,2,4,2,1,1,0,0,5
2,3,1,2,0,7,2,2,0,1,4,...,4,0,2,4,2,1,1,0,0,10
3,4,1,2,0,1,2,2,0,3,3,...,2,0,2,2,2,1,1,0,0,10
4,7,1,1,0,0,2,2,0,3,0,...,0,0,2,4,0,1,1,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2458280,7,1,2,0,0,2,2,0,0,0,...,0,0,2,0,0,1,0,0,1,7
2458281,1,1,2,0,0,0,0,0,0,0,...,0,0,2,4,0,2,1,0,0,4
2458282,3,3,1,0,1,2,2,1,0,3,...,5,0,2,0,4,1,0,0,0,11
2458283,6,0,1,0,1,2,2,0,1,2,...,2,0,2,4,3,1,1,0,0,10


In [16]:
# Wrap the census table in a BVM object and declare the role of each column.
T2 = BVM(df2)
# Twelve columns act as the quasi-identifiers.
T2.qids(['dAge', 'dAncstry1', 'dAncstry2', 'iClass', 'iEnglish', 'dHour89', 'iLang1', 'iMarital', 'iMeans',
         'dOccup', 'dPOB', 'iSex'])
# Citizenship and total personal earnings are the sensitive attributes.
T2.sensitive(['iCitizen', 'dRearning'])

Description of selected sensitive attribute values.
- 'iCitizen': Citizenship
    - 0: Born in the U.S.
    - 1: Born in Puerto Rico, Guam, and Outlying Areas
    - 2: Born Abroad of American Parents
    - 3: U.S. Citizen by Naturalization
    - 4: Not a Citizen of the U.S.
- 'dRearning': Total Personal Earnings
    - 0: If original value equals 0
    - 1: If original value less than 0
    - 2: If original value less than 15000
    - 3: If original value less than 30000
    - 4: If original value less than 60000
    - 5: If original value is anything else

    (Original values ranging from 0 to 284000 for a $284000 Topcode. State Medians Included as 284001.)

In [17]:
# Give a worth to selected values of the two sensitive attributes.
# Keys are the integer category codes written as strings.
# NOTE(review): how bvmlib treats codes without an entry is not visible here -- confirm.
T2.worth('iCitizen',{'0':1,'4':2})
T2.worth('dRearning',{'0':1,'4':5,'5':10})

In [18]:
# Run the assessment. The result is indexed by 're_id', 'att_inf' and
# 'information_worth' when it is displayed.
T2_results = T2.assess()

In [19]:
# Lift pandas' row/column display limits for this cell only, then show the
# three result tables in order.
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    for table in ('re_id', 'att_inf', 'information_worth'):
        display(T2_results[table])

Unnamed: 0,QID,dCR,pCR,Prior,Posterior,Histogram
0,"['dAge', 'dAncstry1', 'dAncstry2', 'iClass', '...",0.06383,252597,4.067877e-07,0.102753,"{'0': 0.560129927978245, '1': 0.11174660383153..."


Unnamed: 0,QID,Sensitive,dCA,pCA,Prior,Posterior,Histogram
0,"['dAge', 'dAncstry1', 'dAncstry2', 'iClass', '...",iCitizen,0.948912,1.079108,0.913132,0.985367,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."
1,"['dAge', 'dAncstry1', 'dAncstry2', 'iClass', '...",dRearning,0.477841,1.76496,0.470504,0.830422,"{'0': 0.0, '1': 0.0, '2': 0.0, '3': 0.0, '4': ..."


Unnamed: 0,QID,Sensitive,Prior Worth,Posterior Worth
0,"['dAge', 'dAncstry1', 'dAncstry2', 'iClass', '...",iCitizen,0.913132,1.001833
1,"['dAge', 'dAncstry1', 'dAncstry2', 'iClass', '...",dRearning,0.500969,1.061239


---