In [194]:
import pandas as pd
from sas7bdat import SAS7BDAT

In [195]:
def regio(x):
    """Map a country code to 'North America' or 'Rest of World'.

    Only the codes CAN and USA count as North America. Each code is accepted
    both as text and as bytes, because the frame loaded with ``pd.read_sas``
    in this notebook holds bytes values (e.g. ``b'CAN'``).

    Anything else, including missing values, maps to 'Rest of World'.
    """
    north_america_codes = ("CAN", b"CAN", "USA", b"USA")
    if x in north_america_codes:
        return 'North America'
    return 'Rest of World'

def freqs(df, cols, new_col):
    """Count the rows of ``df`` for each distinct combination of ``cols``.

    Parameters
    ----------
    df : DataFrame to count.
    cols : column name or list of column names to group by.
    new_col : name given to the column holding the group sizes.

    Returns
    -------
    DataFrame with one row per group: the grouping columns followed by
    ``new_col``.
    """
    group_sizes = df.groupby(cols).size()
    # Naming the value column while resetting the index avoids the
    # intermediate column called 0.
    return group_sizes.reset_index(name=new_col)

def add_freqs(ds, cols, new_col):
    """Return ``ds`` with an extra column holding each row's group size.

    The group sizes come from ``freqs(ds, cols, new_col)`` and are attached
    with a left join, so every row of ``ds`` is kept.
    """
    counts = freqs(ds, cols, new_col)
    # No explicit ``on``: pandas joins on every column name the two frames
    # have in common.
    return ds.merge(counts, how='left')

In [196]:
# Read dm.sas7bdat with the sas7bdat package's reader.
with SAS7BDAT('dm.sas7bdat') as f:
    dm_raw = f.to_data_frame()

# Keep only the columns used below. The explicit copy makes ``dm`` an
# independent frame, so adding REGION cannot trigger SettingWithCopyWarning,
# and ``dm_raw`` stays available if this cell is re-run.
dm = dm_raw[['SUBJID','SEX','AGE','RACE','COUNTRY','ARMCD']].copy()

# Derive the region from the country code.
dm['REGION'] = dm['COUNTRY'].map(regio)
dm.shape

[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192
[dm.sas7bdat] header length 65536 != 8192


(889, 7)

In [417]:
# Same file read with pandas' own SAS reader. The text columns come back as
# bytes here (see the b'...' values in the output below), hence the bytes
# literal in the ARMCD filter.
dmx_raw = pd.read_sas('dm.sas7bdat')

# Select the columns and filter the rows first, then take an explicit copy so
# that adding REGION writes to an independent frame instead of a slice
# (avoids SettingWithCopyWarning).
dmx = dmx_raw[['SUBJID','SEX','AGE','RACE','COUNTRY','ARMCD']]
dmx = dmx[dmx['ARMCD'] == b'0'].copy()
dmx['REGION'] = dmx['COUNTRY'].map(regio)
dmx.shape

(449, 7)

In [418]:
# Preview only the first rows instead of rendering the whole frame.
dmx.head(10)

Unnamed: 0,SUBJID,SEX,AGE,RACE,COUNTRY,ARMCD,REGION
0,b'101-003',b'M',51.0,b'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',b'CAN',b'0',North America
5,b'106-006',b'F',56.0,b'WHITE',b'CAN',b'0',North America
7,b'106-008',b'M',53.0,b'WHITE',b'CAN',b'0',North America
8,b'106-010',b'F',58.0,b'WHITE',b'CAN',b'0',North America
10,b'109-003',b'M',40.0,b'WHITE',b'CAN',b'0',North America
12,b'109-006',b'F',41.0,b'WHITE',b'CAN',b'0',North America
13,b'109-008',b'F',54.0,b'WHITE',b'CAN',b'0',North America
16,b'109-012',b'F',48.0,b'WHITE',b'CAN',b'0',North America
18,b'109-014',b'F',53.0,b'WHITE',b'CAN',b'0',North America
20,b'110-002',b'F',39.0,b'WHITE',b'CAN',b'0',North America


In [240]:
# Rows of ``dm`` whose ARMCD equals '0'; this is the dataset being assessed.
dm1 = dm.loc[dm['ARMCD'] == '0']

# The reference population is the full ``dm`` frame. This is an alias, not a
# copy, so any later change to ``dm`` is also visible through ``ref``.
ref = dm

In [287]:
def risk_stats(ds, dsref, cols):
    """Summarise the per-record risk of ``ds`` relative to ``dsref``.

    A record's risk is ``1 / tot``, where ``tot`` is the number of rows in
    ``dsref`` that share the record's values in ``cols``.

    Parameters
    ----------
    ds : DataFrame whose records are assessed.
    dsref : DataFrame used to count how many rows share each combination.
    cols : column name, or list of column names, defining the combination.

    Returns
    -------
    dict
        A single-entry dict. The key is the sorted column names joined by
        spaces. The value is a dict with:

        * 'Average Risk'      - mean risk, rounded to 5 decimals
        * 'Maximum Risk'      - largest risk, rounded to 5 decimals
        * 'Unique Records'    - number of records whose risk is exactly 1
        * 'Proportion Unique' - unique records / len(ds), rounded to 6
          decimals (0.0 when ``ds`` is empty)

    Records of ``ds`` whose combination does not occur in ``dsref`` get a
    missing risk and are ignored by the mean and the maximum.
    """
    # A bare string would be iterated character by character when the label
    # is built ('AGE' -> 'A E G'), so normalise to a list first.
    key_cols = [cols] if isinstance(cols, str) else list(cols)

    class_sizes = dsref.groupby(key_cols).size().reset_index(name='tot')

    # Join on the key columns only. Restricting ``ds`` to those columns keeps
    # an unrelated 'tot' column in ``ds`` from becoming a join key or
    # clashing with the counts.
    matched = ds[key_cols].merge(class_sizes, on=key_cols, how='left')
    risk = 1 / matched['tot']

    n_records = len(ds)
    n_unique = int((risk == 1).sum())
    # Guard the division so an empty ``ds`` does not raise ZeroDivisionError.
    proportion_unique = round(n_unique / n_records, 6) if n_records else 0.0

    stats = {
        'Average Risk': round(risk.mean(), 5),
        'Maximum Risk': round(risk.max(), 5),
        'Unique Records': n_unique,
        'Proportion Unique': proportion_unique,
    }
    return {' '.join(sorted(key_cols)): stats}

In [312]:
# Accumulator for risk_stats results. It is a dict (not a list) because the
# results are merged into it with ``risks.update(...)``.
risks = {}

In [313]:
# Risk of the dm1 records against the reference frame, using AGE alone.
risk_stats(ds=dm1, dsref=ref, cols=['AGE'])

{'AGE': {'Average Risk': 0.04624,
  'Maximum Risk': 1.0,
  'Proportion Unique': 0.006682,
  'Unique Records': 3}}

In [314]:
# Same assessment with AGE, SEX and REGION combined.
risk_stats(ds=dm1, dsref=ref, cols=['AGE', 'SEX', 'REGION'])

{'AGE REGION SEX': {'Average Risk': 0.14415,
  'Maximum Risk': 1.0,
  'Proportion Unique': 0.033408,
  'Unique Records': 15}}

In [318]:
# Collect the results for several column sets in one dict. Each risk_stats
# call returns a single-entry dict, so ``update`` adds one key per set.
risks = {}
for key_cols in (['AGE'], ['AGE', 'SEX', 'REGION']):
    risks.update(risk_stats(dm1, ref, key_cols))
risks

{'AGE': {'Average Risk': 0.04624,
  'Maximum Risk': 1.0,
  'Proportion Unique': 0.006682,
  'Unique Records': 3},
 'AGE REGION SEX': {'Average Risk': 0.14415,
  'Maximum Risk': 1.0,
  'Proportion Unique': 0.033408,
  'Unique Records': 15}}

In [411]:
def variants(dict):
    """Return every non-empty subset of the items in ``dict``, as lists.

    The subsets follow a binary counter running from 1 to 2**n - 1, where n
    is the number of items. The first item corresponds to the most
    significant bit, so the first subset holds only the last item and the
    final subset holds all of them. Item order inside a subset follows the
    input.

    The parameter name shadows the builtin ``dict``; it is unchanged so that
    existing keyword calls keep working.
    """
    n = len(dict)
    return [
        # Bit (n - 1 - pos) of the mask decides whether the item at ``pos``
        # belongs to this subset.
        [item for pos, item in enumerate(dict) if (mask >> (n - 1 - pos)) & 1]
        for mask in range(1, 2 ** n)
    ]

In [413]:
def test_all(ds, ref, vars):
    """Run ``risk_stats`` for every non-empty subset of ``vars``.

    Parameters
    ----------
    ds : DataFrame whose records are assessed.
    ref : DataFrame used as the reference for the counts.
    vars : column names; every subset from ``variants(vars)`` is evaluated.

    Returns
    -------
    dict mapping each subset's label (as produced by ``risk_stats``) to its
    statistics.
    """
    combined = {}
    for subset in variants(vars):
        for label, stats in risk_stats(ds, ref, subset).items():
            combined[label] = stats
    return combined

In [414]:
# Risk statistics for every non-empty combination of AGE, SEX and REGION.
test_all(ds=dm1, ref=ref, vars=['AGE', 'SEX', 'REGION'])

{'AGE': {'Average Risk': 0.04624,
  'Maximum Risk': 1.0,
  'Proportion Unique': 0.006682,
  'Unique Records': 3},
 'AGE REGION': {'Average Risk': 0.08032,
  'Maximum Risk': 1.0,
  'Proportion Unique': 0.01559,
  'Unique Records': 7},
 'AGE REGION SEX': {'Average Risk': 0.14415,
  'Maximum Risk': 1.0,
  'Proportion Unique': 0.033408,
  'Unique Records': 15},
 'AGE SEX': {'Average Risk': 0.08421,
  'Maximum Risk': 1.0,
  'Proportion Unique': 0.008909,
  'Unique Records': 4},
 'REGION': {'Average Risk': 0.00227,
  'Maximum Risk': 0.00592,
  'Proportion Unique': 0.0,
  'Unique Records': 0},
 'REGION SEX': {'Average Risk': 0.00466,
  'Maximum Risk': 0.01923,
  'Proportion Unique': 0.0,
  'Unique Records': 0},
 'SEX': {'Average Risk': 0.00224,
  'Maximum Risk': 0.00295,
  'Proportion Unique': 0.0,
  'Unique Records': 0}}

In [416]:
# Keys of ``risks`` are the column names sorted alphabetically and joined by
# spaces, which is why the key is 'AGE REGION SEX' and not 'AGE SEX REGION'.
risks['AGE REGION SEX']['Average Risk']

0.14415