In [1]:
import hashlib
import pandas as pd

## Preprocessing

### Load

In [2]:
# Load the raw patient table from the Stata file kept under the
# `.sensible` data directory.
patients = pd.read_stata(
    filepath_or_buffer='../data/.sensible/pacientes.dta',
)

### Mask real IDs and patient data

In [3]:
def get_new_patient_id(old_id):
    """Return a SHA-1 hex digest derived from ``old_id``.

    The value is formatted as text, encoded as UTF-8 and hashed.

    Parameters
    ----------
    old_id : object
        Any value with a string representation.

    Returns
    -------
    str
        The 40-character hexadecimal SHA-1 digest.
    """
    encoded = '{}'.format(old_id).encode('utf8')
    return hashlib.sha1(encoded).hexdigest()

def shuffle_columns(col):
    """Return the values of ``col`` in random order on a fresh 0..n-1 index.

    Intended to be passed to ``DataFrame.apply`` so that each column is
    permuted by its own call.

    Parameters
    ----------
    col : pandas.Series
        The column to permute.

    Returns
    -------
    pandas.Series
        Same values as ``col`` in random order; the original index is
        discarded and replaced by a default integer index.
    """
    # Sample the Series directly instead of going through `Series.transform`:
    # transform requires the result to stay on the caller's index, which a
    # permutation does not, so recent pandas raises
    # "Function did not transform".
    # No `random_state` is passed, so the order differs on every call.
    return col.sample(frac=1).reset_index(drop=True)

def mask_patient_data(df):
    """Replace the patient ids and shuffle every column of ``df``.

    ``PacienteNro`` is overwritten with the SHA-1 digest of each row's
    position (0, 1, 2, ...), not of the original id. Afterwards
    ``shuffle_columns`` is applied to every column, the new id column
    included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``PacienteNro`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    row_count = len(df.PacienteNro)
    surrogate_ids = [get_new_patient_id(position) for position in range(row_count)]
    with_new_ids = df.assign(PacienteNro=surrogate_ids)
    return with_new_ids.apply(shuffle_columns)

In [4]:
# Rebind `patients` to the masked frame so the notebook no longer holds a
# reference to the unmasked data under this name. Re-running this cell
# masks the already-masked frame again.
patients = mask_patient_data(patients)

## New datasets

In [5]:
def set_new_col_names(df, col_names):
    """Rename the columns of ``df`` by position.

    The i-th existing column receives the i-th entry of ``col_names``.
    Pairing stops at the shorter of the two sequences, so surplus columns
    keep their current names.

    Parameters
    ----------
    df : pandas.DataFrame
    col_names : iterable of str
        New names, in column order.

    Returns
    -------
    pandas.DataFrame
        A renamed copy; ``df`` is left untouched.
    """
    mapping = dict(zip(df.columns, col_names))
    return df.rename(columns=mapping)

In [6]:
def subset_and_rename(df, targets, col_names):
    """Keep the ``targets`` columns of ``df`` and give them new names.

    ``targets`` and ``col_names`` are paired by position: ``targets[i]`` is
    renamed to ``col_names[i]``. The result keeps the order of ``targets``.

    Parameters
    ----------
    df : pandas.DataFrame
    targets : list of str
        Column labels to keep.
    col_names : list of str
        New name for each entry of ``targets``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected, renamed columns. Labels in
        ``targets`` that are absent from ``df`` are skipped, because
        ``DataFrame.filter`` does not raise on them.
    """
    # Build the mapping from the requested labels, not from the columns that
    # survive the filter. Renaming by position after the filter would shift
    # every later name onto the wrong column if a target were missing.
    replacements = dict(zip(targets, col_names))
    return (df.filter(items=targets)
              .rename(columns=replacements))

### Define targets

In [7]:
# Source columns that make up the "farmacia" dataset.
# NOTE(review): 'UsoHIP0T_Inicio' is spelt with the digit 0 while
# 'UsoHIPOLIPEM_Inicio' uses the letter O. Confirm it matches the label in
# the source file; `DataFrame.filter(items=...)` silently skips labels that
# do not exist.
cols_farmacia = [
    'PacienteNro',
    'UsoHIPOLIPEM_Inicio',
    'UsoANTIAGREG_Inicio',
    'UsoANTIHTA_Inicio',
    'UsoLi_Inicio',
    'UsoBZD_Inicio',
    'UsoAE_Inicio',
    'UsoHIP0T_Inicio',
    'UsoInhCol_Inicio',
    'UsoMEM_Inicio',
    'UsoANTIDBT_Inicio',
    'UsoCORTIC_Inicio',
    'UsoATD_Inicio',
    'UsoTRICICL_Inicio',
]

In [8]:
# Source columns that make up the "historial" dataset.
# NOTE(review): 'Cncer_Inicio' looks like it may have lost a character.
# Confirm it matches the label in the source file;
# `DataFrame.filter(items=...)` silently skips labels that do not exist.
cols_historial = [
    'PacienteNro',
    'IAM_Previo',
    'ACV_Previo',
    'TAS_Inicio',
    'TAD_Inicio',
    'EAP_Inicio',
    'DBT_Inicio',
    'TEP_Inicio',
    'EPOC_Inicio',
    'FA_Inicio',
    'OH_Inicio',
    'ICC_Inicio',
    'Cncer_Inicio',
    'HIV_Inicio',
    'IRC_Inicio',
    'AI_Inicio',
]

In [9]:
# Source columns that make up the "internacion" dataset.
cols_internacion = [
    'PacienteNro',
    'Edad_Ingreso',
    'Sexo',
    'PesoKg',
    'Alturacms',
    'TBQ',
    'LDL_Inicio',
    'HDL_Inicio',
    'TAG_Inicio',
    'GlucAy_Inicio',
    'muerte',
]

### Subset, tidy and save farmacia

In [10]:
# Output names for the farmacia columns, listed in the same order as
# `cols_farmacia` (the two lists are paired by position).
new_col_names = [
    'paciente_id',
    'hipolip',
    'antiag',
    'antihta',
    'li',
    'benzo',
    'anticom',
    'hipoten',
    'inhcol',
    'memantina',
    'antidbt',
    'corticoide',
    'antidep',
    'triciclicos',
]

In [11]:
# Select the farmacia columns from the masked frame and rename them.
farmacia = subset_and_rename(patients, cols_farmacia, new_col_names)

In [12]:
# Write the farmacia table as CSV without the index column.
farmacia.to_csv(
    path_or_buf='../data/original/hi_farmacia.csv',
    index=False,
)

### Subset, tidy and save historial

In [13]:
# Output names for the historial columns, listed in the same order as
# `cols_historial` (the two lists are paired by position).
new_col_names = [
    'paciente_id',
    'iam',
    'acv',
    'tas',
    'tad',
    'eap',
    'dbt',
    'tep',
    'epoc',
    'fa',
    'oh',
    'icc',
    'cancer',
    'hiv',
    'irc',
    'ai',
]

In [14]:
# Select the historial columns from the masked frame and rename them.
historial = subset_and_rename(patients, cols_historial, new_col_names)

In [15]:
# Write the historial table as CSV without the index column.
historial.to_csv(
    path_or_buf='../data/original/hi_historial.csv',
    index=False,
)

### Subset, tidy and save internacion

In [16]:
# Output names for the internacion columns, listed in the same order as
# `cols_internacion` (the two lists are paired by position).
new_col_names = [
    'paciente_id',
    'edad',
    'female',
    'peso',
    'altura',
    'tbq',
    'ldl',
    'hdl',
    'tag',
    'glu',
    'muerte',
]

In [17]:
# Select the internacion columns from the masked frame and rename them.
internacion = subset_and_rename(patients, cols_internacion, new_col_names)

In [18]:
# Write the internacion table as CSV without the index column.
internacion.to_csv(
    path_or_buf='../data/original/hi_internacion.csv',
    index=False,
)