## Load Dataset

In [1]:
import pandas as pd

## example file
!wget -c https://github.com/nlp-pucrs/prescription-outliers/raw/master/data/prescriptions_sample.csv.gz -O data/prescriptions_sample.csv.gz
    
prescription = pd.read_csv('data/prescriptions_sample.csv.gz')
prescription.shape, prescription.columns

((150113, 4),
 Index(['medication', 'frequency', 'dose', 'target'], dtype='object'))

## Aggregate Tuples

In [2]:
# Count how many prescriptions share each (medication, frequency, dose) tuple.
columns = ['medication', 'frequency', 'dose']

# size() counts the rows of every group directly, so `prescription` is left
# untouched (no helper 'count' column is written into it) and re-running this
# cell always starts from the same input frame.
agg_prescription = (
    prescription
    .groupby(columns)
    .size()
    .reset_index(name='count')
)
agg_prescription.sample(10)

Unnamed: 0,medication,frequency,dose,count
185,MORFINA,1.0,32.0,13
322,PARACETAMOL,4.0,30.0,566
2,ALOPURINOL,1.0,150.0,21
101,DIAZEPAM,6.0,10.0,118
416,SULFAMETOXAZOL,0.5,960.0,4
375,POLIMIXINA,2.0,375000.0,6
256,MORFINA,6.0,57.0,2
306,PARACETAMOL,1.0,40.0,2
436,VARFARINA,0.5,5.0,1
139,ENOXAPARINA,2.0,60.0,6734


## Build Models

In [3]:
import outliers
import warnings
# Suppresses every warning raised from here on (kept from the original cell).
warnings.filterwarnings('ignore')

columns = ['medication', 'frequency', 'dose', 'count', 'score']
selected_medications = ['ENALAPRIL','PARACETAMOL','VARFARINA']

# Collect one frame per medication and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
per_medication = []
for m in selected_medications:
    result = outliers.build_model(agg_prescription, m)
    # Grouping by every selected column and counting leaves only the group
    # keys, i.e. one row per distinct combination of the five columns.
    agg = result[columns].groupby(columns).count().reset_index()
    per_medication.append(agg)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no medication was selected.
models = pd.concat(per_medication) if per_medication else pd.DataFrame(columns=columns)

Threshold ENALAPRIL : 0.125
Threshold PARACETAMOL : 0.022
Threshold VARFARINA : 0.083


In [7]:
# Preview ten randomly drawn rows of the combined model table.
models.sample(n=10)

Unnamed: 0,medication,frequency,dose,count,score
4,PARACETAMOL,1.0,60.0,33.0,3.0
8,PARACETAMOL,3.0,40.0,21.0,3.0
1,PARACETAMOL,1.0,40.0,2.0,3.0
4,VARFARINA,1.0,5.0,3819.0,0.0
22,PARACETAMOL,4.0,40.0,3515.0,0.0
36,PARACETAMOL,6.0,36.0,9.0,3.0
33,PARACETAMOL,6.0,25.0,1.0,3.0
43,PARACETAMOL,6.0,60.0,4.0,3.0
12,PARACETAMOL,3.0,60.0,73.0,2.0
42,PARACETAMOL,6.0,55.0,1.0,3.0


## Save Model

In [8]:
# Write the combined models as a gzip-compressed CSV. index=False is the
# documented way to leave out the row index (the per-medication index values
# repeat and carry no meaning).
models.to_csv('data/models.csv.gz', compression='gzip', index=False)