## Load Dataset

In [1]:
import pandas as pd

## example file
# Fetch the gzipped sample prescriptions from the project's GitHub repository.
# `-c` asks wget to resume a partial download and `-O` names the local target.
# NOTE(review): presumably the data/ directory already exists next to this
# notebook; wget -O does not create missing directories. Confirm on a fresh clone.
!wget -c https://github.com/nlp-pucrs/prescription-outliers/raw/master/data/prescriptions_sample.csv.gz -O data/prescriptions_sample.csv.gz
    
# Load the downloaded file; the gzipped CSV is passed straight to read_csv.
prescription = pd.read_csv('data/prescriptions_sample.csv.gz')
# Display the frame's shape together with its column labels.
prescription.shape, prescription.columns

--2019-04-01 15:48:03--  https://github.com/nlp-pucrs/prescription-outliers/raw/master/data/prescriptions_sample.csv.gz
Resolving github.com (github.com)... 192.30.253.113, 192.30.253.112
Connecting to github.com (github.com)|192.30.253.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/nlp-pucrs/prescription-outliers/master/data/prescriptions_sample.csv.gz [following]
--2019-04-01 15:48:03--  https://raw.githubusercontent.com/nlp-pucrs/prescription-outliers/master/data/prescriptions_sample.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.252.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.252.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 95274 (93K) [application/octet-stream]
Saving to: ‘data/prescriptions_sample.csv.gz’


2019-04-01 15:48:04 (2,11 MB/s) - ‘data/prescriptions_sample.csv.gz’ saved [95274/95274]



((150113, 4),
 Index(['medication', 'frequency', 'dose', 'target'], dtype='object'))

## Aggregate Tuples

In [2]:
# Key columns that together identify one prescription tuple.
columns = ['medication', 'frequency', 'dose']

# Count how many prescriptions share each (medication, frequency, dose) tuple.
# groupby(...).size() returns the per-group row count directly, so the raw
# `prescription` frame is left untouched (no helper column of ones needed).
agg_prescription = prescription.groupby(columns).size().reset_index(name='count')

# Preview up to 10 random tuples. The fixed seed keeps the preview stable
# across re-runs, and capping n avoids a ValueError on very small inputs.
agg_prescription.sample(n=min(10, len(agg_prescription)), random_state=0)

Unnamed: 0,medication,frequency,dose,count
5,ALOPURINOL,1.0,600.0,3
389,POLIMIXINA,2.0,1500000.0,247
40,BISACODIL,3.0,10.0,141
224,MORFINA,6.0,5.0,7
152,HIDRALAZINA,3.0,50.0,3309
4,ALOPURINOL,1.0,300.0,1219
304,MORFINA,24.0,67.0,1
179,MORFINA,1.0,14.0,7
33,BISACODIL,1.0,20.0,10
219,MORFINA,4.0,45.0,6


## Build Models

In [3]:
import outliers
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation notices, so comment it out when debugging.
warnings.filterwarnings('ignore')

# Columns kept from each per-medication model result.
columns = ['medication', 'frequency', 'dose', 'count', 'score']
selected_medications = ['ENALAPRIL','PARACETAMOL','VARFARINA']

# Collect one frame per medication and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# growing a frame inside a loop re-copies it on every iteration.
model_frames = []
for m in selected_medications:
    result = outliers.build_model(agg_prescription, m)
    # Grouping by every kept column and resetting the index leaves one row
    # per distinct (medication, frequency, dose, count, score) combination.
    agg = result[columns].groupby(columns).count().reset_index()
    model_frames.append(agg)

# Fall back to an empty, correctly labelled frame when no medication was
# selected, because pd.concat raises on an empty list.
models = pd.concat(model_frames) if model_frames else pd.DataFrame(columns=columns)

Threshold ENALAPRIL : 0.125
Threshold PARACETAMOL : 0.022
Threshold VARFARINA : 0.083


In [8]:
# Per medication, count the non-null entries in each remaining column,
# i.e. how many model rows were kept for that medication.
models.groupby(by='medication').count()

Unnamed: 0_level_0,frequency,dose,count,score
medication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENALAPRIL,8,8,8,8
PARACETAMOL,46,46,46,46
VARFARINA,12,12,12,12


In [4]:
# Preview up to 10 random model rows. The fixed seed keeps the preview
# reproducible, and capping n avoids a ValueError when fewer than 10 rows exist.
models.sample(n=min(10, len(models)), random_state=0)

Unnamed: 0,medication,frequency,dose,count,score
4,ENALAPRIL,2.0,20.0,9183,0.0
5,ENALAPRIL,2.0,20.1,2,3.0
35,PARACETAMOL,6.0,35.0,4,3.0
37,PARACETAMOL,6.0,37.0,9,3.0
6,PARACETAMOL,2.0,40.0,1,3.0
7,ENALAPRIL,3.0,20.0,18,3.0
30,PARACETAMOL,4.0,70.0,264,0.0
2,PARACETAMOL,1.0,45.0,2,3.0
14,PARACETAMOL,3.0,75.0,11,3.0
6,ENALAPRIL,2.0,40.0,58,2.0


## Save Models and Data

In [5]:
# Persist the per-medication models as a gzipped CSV. index=False (the
# documented boolean form) keeps the row index out of the file.
models.to_csv('data/models.csv.gz', compression='gzip', index=False)

In [6]:
# Persist the aggregated prescription tuples as a gzipped CSV. index=False
# (the documented boolean form) keeps the row index out of the file.
agg_prescription.to_csv('data/prescriptions.csv.gz', compression='gzip', index=False)

In [9]:
# Keep only the aggregated tuples whose medication was modelled above and
# write them to data/test.csv.gz. index=False (the documented boolean form)
# keeps the row index out of the file.
is_selected = agg_prescription['medication'].isin(selected_medications)
agg_prescription[is_selected].to_csv('data/test.csv.gz', compression='gzip', index=False)