In [1]:
%matplotlib inline
import sys
import outliers
import pandas as pd
import matplotlib.pyplot as plt
from outliers import ddc_outlier
from sklearn.preprocessing import normalize
from sklearn.metrics import f1_score
import numpy as np

import multiprocessing
import tempfile
import os
from joblib import Parallel, delayed
from joblib.pool import has_shareable_memory
cpuN = multiprocessing.cpu_count()

import warnings
warnings.filterwarnings('ignore')

prescription = pd.read_csv('data/prescriptions2017_clean.csv.gz', compression='gzip')

In [2]:
medications = outliers.getOverdoseMedications(prescription)
medications.shape

(73,)

In [3]:
def computeClf(X,Y,epsilon,scores):
    clf = ddc_outlier(alpha=epsilon,metric='jaccard')
    clf.fit(X)
    y_pred = clf.predict(X)

    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1

    f = f1_score(y_pred, Y)
    scores[int(epsilon*100)] = f

In [88]:
regression_data = pd.DataFrame()

folder = tempfile.mkdtemp()
score_name = os.path.join(folder, 'score')

for med in np.asarray(medications):

    X, Y = outliers.getPrescriptions(prescription, med)
    X = normalize(X)

    if len(X) < 1000: 
        continue
    
    scores = np.memmap(score_name, dtype=float, shape=(150), mode='w+')
    scores_df = pd.DataFrame()
        
    ep_range = np.arange(0.01,1.0,0.01)
    f_scores = []
    
    Parallel(n_jobs=cpuN)(delayed(computeClf)(X,Y,epsilon,scores)
                   for epsilon in ep_range)
    
    for ep in ep_range:
        idx = int(ep*100)
        scores_df.loc[med,ep] = scores[idx]
    
    data_stats = []

    data_stats.append( len(X) )
    data_stats.append( len(np.unique(X,axis=0)) )
    data_stats.append( np.mean(X) )
    data_stats.append( np.std(X) )
    data_stats.append( np.median(X) )
    data_stats.append( np.percentile(X,25) )
    data_stats.append( np.percentile(X,50) )
    data_stats.append( np.percentile(X,75) )

    data_stats.append( np.mean(X[:,0]) )
    data_stats.append( np.std(X[:,0]) )
    data_stats.append( np.median(X[:,0]) )
    data_stats.append( np.percentile(X[:,0],25) )
    data_stats.append( np.percentile(X[:,0],50) )
    data_stats.append( np.percentile(X[:,0],75) )

    data_stats.append( np.mean(X[:,1]) )
    data_stats.append( np.std(X[:,1]) )
    data_stats.append( np.median(X[:,1]) )
    data_stats.append( np.percentile(X[:,1],25) )
    data_stats.append( np.percentile(X[:,1],50) )
    data_stats.append( np.percentile(X[:,1],75) )
    
    epsilon = scores_df.idxmax(1).values[0]
    f1 = scores_df.max(1).values[0]
    
    
    print(med,len(X),epsilon, f1)
    
    data_stats.append( epsilon )
    
    if f1 > .5:
        reshape_data = pd.DataFrame( np.reshape(np.asarray(data_stats), (1,len(data_stats))  ) )
        reshape_data['med'] = med
        regression_data = regression_data.append( reshape_data )
    
regression_data.shape

ACICLOVIR 200 mg CP 2359 0.01 0.0
ALBENDAZOL 400 mg CP 2660 0.01 0.0
ALOPURINOL 100 mg CP 4729 0.01 0.0
AMPICILINA + SULBACTAM 2 g + 1 g SOL INJ 7226 0.18 0.761768901569
ANLODIPINO 10 mg CP 15584 0.08 0.971014492754
ANLODIPINO 5 mg CP 10392 0.02 0.684931506849
BISACODIL 5 mg CP 9483 0.01 0.258064516129
CARBAMAZEPINA 20 mg/ml SUSP ORAL 1511 0.45 0.387096774194
CEFEPIMA 2 g SOL INJ - SEM DILUENTE 1460 0.05 0.647058823529
CEFTRIAXONA 1g SOL INJ IV - SEM DILUENTE 1339 0.06 0.622222222222
CETOPROFENO 100 mg SOL INJ IV 10907 0.19 0.12893982808
CLORETO DE POTASSIO 6 % (0,8 mEq/ml) SOL AQ C/ADOCANTE ARTIFICIAL - FR. 100 A 200 ml 5593 0.28 0.604215456674
DEXAMETASONA 10 mg/2,5 ml SOL INJ FRASCO-AMPOLA 3286 0.03 0.5
DIAZEPAM 10 mg CP 9030 0.02 0.313253012048
DICLOFENACO 50 mg CP 5559 0.02 0.903225806452
DIPIRONA 1 g/2 ml SOL INJ 163027 0.06 0.57730032695
DIPIRONA 500 mg/ml SOL ORAL - com conta-gotas acoplado na tampa do frasco ou na embalagem do medicamento 28967 0.39 0.556057494867
DOXAZOSINA 2

(23, 22)

In [89]:
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV, cross_val_score

target_id = 20
data = regression_data.drop([target_id,'med'],1).values
target = regression_data[target_id].values

C_range = [2 ** i for i in range(-7, 7, 1)]
parameters = {'kernel':['rbf'], 'C':C_range}
grid_search = GridSearchCV(SVR(), parameters, cv=5, n_jobs=3, verbose=1, scoring='neg_mean_absolute_error')
grid_search.fit(data, target)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
SVR(C=0.0078125, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001,
  verbose=False)
-0.0839501811594
{'C': 0.0078125, 'kernel': 'rbf'}


[Parallel(n_jobs=3)]: Done  70 out of  70 | elapsed:    0.1s finished


In [124]:
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression, Lasso, BayesianRidge, LassoLars
from sklearn.neural_network import MLPRegressor

MLPR = MLPRegressor(activation='logistic', hidden_layer_sizes=(100) )

scores = cross_val_score(LinearSVR(C=0.9), data, target, scoring='neg_mean_absolute_error')
print(scores.mean())
scores = cross_val_score(Ridge(), data, target, scoring='neg_mean_absolute_error')
print(scores.mean())
scores = cross_val_score(LinearRegression(), data, target, scoring='neg_mean_absolute_error')
print(scores.mean())
scores = cross_val_score(Lasso(), data, target, scoring='neg_mean_absolute_error')
print(scores.mean())
scores = cross_val_score(BayesianRidge(), data, target, scoring='neg_mean_absolute_error')
print(scores.mean())
scores = cross_val_score(LassoLars(), data, target, scoring='neg_mean_absolute_error')
print(scores.mean())
scores = cross_val_score(MLPR, data, target, scoring='neg_mean_absolute_error')
print(scores.mean())

-0.435634446751
-0.0966584181345
-49.3611575882
-0.0883192187315
-0.0851659312411
-0.071496031746
-0.0808009568304


In [93]:
parameters = {'alpha':np.arange(0.1,2,0.01)}
grid_search = GridSearchCV(LassoLars(), parameters, cv=5, n_jobs=3, verbose=1, scoring='neg_mean_absolute_error')
grid_search.fit(data, target)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 5 folds for each of 190 candidates, totalling 950 fits
LassoLars(alpha=0.10000000000000001, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)
-0.0720086448004
{'alpha': 0.10000000000000001}


[Parallel(n_jobs=3)]: Done 950 out of 950 | elapsed:    0.4s finished


In [107]:
target

array([ 0.18,  0.08,  0.02,  0.05,  0.06,  0.28,  0.02,  0.06,  0.39,
        0.03,  0.09,  0.04,  0.1 ,  0.18,  0.06,  0.02,  0.23,  0.05,
        0.02,  0.03,  0.02,  0.14,  0.09])

In [106]:
regr = LassoLars(alpha=0.1)
regr.fit(data, target)
print(regr.coef_)
print(regr.intercept_)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
0.0973913043478


In [95]:
parameters = {'alpha':np.arange(0.01,3,0.01)}
grid_search = GridSearchCV(LassoLars(), parameters, cv=5, n_jobs=3, verbose=1, scoring='neg_mean_absolute_error')
#data_less = regression_data.drop([0,1,2,3,5,7,8,11,12,13,14,15],1).values
## reduce dimensionality
data_less = regression_data[[2]].values
target = regression_data[20].values
grid_search.fit(data_less, target)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 5 folds for each of 299 candidates, totalling 1495 fits
LassoLars(alpha=0.01, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)
-0.0720086448004
{'alpha': 0.01}


[Parallel(n_jobs=3)]: Done 1495 out of 1495 | elapsed:    0.5s finished


In [128]:
regr = LassoLars(alpha=0.1)
regr.fit(data, target)
print(regr.coef_)
print(regr.intercept_)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
0.0973913043478


In [98]:
regr = LassoLars(alpha=0.1)
regr.fit(data_less, target)
print(regr.coef_)
print('---')
print(regr.intercept_)

[ 0.]
---
0.0973913043478


In [125]:
MLPR.fit(data, target)
predicted_ep = MLPR.predict(data)

In [130]:
predicted_ep = regr.predict(data)

In [131]:
idx = 0
best_scores = []
predicted_scores = []
for med in np.asarray(regression_data['med'].values):

    X, Y = outliers.getPrescriptions(prescription, med)
    X_norm = normalize(X)
    
    clf = ddc_outlier(alpha=target[idx],metric='jaccard')
    clf.fit(X_norm)
    y_pred = clf.predict(X_norm)
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    f = f1_score(y_pred, Y)
    best_scores.append(f)
    
    clf = ddc_outlier(alpha=predicted_ep[idx],metric='jaccard')
    clf.fit(X_norm)
    y_pred = clf.predict(X_norm)
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    f = f1_score(y_pred, Y)
    predicted_scores.append(f)
    
    print(med + ': Best:', 
          target[idx], 
          best_scores[idx], 
          ', Predicted: ',
          predicted_ep[idx],
          predicted_scores[idx])
    
    idx += 1
    
    #break

AMPICILINA + SULBACTAM 2 g + 1 g SOL INJ: Best: 0.18 0.761768901569 , Predicted:  0.0973913043478 0.0
ANLODIPINO 10 mg CP: Best: 0.08 0.971014492754 , Predicted:  0.0973913043478 0.971014492754
ANLODIPINO 5 mg CP: Best: 0.02 0.684931506849 , Predicted:  0.0973913043478 0.684931506849
CEFEPIMA 2 g SOL INJ - SEM DILUENTE: Best: 0.05 0.647058823529 , Predicted:  0.0973913043478 0.647058823529
CEFTRIAXONA 1g SOL INJ IV - SEM DILUENTE: Best: 0.06 0.0 , Predicted:  0.0973913043478 0.622222222222
CLORETO DE POTASSIO 6 % (0,8 mEq/ml) SOL AQ C/ADOCANTE ARTIFICIAL - FR. 100 A 200 ml: Best: 0.28 0.604215456674 , Predicted:  0.0973913043478 0.20202020202
DICLOFENACO 50 mg CP: Best: 0.02 0.903225806452 , Predicted:  0.0973913043478 0.193103448276
DIPIRONA 1 g/2 ml SOL INJ: Best: 0.06 0.57730032695 , Predicted:  0.0973913043478 0.40873015873
DIPIRONA 500 mg/ml SOL ORAL - com conta-gotas acoplado na tampa do frasco ou na embalagem do medicamento: Best: 0.39 0.556057494867 , Predicted:  0.097391304347

In [132]:
#Original
#0.680534078037

#MLP
#0.493819906432

#Lars
#0.4531622325

print(np.mean(best_scores))
print(np.mean(predicted_scores))

0.680534078037
0.4531622325
