In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import sys
# Helper module (presumably project-local) that supplies
# getOverdoseMedications() and getPrescriptions() used by the cells below.
import outliers
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import normalize
from sklearn.metrics import f1_score
import numpy as np

import warnings
# Suppress every warning for the rest of the session. This also hides library
# deprecation notices, so re-enable warnings when upgrading dependencies.
warnings.filterwarnings('ignore')

In [2]:
# Fetch the collection of medication names to analyse; later cells iterate it
# via np.asarray(medications) and pass each entry to outliers.getPrescriptions().
medications = outliers.getOverdoseMedications()
# Show the shape of the collection as the cell output.
medications.shape

(118,)

In [11]:
# Build one row per medication for the contamination regressor:
#   columns 0-4: summary statistics of the L2-normalised prescription matrix
#   column 5:    the contamination value whose IsolationForest predictions
#                reached the highest F1 score against the labels Y
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; DataFrame.append copied the whole frame on every call and was removed
# in pandas 2.0. The resulting frame has a 0..n-1 row index.
stat_rows = []

# Candidate contamination values: 0.01, 0.04, ... (step 0.03, below 1.0).
# NOTE(review): recent scikit-learn releases document `contamination` as
# limited to (0, 0.5]; confirm the upper part of this grid is accepted by the
# installed version before re-running.
ep_range = np.arange(0.01, 1.0, 0.03)

for med in np.asarray(medications):

    X, Y = outliers.getPrescriptions(med)
    print(med + ': ',len(X))

    # Medications with fewer than 100 prescriptions are skipped; checking
    # before normalising avoids processing data that is discarded anyway.
    if len(X) < 100:
        continue

    X_norm = normalize(X, norm='l2')

    # F1 score obtained for each candidate contamination value.
    # NOTE(review): IsolationForest is created without random_state, so the
    # scores can differ between runs.
    f_scores = {}
    for epsilon in ep_range:
        clf = IsolationForest(contamination=epsilon)
        clf.fit(X_norm)

        # predict() labels outliers -1 and inliers 1; recode to 1 / 0 so the
        # outlier class is the positive class for f1_score.
        y_pred = (clf.predict(X_norm) == -1).astype(int)

        # Arguments in scikit-learn's (y_true, y_pred) order; binary F1 is
        # unchanged by the order, this is only for readability.
        f_scores[epsilon] = f1_score(Y, y_pred)

    # Contamination value with the highest F1 score for this medication.
    best_epsilon = max(f_scores, key=f_scores.get)
    print(best_epsilon, f_scores[best_epsilon])

    stat_rows.append([
        np.mean(X_norm),            # 0
        np.std(X_norm),             # 1
        np.median(X_norm),          # 2
        np.percentile(X_norm, 50),  # 3 (same quantity as the median in column 2)
        np.percentile(X_norm, 70),  # 4
        best_epsilon,               # 5: regression target
    ])

regression_data = pd.DataFrame(stat_rows)

regression_data.shape

ABACAVIR 300 mg CP:  861
0.01 0.965517241379
ACETAZOLAMIDA 250 mg CP:  120
0.13 0.342857142857
ACICLOVIR 200 mg CP:  1470
0.01 0.666666666667
ACIDO FOLICO 5 mg CP:  1937
0.31 0.00720164609053
AMICACINA 500 mg/2 ml SOL INJ:  492
0.19 0.536231884058
AMIODARONA 200 mg CP:  448
0.01 0.833333333333
AMOXICILINA + CLAVULANATO 1 g + 200 mg SOL INJ:  2403
0.07 0.263959390863
AMOXICILINA + CLAVULANATO 500 mg + 125 mg CP:  963
0.04 0.5
AMOXICILINA 500 mg CP:  103
0.31 0.842105263158
AMPICILINA + SULBACTAM 2 g + 1 g SOL INJ:  1802
0.04 0.542168674699
AMPICILINA 500 mg CP:  90
ANLODIPINO 10 mg CP:  4775
0.01 0.820895522388
ANLODIPINO 5 mg CP:  2874
0.01 0.535714285714
ATORVASTATINA 80 mg CP:  3474
0.01 0.288770053476
AZITROMICINA 500 mg CP:  416
0.31 0.00479616306954
AZITROMICINA 500 mg SOL INJ:  726
0.13 0.0388349514563
BICARBONATO DE SODIO 10 g PO:  26
BISACODIL 5 mg CP:  3145
0.61 0.000635727908455
CARVEDILOL 6,25 mg CP:  463
0.01 0.923076923077
CEFUROXIMA 250 mg CP:  167
0.1 0.444444444444
CETO

(94, 6)

In [40]:
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV, cross_val_score

# Split the per-medication table into features (every column except 5) and
# the regression target (column 5). The axis is passed by keyword because
# pandas 2.0 no longer accepts it as a positional argument.
data = regression_data.drop(5, axis=1).values
target = regression_data[5].values

# Search linear and RBF kernels over C = 2**-7 ... 2**6, scored by negated
# mean absolute error with 5-fold cross-validation.
C_range = [2 ** i for i in range(-7, 7, 1)]
parameters = {'kernel':['linear','rbf'], 'C':C_range}
grid_search = GridSearchCV(SVR(), parameters, cv=5, n_jobs=3, verbose=1, scoring='neg_mean_absolute_error')
grid_search.fit(data, target)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
SVR(C=2, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
-0.147906998284
{'C': 2, 'kernel': 'linear'}


[Parallel(n_jobs=3)]: Done 140 out of 140 | elapsed:    0.1s finished


In [41]:
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression, Lasso, BayesianRidge, LassoLars

# Compare several regressors with their default settings: for each one, print
# the mean cross-validated score (negated mean absolute error) on the full
# feature matrix. Models are evaluated in the order listed.
for regressor_cls in (LinearSVR, Ridge, LinearRegression, Lasso, BayesianRidge, LassoLars):
    scores = cross_val_score(regressor_cls(), data, target, scoring='neg_mean_absolute_error')
    print(scores.mean())

-0.142918522055
-0.165534138574
-0.169595810953
-0.16639608935
-0.166385956913
-0.16639608935


In [42]:
# Restrict the features to column 0 of regression_data, kept 2-D for scikit-learn.
data_less = regression_data[[0]].values

# Tune LinearSVR's C over 0.01, 0.02, ... (step 0.01, below 2) with 5-fold
# cross-validation, scored by negated mean absolute error.
parameters = {'C': np.arange(0.01, 2, 0.01)}
grid_search = GridSearchCV(
    LinearSVR(),
    parameters,
    cv=5,
    n_jobs=3,
    verbose=1,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(data_less, target)

# Report the winning estimator, its score and its parameters, in that order.
for fitted_attr in ('best_estimator_', 'best_score_', 'best_params_'):
    print(getattr(grid_search, fitted_attr))

Fitting 5 folds for each of 199 candidates, totalling 995 fits
LinearSVR(C=0.14000000000000001, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)
-0.139533192613
{'C': 0.14000000000000001}


[Parallel(n_jobs=3)]: Done 995 out of 995 | elapsed:    0.5s finished


In [43]:
# Fit the final single-feature regressor and show its learned line.
# NOTE(review): C is hard-coded to 0.01 here rather than read from the
# preceding grid search's best_params_ — confirm this is intentional.
regr = LinearSVR(C=0.01).fit(data_less, target)
for fitted_attr in ('coef_', 'intercept_'):
    print(getattr(regr, fitted_attr))

[ 0.02312557]
[ 0.05791566]


In [44]:
# Predict a contamination value for every row of data_less (the same rows the
# regressor was fitted on, so this is an in-sample prediction).
predicted_ep = regr.predict(data_less)

In [46]:
def _f1_at_contamination(X_norm, Y, contamination):
    """Fit an IsolationForest on X_norm and score its outlier flags against Y.

    Parameters
    ----------
    X_norm : feature matrix used for both fit() and predict().
    Y : reference labels; assumed to mark outliers with 1 and the rest with 0
        (TODO confirm against outliers.getPrescriptions).
    contamination : value forwarded to IsolationForest(contamination=...).

    Returns
    -------
    The F1 score between Y and the predictions recoded to 1 (outlier) / 0 (inlier).
    """
    clf = IsolationForest(contamination=contamination)
    clf.fit(X_norm)
    # predict() labels outliers -1 and inliers 1; recode to 1 / 0.
    y_pred = (clf.predict(X_norm) == -1).astype(int)
    # Arguments in scikit-learn's (y_true, y_pred) order; binary F1 is
    # unchanged by the order.
    return f1_score(Y, y_pred)


# For every medication kept in regression_data, compare the F1 score obtained
# with the best contamination found by the search (target) against the F1
# score obtained with the contamination predicted by the regressor.
idx = 0  # position of the current medication in target / predicted_ep
best_scores = []
predicted_scores = []
for med in np.asarray(medications):

    X, Y = outliers.getPrescriptions(med)

    # Same size filter as the cell that built regression_data, so idx stays
    # aligned with its rows. Checked before normalising so that discarded
    # samples are not processed.
    if len(X) < 100:
        continue

    X_norm = normalize(X, norm='l2')

    best_f = _f1_at_contamination(X_norm, Y, target[idx])
    predicted_f = _f1_at_contamination(X_norm, Y, predicted_ep[idx])
    best_scores.append(best_f)
    predicted_scores.append(predicted_f)

    print(med + ': Best:',
          target[idx],
          best_f,
          ', Predicted: ',
          predicted_ep[idx],
          predicted_f)

    idx += 1

ABACAVIR 300 mg CP: Best: 0.01 0.965517241379 , Predicted:  0.0695156306417 0.0986842105263
ACETAZOLAMIDA 250 mg CP: Best: 0.13 0.342857142857 , Predicted:  0.0698660850603 0.0
ACICLOVIR 200 mg CP: Best: 0.01 0.666666666667 , Predicted:  0.0696437443944 0.145161290323
ACIDO FOLICO 5 mg CP: Best: 0.31 0.00720164609053 , Predicted:  0.0715207016888 0.00720164609053
AMICACINA 500 mg/2 ml SOL INJ: Best: 0.19 0.536231884058 , Predicted:  0.0695083454615 0.166666666667
AMIODARONA 200 mg CP: Best: 0.01 0.833333333333 , Predicted:  0.0695760857401 0.135135135135
AMOXICILINA + CLAVULANATO 1 g + 200 mg SOL INJ: Best: 0.07 0.263959390863 , Predicted:  0.0696466821998 0.263959390863
AMOXICILINA + CLAVULANATO 500 mg + 125 mg CP: Best: 0.04 0.5 , Predicted:  0.0695338723292 0.320987654321
AMOXICILINA 500 mg CP: Best: 0.31 0.842105263158 , Predicted:  0.069817611634 0.0
AMPICILINA + SULBACTAM 2 g + 1 g SOL INJ: Best: 0.04 0.542168674699 , Predicted:  0.0739806597376 0.371900826446
ANLODIPINO 10 mg CP

PARACETAMOL 500 mg CP: Best: 0.01 0.0425531914894 , Predicted:  0.069559031659 0.00573445081606
PIPERACILINA + TAZOBACTAM 4 g + 500 mg SOL INJ: Best: 0.01 0.206349206349 , Predicted:  0.0739970907649 0.036775106082
PREDNISONA 20 mg CP: Best: 0.07 0.60510805501 , Predicted:  0.0699173311521 0.615984405458
PROMETAZINA 50 mg/2 ml SOL INJ: Best: 0.31 0.590717299578 , Predicted:  0.0714263537151 0.127906976744
RIFAMPICINA 300 mg CP: Best: 0.01 0.133333333333 , Predicted:  0.0695004823737 0.0454545454545
RISPERIDONA 2 mg CP: Best: 0.07 0.127906976744 , Predicted:  0.0734494773535 0.127906976744
SULFAMETOXAZOL + TRIMETOPRIMA 800 mg + 160 mg CP: Best: 0.1 0.246073298429 , Predicted:  0.0694921538074 0.246073298429
SULFATO DE MAGNESIO 10% 10 ml: Best: 0.04 0.727272727273 , Predicted:  0.0698704024042 0.372093023256
SULFATO DE MAGNESIO 50 % (4,2 mEq/ml) 10 ml: Best: 0.04 0.117647058824 , Predicted:  0.070651770482 0.0808080808081
TRAMADOL 50mg/ml 1ml SOL INJ: Best: 0.76 0.007732670533 , Predicte

In [47]:
# Average F1 across medications: first with the best contamination per
# medication, then with the contamination predicted by the regressor.
for f1_values in (best_scores, predicted_scores):
    print(np.mean(f1_values))

0.448535847451
0.191757079141
