### IMPORTS

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import datetime
import numpy as np
import seaborn as sns
import re
import json
import pandas as pd
import math
import ast
import os
from collections import defaultdict
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration
from utilities import health_data

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier

from scipy import stats
from scipy import sparse
from sklearn.metrics import roc_auc_score, confusion_matrix

### PARAMS

In [2]:
params = {'fix_skew': True,
          'normalize': True,
          'fix_missing_in_testing': True,
          'numerical_features': True,
          'categorical_features': True,
          'diagnosis_features': True,
          'intervention_features':True,
          'use_idf':True,
          }

### Training and Testing sets

In [3]:
training ,testing = health_data.Admission.get_training_testing_data()
    
if params['fix_missing_in_testing']:
    for admission in testing:
        admission.fix_missings(training)

print(f'Training size: {len(training):,}')
print(f'Testing size:  {len(testing):,}')

Training instances before filtering: 419988
Training instances after filtering:  419988
Testomg instances before filtering:  104998
Testomg instances after filtering:   104998
Training size: 419,988
Testing size:  104,998


### Training and testing Matrices

In [4]:
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
# Training 
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
features = []
if params['numerical_features']:
    numerical_df = health_data.Admission.numerical_features(training, 
                                                            fix_skew=params['fix_skew'], 
                                                            normalize=params['normalize'])
    features.append(sparse.csr_matrix(numerical_df.values))

if params['categorical_features']:
    categorical_df,main_pt_services_list = health_data.Admission.categorical_features(training)
    features.append(sparse.csr_matrix(categorical_df.values))

if params['diagnosis_features']:
    vocab_diagnosis, diagnosis_matrix = health_data.Admission.diagnosis_codes_features(training, 
                                                                                       use_idf=params['use_idf'])
    features.append(diagnosis_matrix)

if params['intervention_features']:
    vocab_interventions, intervention_matrix = health_data.Admission.intervention_codes_features(training, 
                                                                                                 use_idf=params['use_idf'])
    features.append(intervention_matrix)

X_train = sparse.hstack(features)
y_train = health_data.Admission.get_y(training)

# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
# Testing
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
features = []
if params['numerical_features']:
    numerical_df = health_data.Admission.numerical_features(testing, 
                                                            fix_skew=params['fix_skew'], 
                                                            normalize=params['normalize'])
    features.append(sparse.csr_matrix(numerical_df.values))

if params['categorical_features']:
    categorical_df, _ = health_data.Admission.categorical_features(testing, main_pt_services_list=main_pt_services_list)
    features.append(sparse.csr_matrix(categorical_df.values))

if params['diagnosis_features']:
    vocab_diagnosis, diagnosis_matrix = health_data.Admission.diagnosis_codes_features(testing, 
                                                                                       vocabulary=vocab_diagnosis, 
                                                                                       use_idf=params['use_idf'])
    features.append(diagnosis_matrix)

if params['intervention_features']:
    vocab_interventions, intervention_matrix = health_data.Admission.intervention_codes_features(testing, 
                                                                                                 vocabulary=vocab_interventions, 
                                                                                                 use_idf=params['use_idf']
                                                                                                 )
    features.append(intervention_matrix)

# numerical_df = health_data.Admission.numerical_features(testing, fix_skew=params['fix_skew'], normalize=params['normalize'])
# categorical_df = health_data.Admission.categorical_features(testing)
# vocab_diagnosis, diagnosis_matrix = health_data.Admission.diagnosis_codes_features(testing, 
#                                                                                    vocabulary=vocab_diagnosis)
# vocab_interventions, intervention_matrix = health_data.Admission.intervention_codes_features(testing, 
#                                                                                              vocabulary=vocab_interventions)

X_test = sparse.hstack(features)
y_test = health_data.Admission.get_y(testing)


print(f'X_train.shape = ({X_train.shape[0]:,} x {X_train.shape[1]:,})')
print(f'y_train.shape = ({y_train.shape[0]:,} x )')
print()
print(f'X_test.shape =  ({X_test.shape[0]:,} x {X_test.shape[1]:,})')
print(f'y_test.shape =  ({y_test.shape[0]:,} x )')

X_train.shape = (419,345 x 17,083)
y_train.shape = (419,345 x )

X_test.shape =  (104,884 x 17,083)
y_test.shape =  (104,884 x )


In [5]:
clf = LogisticRegression(class_weight='balanced', max_iter=7000,).fit(X_train, y_train,)

y_true = y_train
y_pred = clf.predict(X_train)
y_score= clf.predict_proba(X_train)

model_name = str(clf)
columns = ['Model','params','split','TN','FP','FN','TP','Precision','Recall','F1-Score','AUC']
str_ = ';'.join(columns)
logging.debug(str_)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
str_ = f'{model_name};{str(params)};train;{tn};{fp};{fn};{tp};{precision_score(y_true, y_pred,)};{recall_score(y_true, y_pred,)};'\
    f'{f1_score(y_true, y_pred,)};{roc_auc_score(y_true=y_true, y_score=y_pred)}\n'
logging.debug(str_)


vec1 = [model_name,
        str(params),
        'TRAIN',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]

y_true = y_test
y_pred = clf.predict(X_test)
y_score= clf.predict_proba(X_test)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
str_ = f'{model_name};{str(params)};test;{tn};{fp};{fn};{tp};{precision_score(y_true, y_pred,)};{recall_score(y_true, y_pred,)};'\
    f'{f1_score(y_true, y_pred,):};{roc_auc_score(y_true=y_true, y_score=y_pred)}\n'

vec2 = [model_name,
        str(params),
        'TEST',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]


logging.debug(str_)

# print('** Results on Training **')
# print('     ~Real values~')
# print(f'         0       1')
# print(f'0: {tn:7,} {fn:7,}')
# print(f'1: {fp:7,} {tp:7,}')
# print()

# print(f'Precision= {precision_score(y_true, y_pred,):7.4f}')
# print(f'Recall=    {recall_score(y_true, y_pred,):7.4f}')
# print(f'F1-score=  {f1_score(y_true, y_pred,):7.4f}')
# print(f'AUC=       {roc_auc_score(y_true=y_true, y_score=y_pred):7.4f}')
# print()


# print('** Results on Testing **')
# print('     ~Real values~')
# print(f'         0       1')
# print(f'0: {tn:7,} {fn:7,}')
# print(f'1: {fp:7,} {tp:7,}')
# print()


# print(f'Precision= {precision_score(y_true, y_pred,):7.4f}')
# print(f'Recall=    {recall_score(y_true, y_pred,):7.4f}')
# print(f'F1-score=  {f1_score(y_true, y_pred,):7.4f}')
# print(f'AUC=       {roc_auc_score(y_true=y_true, y_score=y_pred):7.4f}')



Model;params;split;TN;FP;FN;TP;Precision;Recall;F1-Score;AUC
LogisticRegression(class_weight='balanced', max_iter=7000);{'fix_skew': True, 'normalize': True, 'fix_missing_in_testing': True, 'numerical_features': True, 'categorical_features': True, 'diagnosis_features': True, 'intervention_features': True, 'use_idf': True};train;269812;132121;4050;13362;0.09184578266876542;0.7674017918676774;0.16405660087786608;0.7193439011113185
LogisticRegression(class_weight='balanced', max_iter=7000);{'fix_skew': True, 'normalize': True, 'fix_missing_in_testing': True, 'numerical_features': True, 'categorical_features': True, 'diagnosis_features': True, 'intervention_features': True, 'use_idf': True};test;66793;33558;1744;2789;0.07673260516686384;0.6152658283697331;0.1364481409001957;0.6404297971257441



In [5]:
# training ,testing = health_data.Admission.get_training_testing_data()
# if params['fix_missing_in_testing']:
#     for admission in testing:
#         admission.fix_missings(training)

missing_count=0
for admission in testing:
    if admission.case_weight is None or np.isnan(admission.case_weight):
        missing_count+=1
print(missing_count)
print(f'{missing_count/len(training):.4%}')

23
0.0055%


In [5]:
len(set([admission.main_pt_service for admission in training]))

49

In [4]:
categorical_df,_ = health_data.Admission.categorical_features(training)
categorical_df

Unnamed: 0,male,female,transfusion given,is alc,is central zone,elective admission,new born admission,urgent admission,level 1 comorbidity,level 2 comorbidity,...,General Medicine,Paediatric Medicine,Neurology,OBS Aborted,Cardiovascular Surgery,Haematology,Otolaryngology and ORL,Allergy,Genetics,OBS Delivered
0,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419134,1,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
419135,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
419136,1,0,0,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
419137,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
categorical_df.iloc[instance_no,:][categorical_df.iloc[instance_no,:]!=0]

male                   1
is central zone        1
urgent admission       1
level 1 comorbidity    1
level 2 comorbidity    1
Family Practice        1
Name: 8, dtype: int64

In [5]:
vocab, df = health_data.Admission.intervention_codes_features(training)

In [6]:
np.sum(df>1)

0

In [24]:
freq = defaultdict(int)
for admission in training+testing:
    freq[admission.postal_code]+=1

zip_codes = [(key,value) for key,value in freq.items()]
sorted_zip_codes = sorted(zip_codes, key=lambda key_value: key_value[1])
print(sorted_zip_codes[:2])
print(sorted_zip_codes[-2:])


[('B2N7H9', 1), ('B0E3P0', 1)]
[('B0K1H0', 5104), ('B0P1R0', 5232)]


In [49]:
field = 'diagnosis'
missing = len([_ for admission in training+testing if getattr(admission,field).codes is None])
print(f'{missing/len(training+testing):.3%}')

0.000%
