In [3]:
#Libraries for math and data manipulation
import numpy as np
import pandas as pd
import copy
import math
import numpy.random as rand
import scipy as sp
import sys
from datetime import datetime

#Plotting stuff
import matplotlib.pyplot as plt
import seaborn
import corner
%matplotlib inline
from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
rc('text', usetex=True) #Want to be able to use tex in the plot labels
seaborn.set_style('ticks')
seaborn.set_color_codes()

#Machine learning stuff
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier   #This is a single decision tree
from sklearn.ensemble import ExtraTreesClassifier #Random forest of trees
from sklearn.ensemble import GradientBoostingClassifier
import graphviz                                   #This package lets you visualize the tree
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, precision_score
from sklearn.model_selection import train_test_split

### Notes:
* Are there situations in which it can actually be bad that clinicians are in general quite good at making a diagnosis? For instance, the 'CMalaria' variable is extremely predictive of malaria - do the patients that are *falsely diagnosed with malaira* tend to have this variable as a yes? And, if so, are they *not* diagnosed with malaria if this variable is excluded?
* Corollary: do the *false negative* malaria patients have a *negativ* CMalaria variable?

#### Step 1:
* Identify false-positive malaria patients

In [139]:
#Read in the raw dataframe - don't edit this data!
print 'reading data...'
data_raw = pd.read_csv("../Data/cns_data.csv")   #Using Pandas for data manipulation

#Delete useless columns - but keep patient registration number
data_df = data_raw.drop(['surnamenam','patinit','mothinit'],axis=1)

#Split up 'incdate' into month, day, and year. Monthday is meant to capture seasonality.
data_df['month'] = pd.DatetimeIndex(data_df['incdate']).month
data_df['day'] = pd.DatetimeIndex(data_df['incdate']).day
data_df['year'] = pd.DatetimeIndex(data_df['incdate']).year
data_df['monthday'] = data_df['month']*30+data_df['day']  #Assumes there are thirty days per month - accurate enough.

#Select input features. 
#These are from the list that Matt put together (readdata.R), and have been culled for measures that are 'easy to get,'
# i.e. don't involve a test, that don't have a lot of non-NaN entries, and that are not dependent on other measures.
clin_df = data_df.loc[:,['glasgtot','blanttot','clinjaund','clinhepato','clinspleno','clinconv','clindehyd',\
                          'clinoedem','clinlymph','clinresp','clinablung','clincyan','clincapref','clincoldext',\
                         'clinearinf','clinanemia','clinorcand','clinhemmor','clinaids',\
                         'abdpain','vom','diar','head','muscle','conv',\
                         'sex','ageyrs','agemth',\
                         'temp','card','resp','sbp','dbp','weight','height','muac',\
                          'mhhosp', 'mhdevsp', 'mhsickle', 'mhchronstat',\
                         'incdate',\
                         'dimain', \
                         'incirrit', 'inchead', 'incphoto',\
                         'incfont', 'incneuro' , 'incseiza', 'incseizh','incpurp',\
                         'ident',\
                          'diag']] #This is the final diagnosis - will be the 'label'

#Convert the date into a continuous variable. This just counts the days from the beginning of the dataset 
clin_df['incdate'] = pd.to_datetime(clin_df['incdate'])
clin_df['date'] = (clin_df['incdate'] - clin_df['incdate'].min())  / np.timedelta64(1,'D')

print('mapping data...')
#Mapping yes/no to 1/0
clin_df['clinjaund'] = clin_df.clinjaund.map({'Yes': 1, 'No': 0})
clin_df['clinhepato'] = clin_df.clinhepato.map({'Yes': 1, 'No': 0})
clin_df['clinspleno'] = clin_df.clinspleno.map({'Yes': 1, 'No': 0})
clin_df['clinconv'] = clin_df.clinconv.map({'Yes': 1, 'No': 0})
clin_df['clindehyd'] = clin_df.clindehyd.map({'Yes': 1, 'No': 0})
clin_df['clinoedem'] = clin_df.clinoedem.map({'Yes': 1, 'No': 0})
clin_df['clinlymph'] = clin_df.clinlymph.map({'Yes': 1, 'No': 0})
clin_df['clinresp'] = clin_df.clinresp.map({'Yes': 1, 'No': 0})
clin_df['clinablung'] = clin_df.clinablung.map({'Yes': 1, 'No': 0})
clin_df['clincyan'] = clin_df.clincyan.map({'Yes': 1, 'No': 0})
clin_df['clincapref'] = clin_df.clincapref.map({'Yes': 1, 'No': 0})
clin_df['clincoldext'] = clin_df.clincoldext.map({'Yes': 1, 'No': 0})
clin_df['clinorcand'] = clin_df.clinorcand.map({'Yes': 1, 'No': 0})
clin_df['clinhemmor'] = clin_df.clinhemmor.map({'Yes': 1, 'No': 0})
clin_df['clinaids'] = clin_df.clinaids.map({'Yes': 1, 'No': 0})
clin_df['vom'] = clin_df.vom.map({'Yes': 1, 'No': 0})
clin_df['diar'] = clin_df.diar.map({'Yes': 1, 'No': 0})
clin_df['headache'] = clin_df['head'] #Since 'head' is a function
clin_df['conv'] = clin_df.conv.map({'Yes': 1, 'No': 0})
clin_df['ageym'] = clin_df.ageyrs.astype(float)*12.
clin_df.agemth = clin_df.agemth.astype(float)
clin_df['age'] = clin_df.ageym + clin_df.agemth #total age in months

#Medical history variables
clin_df['mhhosp'] = clin_df.mhhosp.map({'Yes': 1, 'No':0})

#some slightly more compliated mappings
clin_df['abdpain'] = clin_df.abdpain.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':np.nan})
clin_df['headache'] = clin_df.headache.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})
clin_df['muscle'] = clin_df.muscle.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':np.nan})
clin_df['dimain'] = clin_df.dimain.map({"Cerebral Malaria":0, "Meningoencephalitis":1, "Meningitis":2.,\
                                       "Other":3}) #best guess of the clinician. Very predictive, typically.
clin_df['mhdevsp'] = clin_df.mhdevsp.map({'OTHER':1, 'GLOBAL DEVELOPMENTAL DELAY':1, 'HYDROCEPHALUS':1,
       'HEARING LOSSES':1, 'MOTOR DEVELOPEMENTAL DELAY':1,
       'SPEECH DEVELOPEMENTAL DELAY':1})

#Mapping of inclusion variables 6-8 
clin_df['incirrit'] = clin_df.incirrit.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})
clin_df['inchead'] = clin_df.inchead.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})
clin_df['incphoto'] = clin_df.incphoto.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})
clin_df['incfont'] = clin_df.incfont.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})
clin_df['incneuro'] = clin_df.incneuro.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})
clin_df['incseiza'] = clin_df.incseiza.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})
clin_df['incseizh'] = clin_df.incseizh.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})
clin_df['incpurp'] = clin_df.incpurp.map({"Don't know":0, 'No':0, 'Yes':1, 'Not applicable':0})

#A lot of these have very few actual (non-NaN) answers. i.e., 'headache.' Mapping 'NaN' to 'No'. 
values = {'clinjaund':0,'clinhepato':0,'clinaids':0,'clinhemmor':0,'clinorcand':0,\
          'clinresp':0,'clinlymph':0,'clindehyd':0,'clinoedem':0,'clinablung':0,'clincyan':0,'clincapref':0,
          'clincoldext':0, 'mhchronstat':0, 'mhdevsp':0,\
         'conv':0,'abdpain':0,'vom':0,'diar':0,'headache':0,'muscle':0}
clin_df = clin_df.fillna(value=values)

#You really shouldn't use the initial diagnosis as a continuous variable, since it's really categorical.
#Map it into a set of yes/no questions:
clin_df['CMalaria'] = clin_df.dimain.map({0:1,1:0,2:0,3:0})
clin_df['Enceph'] = clin_df.dimain.map({0:0,1:1,2:0,3:0})
clin_df['Mening'] = clin_df.dimain.map({0:0,1:0,2:1,3:0})
clin_df['Other'] = clin_df.dimain.map({0:0,1:0,2:0,3:1})

#That was all features. Now, translate the labels into something usable.
#This coding is going to be important. You'll want to figure out which categories are useful, and which
#are practical.
clin_df['diag'] = clin_df.diag.map({'malaria':0, 'cereb malaria':0,'virus-malaria':0,\
                                    'virus-bacteria':1, 'bacteremia':1,'bact meningitis':1,\
                                    'virus-other':2,'virus':2,'malaria-bacterial':3,\
                                    'tb':3,'crypto':3, '0.0':4})

print('Set of clinical diagnoses:')
print(clin_df.dimain.unique())

reading data...
mapping data...
Set of clinical diagnoses:
[ 0.  1.  2.  3.]


In [234]:
#Pick the features to classify with.
Xlabels = ['CMalaria','Enceph','Mening','Other',\
                      'headache','age','temp',\
                      'date',\
                      'clinjaund','clinhepato','clinaids','clinhemmor','clinorcand',\
                      'clinresp','clinlymph','clindehyd','clinoedem','clinablung','clincyan','clincapref',\
                      'glasgtot', 'dbp', 'sbp','resp','card','weight','height','muac',\
                      'mhhosp','mhchronstat', 'mhdevsp',\
                      'conv','abdpain','vom','diar','muscle',\
                      'incirrit', 'inchead', 'incphoto',\
                      'incfont', 'incneuro' , 'incseiza', 'incseizh',\
                      'incpurp',\
                      'ident']  #Patient id number
FeatureList = copy.copy(Xlabels)
FeatureList = FeatureList + ['diag']
X_pd = clin_df.loc[:,FeatureList]
X_pd = X_pd.dropna()  #Drop cases that have nan for *any* entry

#Get things into the correct format for scikit-learn. I've tried to use the convention throughout that variables
#ending in 'pd' are pandas dataframes.
Input = X_pd.values              #creates a numpy array
Features = Input[:,0:-1].copy()  #separate the features and labels
Labels = Input[:,-1].copy()
ncase = len(Input[:,0])
print(np.shape(Features))        #checking that there are a reasonable number of cases left

X = Features.copy()              #changing to X and y. For no real reason
y = Labels.copy()

#You need to use the same test and train sets here, so can't use built in skl function.
ntrain = 200
X_train = X[:ntrain,:]
y_train = y[:ntrain]

X_test = X[ntrain:,:]
y_test = y[ntrain:]

(361, 45)


In [235]:
#Classify
params = {'n_estimators': 1000, 'max_leaf_nodes': 10, 'max_depth': 3, 'random_state': 4}

clf = GradientBoostingClassifier(n_estimators=1000, random_state=1, learning_rate=0.8, max_depth=3)
#clf = GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

#Make the predictions on the test data
y_pred = clf.predict(X_test)

#Print the confusion matrix:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[46  1  0  0 14]
 [ 3  3  0  0 17]
 [ 0  2  0  0  3]
 [ 2  1  0  0  2]
 [ 8 13  0  0 46]]
             precision    recall  f1-score   support

        0.0       0.78      0.75      0.77        61
        1.0       0.15      0.13      0.14        23
        2.0       0.00      0.00      0.00         5
        3.0       0.00      0.00      0.00         5
        4.0       0.56      0.69      0.62        67

avg / total       0.55      0.59      0.57       161



In [236]:
#Compared to random forest:
forest = ExtraTreesClassifier(**params)
forest.fit(X_train[:,:-1], y_train)
y_pred = forest.predict(X_test[:,:-1])

#Print the confusion matrix:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[48  0  0  0 13]
 [ 0  0  0  0 23]
 [ 1  0  0  0  4]
 [ 1  0  0  0  4]
 [ 8  0  0  0 59]]
             precision    recall  f1-score   support

        0.0       0.83      0.79      0.81        61
        1.0       0.00      0.00      0.00        23
        2.0       0.00      0.00      0.00         5
        3.0       0.00      0.00      0.00         5
        4.0       0.57      0.88      0.69        67

avg / total       0.55      0.66      0.59       161



At what number of training patients does the gradient boosted classifier do better?

params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': 4, 'random_state': 1,
                   'min_samples_split': 2}
                   
clin_df['diag'] = clin_df.diag.map({'malaria':0, 'cereb malaria':0,'virus-malaria':0,\
                                    'virus-bacteria':1, 'bacteremia':1,'bact meningitis':1,\
                                    'virus-other':1,'virus':1,'malaria-bacterial':0,\
                                    'tb':1,'crypto':1, '0.0':2})

|N |    prec_GB  | \,\,  prec_RF|
|---|---|---|
|50  |  0.48   |     0.57|
|60  |  0.61   |     0.55|
|70  |  0.51   |     0.55|
|80  |  0.48   |     0.55|
|90  |  0.45   |     0.53|
|100 |  0.54   |     0.56|
|125 |  0.61   |     0.56|
|150 |  0.58   |     0.69|
|175 |  0.62   |     0.6|
|200 |  0.61   |     0.57|
|225 |  0.62   |     0.59|
|250 |  0.61   |     0.61|
|275 |  0.31   |     0.64|
|300 |  0.75   |     0.69|

Well. That is not impressive. Does GB classification work better when there are more categories?
clin_df['diag'] = clin_df.diag.map({'malaria':0, 'cereb malaria':0,'virus-malaria':0,\
                                    'virus-bacteria':1, 'bacteremia':1,'bact meningitis':1,\
                                    'virus-other':2,'virus':2,'malaria-bacterial':3,\
                                    'tb':3,'crypto':3, '0.0':4})
                                    
|N  |   prec_GB  |   prec_RF|
|---|---|---|
|50  |  0.48    |    0.55|
|75   | 0.48   |     0.54|
|100  | 0.48  |      0.5|
|125  | 0.53    |    0.52|
|150  | 0.55    |    0.56|
|175  | 0.6     |    0.58|
|200  | 0.54    |    0.55|
|225  | 0.57    |    0.56|
|250  | 0.57    |    0.57|
|275  | 0.24    |    0.62 |   
|300  | 0.84    |    0.69|

Same as above, but with no max_depth set, and max_leaf_nodes = 10

(I think it's possible that we're running into issues of 'classifications from far in the past are worse. **Idea: to assess the improvement of the classifier with time, use, e.g., 50 patients to classify, but move the window closer/further away in time.**)

| N | pGB | pRF |
|---|---|---|
| 50 | 0.48 | 0.55 |
| 75 | 0.51 | 0.57 |
| 100 | 0.47 | 0.54 |
| 125 | 0.58 | 0.54 |
| 150 | 0.60 | 0.63 |
| 175 | 0.60 | 0.61 |
| 200 | 0.57 | 0.55 |
| 225 | 0.56 | 0.56 |
| 250 | 0.62 | 0.57 |
| 275 | 0.58 | 0.62 |
| 300 | 0.79 | 0.70|