In [250]:
#Libraries for math and data manipulation
import numpy as np
import pandas as pd
import math
import numpy.random as rand
import scipy as sp
import sys
from datetime import datetime

#Plotting stuff
import matplotlib.pyplot as plt
import seaborn
import corner
%matplotlib inline
from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
rc('text', usetex=True) #Want to be able to use tex in the plot labels
seaborn.set_style('ticks')
seaborn.set_color_codes()

#Machine learning stuff
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz 
from sklearn.metrics import classification_report, confusion_matrix  

In [2]:
#Read in Stata datafile
#data_stata = pd.io.stata.read_stata('../ProjectDef/cnsfinall2.dta')
#Convert to .csv. Now you never need to do this again
#data_stata.to_csv('../Data/cns_data.csv')

In [3]:
data_raw = pd.read_csv('../Data/cns_data.csv')

In [4]:
#Delete useless columns (like surename, initials, etc.)
data_df = data_raw.drop(['surnamenam','patinit','patientreg','mothinit'],axis=1)


In [5]:
#Clinical measures
clin_df = data_raw.loc[:,['glasgtot','blanttot','clinjaund','clinhepato','clinspleno','clinconv','clindehyd',\
                          'clinoedem','clinlymph','clinresp','clinablung','clincyan','clincapref','clincoldext',\
                         'clinearinf','clinanemia','clintonsil','clinorcand','clinhemmor','clinaids',\
                         'abdpain','vom','diar','head','muscle','conv',\
                         'sex','ageyrs','agemth','incdate',\
                         'temp','card','resp','sbp','dbp','weight','height','muac']]

In [6]:


#Mapping yes/no to 1/0
clin_df['clinjaund'] = clin_df.clinjaund.map({'Yes': 1, 'No': 0})
clin_df['clinhepato'] = clin_df.clinhepato.map({'Yes': 1, 'No': 0})
clin_df['clinspleno'] = clin_df.clinspleno.map({'Yes': 1, 'No': 0})
clin_df['clinconv'] = clin_df.clinconv.map({'Yes': 1, 'No': 0})
clin_df['clindehyd'] = clin_df.clindehyd.map({'Yes': 1, 'No': 0})
clin_df['clinoedem'] = clin_df.clinoedem.map({'Yes': 1, 'No': 0})
clin_df['clinlymph'] = clin_df.clinlymph.map({'Yes': 1, 'No': 0})
clin_df['clinresp'] = clin_df.clinresp.map({'Yes': 1, 'No': 0})
clin_df['clinablung'] = clin_df.clinablung.map({'Yes': 1, 'No': 0})
clin_df['clincyan'] = clin_df.clincyan.map({'Yes': 1, 'No': 0})
clin_df['clincapref'] = clin_df.clincapref.map({'Yes': 1, 'No': 0})
clin_df['clincoldext'] = clin_df.clincoldext.map({'Yes': 1, 'No': 0})
clin_df['clinearinf'] = clin_df.clinearinf.map({'Yes': 1, 'No': 0})
clin_df['clintonsil'] = clin_df.clintonsil.map({'Yes': 1, 'No': 0})
clin_df['clinorcand'] = clin_df.clinorcand.map({'Yes': 1, 'No': 0})
clin_df['clinhemmor'] = clin_df.clinhemmor.map({'Yes': 1, 'No': 0})
clin_df['clinaids'] = clin_df.clinaids.map({'Yes': 1, 'No': 0})
clin_df['vom'] = clin_df.vom.map({'Yes': 1, 'No': 0})
clin_df['diar'] = clin_df.diar.map({'Yes': 1, 'No': 0})
clin_df['headache'] = clin_df['head'] #Since 'head' is a function
clin_df['conv'] = clin_df.conv.map({'Yes': 1, 'No': 0})
clin_df['ageym'] = clin_df.ageyrs.astype(float)*12.
clin_df.agemth = clin_df.agemth.astype(float)
clin_df['age'] = clin_df.ageym + clin_df.agemth

#some slightly more compliated mappings
clin_df['abdpain'] = clin_df.abdpain.map({"Don't know":np.nan, 'No':0, 'Yes':1, 'Not applicable':np.nan})
clin_df['headache'] = clin_df.headache.map({"Don't know":np.nan, 'No':0, 'Yes':1, 'Not applicable':np.nan})
clin_df['muscle'] = clin_df.muscle.map({"Don't know":np.nan, 'No':0, 'Yes':1, 'Not applicable':np.nan})

In [7]:
#Convert the date into a useful value. This just counts the days from the beginning of the dataset, which is probably
#not the best way of doing it. I suspect the month and day are relevent, but probably not the year. If seasonality
#is strong here, anyway.
clin_df['incdate'] = pd.to_datetime(clin_df['incdate'])
clin_df['date'] = (clin_df['incdate'] - clin_df['incdate'].min())  / np.timedelta64(1,'D')

In [448]:
#Try a decision tree classification with some of these. Will probably be pretty bad.
#You'll have to figure out how to remove the entries with NaNs
X_pd = clin_df.loc[:,['vom','diar','clinjaund','conv','clinaids','clinhepato','clindehyd','temp']]
X_pd = X_pd.dropna()

In [449]:
Input = X_pd.as_matrix()
ncase = len(Input[:,0])
print ncase

477


In [450]:
#First, you want meningitis and cerebral malaria. Question: are all of the malaria diagnoses cerebral? I think yes,
#since these kids all have CNS symptoms.
#The unique lab diagonses are:
#array(['malaria', '0.0', 'virus-bacteria', 'bacteremia', 'bact meningitis',
       #'virus-other', 'virus-malaria', 'malaria-bacterial', 'tb', 'virus',
       #'crypto'], dtype=object)
#I'll map malaria to 0, meningitis to 1, and other to 2
diag_df = data_df.labdiag.map({'malaria':0, 'malaria-bacterial':0, 'virus-malaria':0, 'bact meningitis':2, \
                              '0.0':2,'virus-bacteria':2,'bacteremia':2,'virus-other':2,'tb':2,'virus':2,
                              'crypto':2})

In [451]:
Output = diag_df.as_matrix()

In [452]:
#Select a subset of these kids to train on. 
index = np.linspace(0,ncase-1,ncase).astype(int)
to_train = np.random.choice(index,ncase/2,replace=False)
to_test = np.isin(index,to_train)
to_test = index[~to_test]

X = Input[to_train,:]
y = Output[to_train]

In [453]:
#Well, actually. Scikitlearn will do this for you.
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)  

In [457]:
n_classes = 2
clf = DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train)
y_pred = clf.predict(X_test)  

In [458]:
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data)
graph.render("iris")

'iris.pdf'

In [459]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[24 26]
 [27 42]]
             precision    recall  f1-score   support

          0       0.47      0.48      0.48        50
          2       0.62      0.61      0.61        69

avg / total       0.56      0.55      0.56       119

