In [1]:
#****************************************
# This Jupyter notebook was created for the project, Autism Detection,
# from the Udemy course, Applied Machine Learning in Healthcare, found at:
#     https://www.udemy.com/course/applied-machine-learning-for-healthcare
#
# The dataset used in this project is a data set located in the UCI repository at the following URL:  
# https://archive.ics.uci.edu/ml/datasets/Autistic+Spectrum+Disorder+Screening+Data+for+Children++
# 
#
#In this project we complete the following activities 
#  1. Load python libraries
#  2. Import the data from the file into a dataframe
#  3. Review the data
#  4. Preprocess the data
#       eliminate unnecessary columns
#
#  5. Create X and Y datasets for training
#  6. Create categorical variables (one hot encoded vectors) for both X and Y datasets
#  7. Split the dataset into training and test datasets
#  8. Build a neural network using keras
#    _________________________________________________________________
#    Layer (type)                 Output Shape              Param #   
#    =================================================================
#    dense_1 (Dense)              (None, 8)                 776       
#    _________________________________________________________________
#    dense_2 (Dense)              (None, 4)                 36        
#    _________________________________________________________________
#    dense_3 (Dense)              (None, 2)                 10        
#    =================================================================
#    Total params: 822
#    Trainable params: 822
#    Non-trainable params: 0
#    _________________________________________________________________
#    None

#
#  9. Fit the model to the training data
#  10. Test the model with the Test data set
#    Prediction Results for Neural Network
#    0.9322033898305084
#                  precision    recall  f1-score   support
#    
#               0       0.88      1.00      0.93        28
#               1       1.00      0.87      0.93        31
#    
#        accuracy                           0.93        59
#       macro avg       0.94      0.94      0.93        59
#    weighted avg       0.94      0.93      0.93        59
#
# 11. Add a Dropout layer to the model
#    _________________________________________________________________
#    Layer (type)                 Output Shape              Param #   
#    =================================================================
#    dense_4 (Dense)              (None, 8)                 776       
#    _________________________________________________________________
#    dense_5 (Dense)              (None, 4)                 36        
#    _________________________________________________________________
#    dropout_1 (Dropout)          (None, 4)                 0         
#    _________________________________________________________________
#    dense_6 (Dense)              (None, 2)                 10        
#    =================================================================
#    Total params: 822
#    Trainable params: 822
#    Non-trainable params: 0
#    _________________________________________________________________
#    None
#
#  12. Retrain and retest see if that improved results (it didn't)
#  13. Increase the number of epochs for training, then retrain and retest to see if that improved results (it didn't)
#
#
# Key learning - how to work with categorical data as many healthcare applications will have 
#                categorical data like this project.
#*****************************************************************

In [2]:
# import libraries and check versions
import sys  # Python
import pandas as pd # Use Pandas for data handling and importing
import sklearn # machine learning tools
import keras # deep learning API to build the neural network

# Report the interpreter and library versions in one print call.
# The .format function substitutes each (name, version) pair into the string.
print('\n'.join(
    '{}: {}'.format(name, version)
    for name, version in (
        ('Python', sys.version),
        ('Pandas', pd.__version__),
        ('Sklearn', sklearn.__version__),
        ('Keras', keras.__version__),
    )
))

Using Theano backend.


Python: 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
Pandas: 0.25.1
Sklearn: 0.21.3
Keras: 2.2.4


In [3]:
# Location of the autism screening dataset: the file was downloaded from the
# UCI repository, then manually cleaned up and saved as .txt.
file = 'Autism-Child-Data.txt'

# Parse the comma-separated file into a DataFrame. index_col=None means no
# column of the file is used as the row index.
data = pd.read_csv(file, sep=',', index_col=None)

In [4]:
# Show the DataFrame dimensions, followed by the record with row label 0
# (the first patient) as an example.
print('Shape of DataFrame: {}'.format(data.shape), data.loc[0], sep='\n')

Shape of DataFrame: (292, 21)
A1_Score                               1
A2_Score                               1
A3_Score                               0
A4_Score                               0
A5_Score                               1
A6_Score                               1
A7_Score                               0
A8_Score                               1
A9_Score                               0
A10_Score                              0
age                                    6
gender                                 m
ethnicity                         Others
jundice                               no
family_history_of_austim              no
contry_of_res                     Jordan
used_app_before                       no
result                                 5
age_desc                    '4-11 years'
relation                          Parent
Class/ASD                             NO
Name: 0, dtype: object


In [5]:
# print out multiple patients
data.loc[:10] # everything up to and including the 10th one - i.e. 1st 11 patients

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,family_history_of_austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5,'4-11 years',Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,'Middle Eastern ',no,no,Jordan,no,5,'4-11 years',Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,?,no,no,Jordan,yes,5,'4-11 years',?,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,?,yes,no,Jordan,no,4,'4-11 years',?,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,'United States',no,10,'4-11 years',Parent,YES
5,0,0,1,0,1,1,0,1,0,1,...,m,?,no,yes,Egypt,no,5,'4-11 years',?,NO
6,1,0,1,1,1,1,0,1,0,1,...,m,White-European,no,no,'United Kingdom',no,7,'4-11 years',Parent,YES
7,1,1,1,1,1,1,1,1,0,0,...,f,'Middle Eastern ',no,no,Bahrain,no,8,'4-11 years',Parent,YES
8,1,1,1,1,1,1,1,0,0,0,...,f,'Middle Eastern ',no,no,Bahrain,no,7,'4-11 years',Parent,YES
9,0,0,1,1,1,0,1,1,0,0,...,f,?,no,yes,Austria,no,5,'4-11 years',?,NO


In [6]:
# print data characteristics using pandas.describe() function
data.describe() # this shows only the column types that are numeric

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,result
count,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,0.633562,0.534247,0.743151,0.55137,0.743151,0.712329,0.606164,0.496575,0.493151,0.726027,6.239726
std,0.482658,0.499682,0.437646,0.498208,0.437646,0.453454,0.489438,0.500847,0.500811,0.446761,2.284882
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,6.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0


In [7]:
# print data types
data.dtypes

A1_Score                     int64
A2_Score                     int64
A3_Score                     int64
A4_Score                     int64
A5_Score                     int64
A6_Score                     int64
A7_Score                     int64
A8_Score                     int64
A9_Score                     int64
A10_Score                    int64
age                         object
gender                      object
ethnicity                   object
jundice                     object
family_history_of_austim    object
contry_of_res               object
used_app_before             object
result                       int64
age_desc                    object
relation                    object
Class/ASD                   object
dtype: object

In [8]:
# above - the object datatypes indicates they contain strings


In [9]:
# Drop unwanted columns:
#   - result:   the total of the A1-A10 question scores, so it only repeats
#               information already present in those columns
#   - age_desc: all values are the same in this column
# errors='ignore' keeps this cell idempotent: because `data` is reassigned,
# re-running the cell would otherwise raise KeyError once the columns are gone.
data = data.drop(columns=['result', 'age_desc'], errors='ignore')


In [10]:
data.loc[:10]

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,family_history_of_austim,contry_of_res,used_app_before,relation,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6,m,Others,no,no,Jordan,no,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,6,m,'Middle Eastern ',no,no,Jordan,no,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,6,m,?,no,no,Jordan,yes,?,NO
3,0,1,0,0,1,1,0,0,0,1,5,f,?,yes,no,Jordan,no,?,NO
4,1,1,1,1,1,1,1,1,1,1,5,m,Others,yes,no,'United States',no,Parent,YES
5,0,0,1,0,1,1,0,1,0,1,4,m,?,no,yes,Egypt,no,?,NO
6,1,0,1,1,1,1,0,1,0,1,5,m,White-European,no,no,'United Kingdom',no,Parent,YES
7,1,1,1,1,1,1,1,1,0,0,5,f,'Middle Eastern ',no,no,Bahrain,no,Parent,YES
8,1,1,1,1,1,1,1,0,0,0,11,f,'Middle Eastern ',no,no,Bahrain,no,Parent,YES
9,0,0,1,1,1,0,1,1,0,0,11,f,?,no,yes,Austria,no,?,NO


In [11]:
# Create the feature (x) and target (y) data sets.
# columns=[...] states explicitly that a column is being dropped. The original
# passed the axis positionally as `1`, which pandas 2.0 no longer accepts.
x = data.drop(columns=['Class/ASD'])  # every column except the class label
y = data['Class/ASD']                 # the class label (NO / YES) for each patient

In [12]:
# Convert categorical variables into one hot encoded vectors: every string
# (object) column is replaced by one 0/1 indicator column per distinct value,
# while the integer A*_Score columns are kept as they are.
# NOTE(review): `age` is an object column (it contains '?'), so it is encoded
# as categories (age_4 ... age_11, age_?) rather than kept numeric. The result
# also contains both relation_Self and relation_self, i.e. inconsistent
# capitalisation in the raw data produces separate indicator columns.
X = pd.get_dummies(x)

In [13]:
X.loc[:10] # lets look at first few rows to see the result of the one hot encoding

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,contry_of_res_Syria,contry_of_res_Turkey,used_app_before_no,used_app_before_yes,relation_'Health care professional',relation_?,relation_Parent,relation_Relative,relation_Self,relation_self
0,1,1,0,0,1,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
1,1,1,0,0,1,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
2,1,1,0,0,0,1,1,1,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0,1,0,0,1,1,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
4,1,1,1,1,1,1,1,1,1,1,...,0,0,1,0,0,0,1,0,0,0
5,0,0,1,0,1,1,0,1,0,1,...,0,0,1,0,0,1,0,0,0,0
6,1,0,1,1,1,1,0,1,0,1,...,0,0,1,0,0,0,1,0,0,0
7,1,1,1,1,1,1,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
8,1,1,1,1,1,1,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
9,0,0,1,1,1,0,1,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [14]:
# let's look at just the column values in the dataframe
X.columns.values


array(['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
       'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score',
       'age_10', 'age_11', 'age_4', 'age_5', 'age_6', 'age_7', 'age_8',
       'age_9', 'age_?', 'gender_f', 'gender_m',
       "ethnicity_'Middle Eastern '", "ethnicity_'South Asian'",
       'ethnicity_?', 'ethnicity_Asian', 'ethnicity_Black',
       'ethnicity_Hispanic', 'ethnicity_Latino', 'ethnicity_Others',
       'ethnicity_Pasifika', 'ethnicity_Turkish',
       'ethnicity_White-European', 'jundice_no', 'jundice_yes',
       'family_history_of_austim_no', 'family_history_of_austim_yes',
       "contry_of_res_'Costa Rica'", "contry_of_res_'Isle of Man'",
       "contry_of_res_'New Zealand'", "contry_of_res_'Saudi Arabia'",
       "contry_of_res_'South Africa'", "contry_of_res_'South Korea'",
       "contry_of_res_'U.S. Outlying Islands'",
       "contry_of_res_'United Arab Emirates'",
       "contry_of_res_'United Kingdom'", "contry_of_res_'United

In [15]:
# print out an example for one patient
X.loc[0]

A1_Score             1
A2_Score             1
A3_Score             0
A4_Score             0
A5_Score             1
                    ..
relation_?           0
relation_Parent      1
relation_Relative    0
relation_Self        0
relation_self        0
Name: 0, Length: 96, dtype: int64

In [16]:
# Convert the class labels into a one hot encoded frame with two indicator
# columns, NO and YES (exactly one of them is 1 for each patient).
Y = pd.get_dummies(y)
Y.iloc[:10]  # preview the first 10 encoded labels

Unnamed: 0,NO,YES
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1
5,1,0
6,0,1
7,0,1
8,0,1
9,1,0


In [17]:
# split X and Y into training and test datasets

from sklearn import model_selection
# define seed for reproducibility of the split
seed = 1
# above - note from the course: the instructor chose not to use a random seed this time.
# NOTE(review): this seed is only passed to train_test_split below; no seed for
# the Keras model is set in the cells shown here, so training results can still vary.

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size= 0.2, random_state = seed)
# above  - randomly split the data: 80% for training and 20% for testing

In [18]:
# Print the shape of each split, one per line, in the order
# X_train, X_test, Y_train, Y_test.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)), sep='\n')

(233, 96)
(59, 96)
(233, 2)
(59, 2)


In [19]:
# above - 233 patients in the training set with 96 attributes. 18 attributes
#         really (21 columns minus result, age_desc and Class/ASD), but since we
#         expanded the categorical ones into one hot encoded vectors there are 96 columns.

In [26]:
# build a neural network using keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adam

# define a function to build the keras model
def create_model(input_dim=96, learning_rate=0.001, dropout_rate=0.25):
    """Build and compile the feed-forward classifier used in this notebook.

    Architecture: Dense(8, relu) -> Dense(4, relu) -> Dropout -> Dense(2, sigmoid),
    compiled with categorical cross-entropy loss, the Adam optimizer and the
    accuracy metric.

    Parameters
    ----------
    input_dim : int, default 96
        Number of input features, i.e. columns of the one hot encoded X.
    learning_rate : float, default 0.001
        Learning rate passed to the Adam optimizer.
    dropout_rate : float, default 0.25
        Fraction of the second hidden layer's outputs dropped during training.

    Returns
    -------
    keras.models.Sequential
        The compiled (untrained) model.
    """
    model = Sequential()
    model.add(Dense(8, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))
    # Two output neurons because the target is a one hot encoded vector of
    # length 2 (NO, YES). Sigmoid squashes each output into the range (0, 1).
    # NOTE(review): softmax is the conventional output activation to pair with
    # categorical_crossentropy; sigmoid is kept so the recorded results still apply.
    model.add(Dense(2, activation='sigmoid'))

    # Compile with the optimizer instance configured here. The original built
    # this Adam object but then passed the string 'adam' to compile(), so the
    # instance (and its learning rate) was never used.
    # `lr` is the keyword accepted by the Keras 2.2.4 release this notebook runs on.
    adam = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model


In [27]:
# create the model
model = create_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() produced a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 8)                 776       
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 36        
_________________________________________________________________
dropout_1 (Dropout)          (None, 4)                 0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 10        
Total params: 822
Trainable params: 822
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
# fit the model to our training data
model.fit(X_train, Y_train, epochs=75, batch_size=10, verbose=1)
# above - epochs=75 means we'll go through all of the training data 75 times
#       - batch size is the number of instances to look at before updating the parameters,
#         so in our case: look at 10 patients, calculate the gradient, update the parameters

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<keras.callbacks.History at 0x1ad5cbbc788>

In [33]:
# generate classification report using predictions for the  model
from sklearn.metrics import classification_report, accuracy_score

# Predicted class index per test patient (0 = NO, 1 = YES): the position of the
# larger of the two output neurons. predict(...).argmax(axis=-1) gives the same
# result as Sequential.predict_classes for this 2-output model, and keeps working
# on Keras releases where predict_classes has been removed.
predictions = model.predict(X_test).argmax(axis=-1)



In [34]:
predictions

array([0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0], dtype=int64)

In [35]:
# `predictions` holds a single class index per patient, so it is scored
# against the YES column of the one hot encoded Y_test.
y_true = Y_test[['YES']]

print('Prediction Results for Neural Network')
print(accuracy_score(y_true, predictions))
print(classification_report(y_true, predictions))

Prediction Results for Neural Network
0.9152542372881356
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        28
           1       1.00      0.84      0.91        31

    accuracy                           0.92        59
   macro avg       0.92      0.92      0.92        59
weighted avg       0.93      0.92      0.92        59

