Notebook for our final project!

Team:
Nolan Jimmo
Nicole Donahue
Frederick Carlson
Xinyu Liu

In [72]:
#Imports, function def and some file reading

import numpy as np
import pandas as pd
import glob
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC, LinearSVC
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential 
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers

def conf_matrix_to_df(conf_matrix, target_names):
    return pd.DataFrame(conf_matrix, columns=target_names, index=target_names)


#reading in EDSS Score data
EDSS_FILENAME = "data/EDSS_Scores.csv"
EDSS_scores = pd.read_csv(EDSS_FILENAME)

Find the subject ids that have valid EDSS scores to be able to just train model on these subjects data. Storing the valid subject id and scores in a dictionary with the structure: {Subject ID: (baseline score, 6mo score)}

In [3]:
valid_sids = {}
for i, row in EDSS_scores.iterrows():
    if type(row["Subject ID "]) == float:
        break
    if row["EDSS Baseline (Score out of 10) "] != np.NaN and row["EDSS 6mo (Score out of 10) "] != np.NaN:
        valid_sids[(row["Subject ID "])] = (str(row["EDSS Baseline (Score out of 10) "]), str(row["EDSS 6mo (Score out of 10) "]))
#print(valid_sids)

converting regular EDSS scores to the binary 0, or 1, for low vs. moderate/severe EDSS score. Everything up to 4 will be 0, everything 4 and above will be moderate/severe score

In [4]:
valid_sids_generalized = {}
for key, value in valid_sids.items():
    if float(value[0]) < 4:
        v1 = 0
    else:
        v1 = 1
    if float(value[1]) < 4:
        v2 = 0
    else:
        v2 = 0
    valid_sids_generalized[key] = (v1, v2)

Get filenames for the valid subject data files out of the data folder, for both the baseline and 6mo data

NOTES: This is all pretty much just data preprocessing, getting the filenames that correspond to the subjects that we know we have EDSS scores for, then going and getting all of the data for each of those valid subjects. For each row of data per subject I add column (feature) that is the target feature, which is just their EDSS score for this time period. I then store that data in a list (called calid_subject_data) in order to facilitate creating the dataframe that I will use in the training/testing of our model

In [5]:
# here, it is the baseline of the gait data
gait_baseline_filenames = glob.glob("data/Processed Data - MS +/Sway/MS1 Session 1/*")
#print((gait_baseline_filenames))
removal = []
for g in gait_baseline_filenames:
    if g[-9:-4] not in valid_sids.keys():
        removal.append(g)

gait_b_filenames = [l for l in gait_baseline_filenames if l not in removal]

###NOTE: In this test below, sometimes the two lists are not the same length
# HOWEVER, the valid EDSS subject ids list is always longer, so we will always have a
# "target" for each feature set, so we should be good to go
#print(len(gait_b_filenames), len(valid_sids.keys()))


# here, it is the 6mo of the gait data
gait_6mo_filenames = glob.glob("data/Processed Data - MS +/Sway/MS1 Session 2/*")
#print((gait_baseline_filenames))
removal = []
for g in gait_baseline_filenames:
    if g[-9:-4] not in valid_sids.keys():
        removal.append(g)

gait_6_filenames = [l for l in gait_baseline_filenames if l not in removal]

# Now, loop through the valid files, get the features from each valid subject and assign
MAX_ROWS_PER_SUBJECT = 20
# their EDSS score as the "target"
valid_subject_data = []
cols = []
for g in gait_b_filenames:
    with open(g, 'r') as file:
        reader = csv.reader(file)
        if cols == []:
            cols = next(file).strip().split(',')
            cols.append('target')
        count = 0
        for row in reader:
            if row[0] != 'timestamp_start':
                row.append(valid_sids[g[-9:-4]][0])
                valid_subject_data.append(row)
                # if valid_sids[g[-9:-4]][0] == 0 and count < MAX_ROWS_PER_SUBJECT:
                #     row.append(valid_sids[g[-9:-4]][0])
                #     valid_subject_data.append(row)
                #     count += 1
                # elif valid_sids[g[-9:-4]][0] == 1:
                #     row.append(valid_sids[g[-9:-4]][0])
                #     valid_subject_data.append(row)
                # else:
                #     break

# doing the exact some thing as before, just with the 6 month data
# We can just add this data straight to the valid_subject_data list because it is all going
# to be training data
# We do have to separate the for loops though because we have to add the proper EDSS value
# from the valid_sids dictionary
for g6 in gait_6mo_filenames:
    with open(g6, 'r') as file:
        reader = csv.reader(file)
        if cols == []:
            cols = next(file).strip().split(',')
            cols.append('target')
        count = 0
        for row in reader:
            if row[0] != 'timestamp_start':
                row.append(valid_sids[g[-9:-4]][1])
                valid_subject_data.append(row)
                # if valid_sids[g[-9:-4]][1] == 0 and count < MAX_ROWS_PER_SUBJECT:
                #     row.append(valid_sids[g[-9:-4]][1])
                #     valid_subject_data.append(row)
                #     count += 1
                # elif valid_sids[g[-9:-4]][1] == 1:
                #     row.append(valid_sids[g[-9:-4]][1])
                #     valid_subject_data.append(row)
                # else:
                #     break
#print(cols)
#print(valid_subject_data)
num_observations = len(valid_subject_data)

Here, I will build a neural network and we will test how well that predicts the outputs compared to the random forest

Final setup for the features dataframe and then training/testing the SVM model!

NOTES:
As you can see from the models that are commented out, I tried a number of different models, and it looks like the random forest classifier is going to be the one that works the best. Basically, here, I drop all of the non-important features colums, break the data in to testing and training partitions, train the model and then test it.

In [6]:
df = pd.DataFrame(valid_subject_data, columns=cols)
#print(df)
#get rid of the non-important or NaN valued "features"
df.drop(df.columns[[0,1,2,6,16]], axis=1, inplace=True)
#print(df)
df.fillna(0)

#Train the model and see what happens!
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, df.columns != 'target'], np.array(df.iloc[:, df.columns == 'target']).reshape(num_observations,), test_size = 0.2, random_state = 0)
#svm = SVC(kernel="poly")
#lin_model = linear_model.LogisticRegression()
rfc = RandomForestClassifier()
print('training')
rfc.fit(x_train, y_train)
print("predicting")
svm_y_predict = rfc.predict(x_test)

conf_matrix_svm = confusion_matrix(y_test, svm_y_predict)
print("\nPrinting confusion matrix")
ts = list(set(df['target']))
ts = sorted([float(t) for t in ts])
conf_matrix_to_df(conf_matrix_svm, ts)
#print(conf_matrix_svm)

training
predicting

Printing confusion matrix


Unnamed: 0,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0,6.0
1.0,9,104,0,0,0,0,0,5,0
1.5,5,1278,0,0,1,1,0,4,9
2.0,0,55,1,0,0,0,0,0,0
2.5,0,45,0,0,0,0,0,0,0
3.0,1,137,0,0,2,0,0,0,0
3.5,1,178,0,0,0,0,0,0,0
4.0,0,52,0,0,0,0,0,0,1
5.0,0,18,0,0,0,0,0,17,0
6.0,0,71,0,0,0,0,0,0,18


Notes moving forward to try and improve performance:

1. Use a regression model rather than an SVM
2. Do a better job of equalizing how much data we have from low EDSS scores (healthier people) vs high EDSS scores (not as healthy people)
    - Currently, there is significantly more data from the healthier people, and not as much data from the not as healthy people, so all of the test data gets predicted as low EDSS (0). We can either omit a proportional amount of the low EDSS score training data, or we can add a bunch of mean-wise approximated data for high EDSS patients
    - This second approach is not as scalable as the first because we can only add data based on data that we already have, so this approach would really only help us for the binary, low/high edss scores, any not the ultimate classification of individual EDSS score (we would then have a high density of data/scores for the small domain of high EDSS scores that we have recorded)

Things done to address the problems/solutions above (3/22/21):
1. Tried a regression model, worked worse than the SVM. Ended up with a RandomForestClassifier() that has proven to work pretty well, certainly much, much better than the SVM or the regression models (even though still not awesome)
2. While it is not a perfect way of dealing with a disproportionate amount of data per target, I just limited the amount of data that there is in the processed dataset based on the target value. I limited healthier scores (target value 0) to 50 rows of data per subject, and did not limit the amount of data per target value 1 subject

******* FINAL PROJECT WORK HERE *********<br>
First attempt at cleaning the data that Nicole sent most recently with the summary stats

In [81]:
SWAY_1_DATA_FILENAME = "data/summary_stats_data/sway1_stats.csv"
SWAY_2_DATA_FILENAME = "data/summary_stats_data/sway2_stats.csv"
ABC_SCORES_1 = "data/summary_stats_data/Scores_1.csv"
ABC_SCORES_2 = "data/summary_stats_data/Scores_2.csv"
feature_names = [ 'subjectID', 'p5_RANGE', 'p25_RANGE', 'median_RANGE', 'p75_RANGE', 'p95_RANGE', 'p5_F50', 'p25_F50', 'median_F50', 'p75_F50', 'p95_F50', 'p5_F95', 'p25_F95', 'median_F95', 'p75_F95', 'p95_F95']

# so, data is going to be a 2 dimensional list, where the value that is in each index of each 'row' corresponds to the positions of the labels in the list above
# so the p5_RANGE value for subject 0 will be in data[0][0], p25_RANGE value for subject 0 will be in data[0][1], etc.
# there aren't any labels, just gotta trust that they are correct. In my opinion this is easier than using a dictionary or something
# because we can literally just pass this array straight to whatever model we want to use.

####### SESSION 1 DATA COLLECTION SWAY DATA#########
data_1 = []
with open(SWAY_1_DATA_FILENAME, 'r') as file:
        reader = csv.reader(file)
        count = 0
        col_nums = []
        for r in reader:
            if count == 0:
                col_nums = [r.index(n) for n in r if n in feature_names]
                count+=1
            else:
                data_row = [i for i in r if r.index(i) in col_nums]
                s_id = data_row.pop()
                data_row.insert(0, s_id)
                data_1.append(data_row)
data_1.pop()
# next thing to do is to associate the ABC scores with the subject id's
# i'm just going to do that randomly for now because I want to be able to test

# now, with scores, keep track of ABC score per subjectID 
scores_1 = {}
with open(ABC_SCORES_1, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        count = 0
        col_num = -1
        for r in reader:
            scores_1[r[2]] = r[13]

score_affiliated_data_1 = []
x = 0
while x < len(data_1):
    data_1[x].append(scores_1[data_1[x][0]])
    temp = data_1[x][1:]
    score_affiliated_data_1.append(temp)
    x += 1


####### SESSION 2 DATA COLLECTION #########
data_2 = []
with open(SWAY_2_DATA_FILENAME, 'r') as file:
        reader = csv.reader(file)
        count = 0
        col_nums = []
        for r in reader:
            if count == 0:
                col_nums = [r.index(n) for n in r if n in feature_names]
                count+=1
            else:
                data_row = [i for i in r if r.index(i) in col_nums]
                s_id = data_row.pop()
                data_row.insert(0, s_id)
                data_2.append(data_row)
data_2.pop()
# next thing to do is to associate the ABC scores with the subject id's
# i'm just going to do that randomly for now because I want to be able to test

# now, with scores, keep track of ABC score per subjectID 
scores_2 = {}
with open(ABC_SCORES_2, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        count = 0
        col_num = -1
        for r in reader:
            if r[0] == "":
                continue
            else:
                scores_2[r[2]] = r[11]

score_affiliated_data_2 = []
x = 0
while x < len(data_2):
    try:
        data_2[x].append(scores_2[data_2[x][0]])
        temp = data_2[x][1:]
        score_affiliated_data_2.append(temp)
        x += 1
    except:
        x += 1
        continue

# for s in score_affiliated_data:
#     print(s)
                

### LOGISTIC REGRESSION ON SWAY DATA###
x_train = [l[:-1] for l in score_affiliated_data_1]
y_train = [l[-1] for l in score_affiliated_data_1]

x_test = [l[:-1] for l in score_affiliated_data_2]
y_test = [l[-1] for l in score_affiliated_data_2]


#model = linear_model.LogisticRegression(max_iter=300)
#model = linear_model.LinearRegression()
model = SVC(C=6, kernel='linear')

model.fit(x_train, y_train)

y_pred = model.predict(x_test)
tot_err = 0
diffs = {"<5":0, "5<10":0, ">10":0}
for i in range(len(y_test)):
    err = abs(float(y_pred[i])-float(y_test[i]))
    tot_err += err
    if err <= 5:
        diffs["<5"] += 1
    elif err > 10:
        diffs[">10"] += 1
    else:
        diffs["5<10"] += 1
    #print(f"PRED: {y_pred[i]} ACTUAL: {y_test[i]}.....DIF: {err:.2f}")
print(f"AVG DIFF: {tot_err/len(y_test):.2f}")
print(f"Error dispersion: Err < 5: {diffs['<5']}   Err 5-10: {diffs['5<10']}    Err >10: {diffs['>10']}")

AVG DIFF: 6.83
Error dispersion: Err < 5: 13   Err 5-10: 7    Err >10: 7


In [None]:
### MORE REGRESSION MODEL OPTIONS ###


Here, I will create a Neural Network classifier for ABC scores that are low, medium/low, medium, medium/high and high (based pretty much just on 20% intervals)

In [46]:
### DATA CLASSES CONSTRUCTION ###
classes = {"low": 60, "med_low":70, "med":80, "med_high":90, "high":100}

# training set
class_affiliated_data_1 = []
for s in score_affiliated_data_1:
    classed_data = [float(k) for k in s[:15]]
    if float(s[15]) < classes["low"]:
        classed_data.append(0)
    elif float(s[15]) < classes["med_low"]:
        classed_data.append(1)
    elif float(s[15]) < classes["med"]:
        classed_data.append(2)
    elif float(s[15]) < classes["med_high"]:
        classed_data.append(3)
    else:
        classed_data.append(4)
    class_affiliated_data_1.append(classed_data)

# test set
class_affiliated_data_2 = []
for s in score_affiliated_data_2:
    classed_data = [float(p) for p in s[:15]]
    if float(s[15]) < classes["low"]:
        classed_data.append(0)
    elif float(s[15]) < classes["med_low"]:
        classed_data.append(1)
    elif float(s[15]) < classes["med"]:
        classed_data.append(2)
    elif float(s[15]) < classes["med_high"]:
        classed_data.append(3)
    else:
        classed_data.append(4)
    class_affiliated_data_2.append(classed_data)

# for c in class_affiliated_data_1:
#     print(c[15])

x_nn_train = np.array([l[:-1] for l in class_affiliated_data_2])
y_nn_train = [l[-1] for l in class_affiliated_data_2]
y_nn_train_1hot = to_categorical(y_nn_train)

x_nn_test = np.array([l[:-1] for l in class_affiliated_data_2])
y_nn_test = [l[-1] for l in class_affiliated_data_2]
y_nn_test_1hot = to_categorical(y_nn_test)

### NEURAL NETWORK CONSTRUCTION ###
neural = Sequential()
neural.add(layers.Dense(50, activation='relu'))
neural.add(layers.Dense(30, activation='relu'))
neural.add(layers.Dense(40, activation='relu'))
neural.add(layers.Dense(5, activation='softmax'))

neural.compile(optimizer='SGD', loss="categorical_crossentropy", metrics=['acc'])

neural.fit(x_nn_train, y_nn_train_1hot, batch_size=7, epochs=15)

results = neural.predict(x_nn_test)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [47]:
for r in results:
    simplified = [float(f"{h:.2f}") for h in r]
    print(simplified)

[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
[0.19, 0.2, 0.2, 0.19, 0.23]
