In [None]:
# Robert Minneker
# CSE 599 G1
# This notebook cleans the UCI dataset for input to the neural networks
# created in this project: it renders each recording file as an image and
# then splits the images into train/val/test folders.

In [12]:
# imports and constants
import os
import matplotlib.pyplot as plt
%matplotlib inline
# Root folder of the raw UCI recordings; the 'control' and 'parkinson'
# sub-folders read by the cells below are resolved relative to it.
path = 'data/uci/hw_dataset'

In [3]:
def plot_and_save(data_dir, output_dir, filename):
    """Draw the leading (x, y) samples of one recording and save them as an image.

    The file is read line by line. Every line is expected to hold seven
    ';'-separated fields: x, y, four fields that are ignored here, and a
    final integer field (presumably the test id -- confirm against the
    dataset description). Points are collected until the first line whose
    final field equals 1; that line and everything after it are not plotted.

    Args:
        data_dir: Directory that contains ``filename``.
        output_dir: Directory the image is written to. It is created if it
            does not exist yet.
        filename: Name of the recording file inside ``data_dir``. The image
            is named after it, with the last extension replaced by ``.png``.

    Raises:
        ValueError: If a non-blank line does not have exactly seven fields,
            or if x, y or the final field is not an integer.
        OSError: If the input file cannot be read or the image cannot be
            written.
    """
    x_vec = []
    y_vec = []
    with open(os.path.join(data_dir, filename)) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                # instead of failing on the 7-way unpack below.
                continue
            x, y, _, _, _, _, test_type = line.split(';')

            if int(test_type) == 1:  # first row of the next section: stop reading
                break
            x_vec.append(int(x))
            y_vec.append(int(y))

    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # splitext drops only the last extension, so names that contain extra
    # dots stay distinct. The '.png' suffix is spelled out because savefig
    # would otherwise read whatever follows the last dot as the image format.
    stem = os.path.splitext(filename)[0]

    # Use a dedicated figure and close it afterwards so that repeated calls
    # neither share pyplot state nor leave an empty figure behind.
    fig, ax = plt.subplots()
    try:
        ax.plot(x_vec, y_vec)
        ax.axis('off')
        fig.savefig(os.path.join(output_dir, stem + '.png'), bbox_inches='tight')
    finally:
        plt.close(fig)

**Training data prep**

In [4]:
# process the control data
# NOTE(review): the split cell further down reads 'data/all/control/*', not
# 'data/train/control' -- confirm how the images are meant to get there.
curr_dir = path + '/control'
output_dir = 'data/train' + '/control'
os.makedirs(output_dir, exist_ok=True)  # the image folder may not exist on a fresh checkout
for filename in sorted(os.listdir(curr_dir)):  # sorted: stable processing order
    # os.listdir also returns sub-directories and hidden entries
    # (e.g. .ipynb_checkpoints, .DS_Store), which are not recordings.
    if filename.startswith('.') or not os.path.isfile(os.path.join(curr_dir, filename)):
        continue
    plot_and_save(curr_dir, output_dir, filename)

<Figure size 432x288 with 0 Axes>

In [5]:
# process the parkinson's data
# NOTE(review): the split cell further down reads 'data/all/pwp/*', not
# 'data/train/pwp' -- confirm how the images are meant to get there.
curr_dir = path + '/parkinson'
output_dir = 'data/train' + '/pwp'
os.makedirs(output_dir, exist_ok=True)  # the image folder may not exist on a fresh checkout
for filename in sorted(os.listdir(curr_dir)):  # sorted: stable processing order
    # os.listdir also returns sub-directories and hidden entries
    # (e.g. .ipynb_checkpoints, .DS_Store), which are not recordings.
    if filename.startswith('.') or not os.path.isfile(os.path.join(curr_dir, filename)):
        continue
    plot_and_save(curr_dir, output_dir, filename)

<Figure size 432x288 with 0 Axes>

In [27]:
# randomly split data into training/test/val data
# will use a 70/15/15 train/test/val split
import glob
import random
# Knobs for the split. The seed value itself is arbitrary; fixing it makes
# the split reproducible across kernel restarts.
SPLIT_SEED = 599
HOLDOUT_FRACTION = 0.15  # share of each class that goes to val and, separately, to test


def split_val_test_train(paths, holdout_fraction=HOLDOUT_FRACTION, seed=SPLIT_SEED):
    """Shuffle ``paths`` reproducibly and cut them into (val, test, train).

    The input is sorted before shuffling, so the result depends only on the
    set of paths and the seed, not on the order in which they were listed.
    The input list itself is not modified.

    Args:
        paths: Iterable of file paths (any sortable items work).
        holdout_fraction: Fraction of the items assigned to val and, again,
            to test; the count is truncated towards zero.
        seed: Seed for the private random generator used for shuffling.

    Returns:
        A ``(val, test, train)`` tuple of lists. ``val`` and ``test`` each
        hold ``int(holdout_fraction * len(paths))`` items; ``train`` holds
        the rest.
    """
    shuffled = sorted(paths)
    # A private generator keeps the global `random` state untouched.
    random.Random(seed).shuffle(shuffled)
    num_holdout = int(holdout_fraction * len(shuffled))
    return (
        shuffled[:num_holdout],
        shuffled[num_holdout:2 * num_holdout],
        shuffled[2 * num_holdout:],
    )


# NOTE: the cells below move the files out of data/all, so re-running this
# cell after them yields empty lists.
control = sorted(glob.glob('data/all' + '/control/*'))
pwp = sorted(glob.glob('data/all' + '/pwp/*'))

# size of the val split (and of the test split) for each class
num_control_test_val_samps = int(HOLDOUT_FRACTION * len(control))
num_pwp_test_val_samps = int(HOLDOUT_FRACTION * len(pwp))

# validation / test / train split for each class
val_control, test_control, train_control = split_val_test_train(control)
val_pwp, test_pwp, train_pwp = split_val_test_train(pwp)

In [28]:
# populate val files
val_control_dst = 'data/val/control/'
val_pwp_dst = 'data/val/pwp/'

for src_paths, dst_dir in ((val_control, val_control_dst), (val_pwp, val_pwp_dst)):
    os.makedirs(dst_dir, exist_ok=True)  # os.rename does not create the target folder
    for src in src_paths:
        # `src` is already the full path returned by glob; basename (rather
        # than splitting on '/') also works when glob uses '\\' separators.
        os.rename(src, os.path.join(dst_dir, os.path.basename(src)))

In [29]:
# populate test files
test_control_dst = 'data/test/control/'
test_pwp_dst = 'data/test/pwp/'

for src_paths, dst_dir in ((test_control, test_control_dst), (test_pwp, test_pwp_dst)):
    os.makedirs(dst_dir, exist_ok=True)  # os.rename does not create the target folder
    for src in src_paths:
        # `src` is already the full path returned by glob; basename (rather
        # than splitting on '/') also works when glob uses '\\' separators.
        os.rename(src, os.path.join(dst_dir, os.path.basename(src)))

In [30]:
# populate train files
train_control_dst = 'data/train/control/'
train_pwp_dst = 'data/train/pwp/'

for src_paths, dst_dir in ((train_control, train_control_dst), (train_pwp, train_pwp_dst)):
    os.makedirs(dst_dir, exist_ok=True)  # os.rename does not create the target folder
    for src in src_paths:
        # `src` is already the full path returned by glob; basename (rather
        # than splitting on '/') also works when glob uses '\\' separators.
        os.rename(src, os.path.join(dst_dir, os.path.basename(src)))