In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pathlib
import pickle
from scipy.stats import skew, kurtosis, pearsonr
from scipy.signal import butter, welch, filtfilt, resample
import copy
import time
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from PreprocessFcns import *

%matplotlib inline

In [3]:
# Set path to folder containing Subject Records
path = r'//FS2.smpp.local\\RTO\\CIS-PD Study\MJFF Curation\Finalized Dataset'
# Set path to Destination Folder
dest = r'//FS2.smpp.local\\RTO\\CIS-PD Study\Patient Record Correlation'
#---------------------------------------------------------------------------------------------------------

In [4]:
# Medication Reports
table_med = 'Table10.csv'
dest_ext_med = 'Medication Reports'
file_name_med = 'med_timepoints.csv'

# Symptom Reports
table_symt = 'Table11.csv'
dest_ext_symt = 'Symptom Reports'
file_name_symt = 'symt_timepoints.csv'

# Diaries
table_diar = 'Table12.csv'
dest_ext_diar = 'Diaries'
file_name_diar = 'diar_timepoints.csv'

In [5]:
def getTimestampsPaths(timepoints_med, timepoints_symt, timepoints_diar):
    
    # Initialize Empty Lists for Each Necessary Piece of Information from Each Record
    StartTimestamps = []
    EndTimestamps = []
    SaveFilePaths = []
    
    
    StartTimes = timepoints_med.apply(lambda row: row.Timestamp + pd.Timedelta(unit = 'minute', value = -30), axis = 1)
    EndTimes = timepoints_med.apply(lambda row: row.Timestamp + pd.Timedelta(unit = 'minute', value = 30), axis = 1)
    SavePaths = timepoints_med.apply(lambda row: os.path.join(dest, dest_ext_med, str(row.SubjID), 
                                                                                  str(row.Timestamp)[:7], 
                                                                                  str(row.Timestamp)[8:10],
                                                                                  str(row.Timestamp)[11:13] +
                                                                                  str(row.Timestamp)[14:16] +
                                                                                  str(row.Timestamp)[17:] + '.csv'), axis = 1)
    StartTimestamps = StartTimestamps + list(StartTimes)
    EndTimestamps = EndTimestamps + list(EndTimes)
    SaveFilePaths = SaveFilePaths + list(SavePaths)
    
    StartTimes = timepoints_symt.apply(lambda row: row.Timestamp + pd.Timedelta(unit = 'minute', value = -30), axis = 1)
    EndTimes = timepoints_symt.apply(lambda row: row.Timestamp, axis = 1)
    SavePaths = timepoints_symt.apply(lambda row: os.path.join(dest, dest_ext_symt, str(row.SubjID), 
                                                                                   str(row.Timestamp)[:7], 
                                                                                   str(row.Timestamp)[8:10],
                                                                                   str(row.Timestamp)[11:13] +
                                                                                   str(row.Timestamp)[14:16] +
                                                                                   str(row.Timestamp)[17:] + '.csv'),axis = 1)
    StartTimestamps = StartTimestamps + list(StartTimes)
    EndTimestamps = EndTimestamps + list(EndTimes)
    SaveFilePaths = SaveFilePaths + list(SavePaths)
    
    StartTimes = timepoints_diar.apply(lambda row: row.Timestamp + pd.Timedelta(unit = 'minute', value = -30), axis = 1)
    EndTimes = timepoints_diar.apply(lambda row: row.Timestamp, axis = 1)
    SavePaths = timepoints_diar.apply(lambda row: os.path.join(dest, dest_ext_diar, str(row.SubjID), 
                                                                                   str(row.Timestamp)[:7], 
                                                                                   str(row.Timestamp)[8:10],
                                                                                   str(row.Timestamp)[11:13] +
                                                                                   str(row.Timestamp)[14:16] +
                                                                                   str(row.Timestamp)[17:] + '.csv'),axis = 1)
    StartTimestamps = StartTimestamps + list(StartTimes)
    EndTimestamps = EndTimestamps + list(EndTimes)
    SaveFilePaths = SaveFilePaths + list(SavePaths)
    
    return StartTimestamps, EndTimestamps, SaveFilePaths

In [6]:
# Load Consolidated DataFrames
timepoints_med = pd.read_csv(os.path.join(dest, dest_ext_med, file_name_med), parse_dates = [1])
timepoints_symt = pd.read_csv(os.path.join(dest, dest_ext_symt, file_name_symt), parse_dates = [1])
timepoints_diar = pd.read_csv(os.path.join(dest, dest_ext_diar, file_name_diar), parse_dates = [1])

# Create List of the Watch Data Files in the Table8 Directory (not including the subdirectory)
watch_dir = [f for f in os.listdir(os.path.join(path, 'Table8')) if os.path.isfile(os.path.join(path, 'Table8', f))]

StartTimestamps, EndTimestamps, SaveFilePaths = getTimestampsPaths(timepoints_med, timepoints_symt, timepoints_diar)

In [208]:
# Initialize Template Empty Data Frame From Which to Create Files
empty_df = pd.DataFrame(columns = ['SubjID', 'Timestamp', 'X', 'Y', 'Z'])

# Generate Empty Files Organized in Subdirectories to Append Time Interval Watch Data
# 'if statements' Required Because Record Type Paths Have Different Character Numbers
for file in SaveFilePaths:
    
    if file[:81] == '//FS2.smpp.local\\\\RTO\\\\CIS-PD Study\\Patient Record Correlation\\Medication Reports':
        try:
            empty_df.to_csv(file, index = False)
        except(FileNotFoundError):
            try:
                os.mkdir(file[:97])
                empty_df.to_csv(file, index = False)
            except(FileNotFoundError):
                try:
                    os.mkdir(file[:94])
                    os.mkdir(file[:97])
                    empty_df.to_csv(file, index = False)
                except(FileNotFoundError):
                    os.mkdir(file[:86])
                    os.mkdir(file[:94])
                    os.mkdir(file[:97])
                    empty_df.to_csv(file, index = False)

    if file[:78] == '//FS2.smpp.local\\\\RTO\\\\CIS-PD Study\\Patient Record Correlation\\Symptom Reports':
        try:
            empty_df.to_csv(file, index = False)
        except(FileNotFoundError):
            try:
                os.mkdir(file[:94])
                empty_df.to_csv(file, index = False)
            except(FileNotFoundError):
                try:
                    os.mkdir(file[:91])
                    os.mkdir(file[:94])
                    empty_df.to_csv(file, index = False)
                except(FileNotFoundError):
                    os.mkdir(file[:83])
                    os.mkdir(file[:91])
                    os.mkdir(file[:94])
                    empty_df.to_csv(file, index = False)

    if file[:70] == '//FS2.smpp.local\\\\RTO\\\\CIS-PD Study\\Patient Record Correlation\\Diaries':
        try:
            empty_df.to_csv(file, index = False)
        except(FileNotFoundError):
            try:
                os.mkdir(file[:86])
                empty_df.to_csv(file, index = False)
            except(FileNotFoundError):
                try:
                    os.mkdir(file[:83])
                    os.mkdir(file[:86])
                    empty_df.to_csv(file, index = False)
                except(FileNotFoundError):
                    os.mkdir(file[:75])
                    os.mkdir(file[:83])
                    os.mkdir(file[:86])
                    empty_df.to_csv(file, index = False)

KeyboardInterrupt: 

In [210]:
# Start Time of Reading One Watch Data File
s_time = time.time()

for file in watch_dir:
    
    # Get the Subject and Month Corresponding to the Watch Data in the File
    sub = file[7:11]
    mon = file[12:19]
    
    # Only Consider File Paths Corresponding to the Relevant Watch File Subject and Month
    # Look at Only Path Characters to 86 for Subject so Date/Time is not Considered
    tempStart = [StartTimestamps[i] for i in range(len(StartTimestamps)) 
                 if sub in SaveFilePaths[i][:86] and mon in SaveFilePaths[i]]
    tempEnd = [EndTimestamps[i] for i in range(len(EndTimestamps)) 
               if sub in SaveFilePaths[i][:86] and mon in SaveFilePaths[i]]
    tempPath = [s for s in SaveFilePaths if sub in s[:86] and mon in s]
    
    # Read Watch Acc File in Chunks
    watch_file_chunk = pd.read_csv(os.path.join(path, 'Table8', file), chunksize = 100000)
    
    chunk_time = time.time()
    
    # Look at One Chunk at a Time
    for chunk in watch_file_chunk:
        
        # Get the Minimum and Maximum DataPoint Timestamp of Each Chunk
        minTime, maxTime = pd.Timestamp(chunk.Timestamp.min()), pd.Timestamp(chunk.Timestamp.max())
        
        # Iterate through each Record
        for s, e, p in zip(tempStart, tempEnd, tempPath):
            
            # Skip the Record if the Chunk does not Contain the Record Time Range
            if s > maxTime or e < minTime:
                continue
                
            # Get the Indices of the DataPoints in the Chunk that are Within the Record Time Range
            indices = (chunk.Timestamp.apply(pd.Timestamp) > s) & (chunk.Timestamp.apply(pd.Timestamp) < e)
            
            # Save the Relevant Data Points to the Pre-Specified Record Path
            chunk[indices].to_csv(path_or_buf = p, mode = 'a', index = False, header = False)
            
        # Track the Time to Iterate Through One Chunk
        print(str(int(((time.time() - chunk_time) / 60) / 60)) + ' hours ' + 
              str(int(((time.time() - chunk_time) / 60) % 60)) + ' minutes ' + 
              str(int((time.time() - chunk_time) % 60)) + ' seconds')
        chunk_time = time.time()
    
    # First File in the Table8 Directory Size = 1,665,829 KB
    break

# Print Time to Read One Watch Data File
print(str(int(((time.time() - s_time) / 60) / 60)) + ' hours ' + 
      str(int(((time.time() - s_time) / 60) % 60)) + ' minutes ' + 
      str(int((time.time() - s_time) % 60)) + ' seconds')

0 hours 0 minutes 0 seconds
0 hours 0 minutes 13 seconds
0 hours 0 minutes 13 seconds
0 hours 0 minutes 13 seconds
0 hours 0 minutes 13 seconds
0 hours 0 minutes 13 seconds
0 hours 0 minutes 13 seconds
0 hours 0 minutes 12 seconds
0 hours 0 minutes 11 seconds
0 hours 0 minutes 41 seconds
0 hours 0 minutes 41 seconds
0 hours 0 minutes 41 seconds
0 hours 0 minutes 41 seconds
0 hours 0 minutes 40 seconds


KeyboardInterrupt: 

In [22]:
# TEST CELL
SaveFilePaths[-1][71:75]
# IF THE LOOP DOESN'T RUN - MAKE SURE THE FUNCTION READS row.Timestamp AS A TIMESTAMP AND NOT A STRING

'1043'

Example Paths for Each Record Type
'//FS2.smpp.local\\\\RTO\\\\CIS-PD Study\\Patient Record Correlation\\Medication Reports\\1004\\2017-06\\21\\142217.csv'
'//FS2.smpp.local\\\\RTO\\\\CIS-PD Study\\Patient Record Correlation\\Diaries\\1043\\2017-08\\15\\120000.csv'
'//FS2.smpp.local\\\\RTO\\\\CIS-PD Study\\Patient Record Correlation\\Symptom Reports\\1013\\2017-09\\06\\163350.csv'