In [50]:
%matplotlib notebook
import mne
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.signal import butter, cheby1, filtfilt

from scipy.signal import spectrogram, stft, istft, check_NOLA
import pickle

# IMPORT DATA AND PREPROCESS

In [37]:
# 31, 35, 38 and test folders for good data
# odd scalp on left, even on right

# SPECIFY PATIENT AND SCALP TARGET ELECTRODE
patient = 'UFSEEG031'
targetScalpElectrode = 'F7'
mode = 'Wake'

# SPECIFY ARTIFACT ELECTRODES FROM WORD FILES
artifactElectrodes = {}
artifactElectrodes['UFSEEG031'] = ['LTP7', 'LTP8', 'LAH11', 'LAH12', 'LPH10', 'LPH11', 'LPH12','LOF15', 'LOF16']

#filepath = '/blue/gkalamangalam/ALLDATA/SEEG/%s/SEEG/EDF/TestClipSleep.edf' % patient
filepath = '/blue/gkalamangalam/ALLDATA/SEEG/%s/SEEG/EDF/TestClipAwake/TestClip%sME1.edf' % (patient, mode)

raw = mne.io.read_raw_edf(filepath,preload=True)
sfreq = int(raw.info['sfreq'])
nSeconds = raw.n_times/sfreq

scalpElectrodes = {}
scalpElectrodes[patient] = [i for i in raw.ch_names if len(i) == 2]
print()
print(scalpElectrodes)
print(raw)
print(raw.info)

Extracting EDF parameters from /blue/gkalamangalam/ALLDATA/SEEG/UFSEEG031/SEEG/EDF/TestClipAwake/TestClipWakeME1.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 241023  =      0.000 ...   235.374 secs...

{'UFSEEG031': ['F7', 'F8', 'F3', 'F4', 'C3', 'C4', 'P7', 'P8', 'P3', 'P4', 'PR']}
<RawEDF | TestClipWakeME1.edf, 148 x 241024 (235.4 s), ~272.3 MB, data loaded>
<Info | 7 non-empty values
 bads: []
 ch_names: LTP1, LTP2, LTP3, LTP4, LTP5, LTP6, LTP7, LTP8, LAM1, LAM2, ...
 chs: 148 EEG
 custom_ref_applied: False
 highpass: 0.0 Hz
 lowpass: 512.0 Hz
 meas_date: 2001-01-01 10:26:35 UTC
 nchan: 148
 projs: []
 sfreq: 1024.0 Hz
>


In [39]:
# DISCARD ALL CHANNELS EXCEPT GOOD SEEG CHANNELS AND THE SINGLE SCALP TARGET

channels = [i for i in raw.ch_names if i not in artifactElectrodes[patient] and i[0] in {'L', 'R'}] + [targetScalpElectrode]
raw.pick_channels(channels)#.plot(duration=5.0, n_channels=20);

<RawEDF | TestClipWakeME1.edf, 88 x 241024 (235.4 s), ~161.9 MB, data loaded>

In [40]:
# LOWPASS FILTER THE DATA, SUBSAMPLE THE DATA, SCALE ALL CHANNELS, AND EXTRACT TO NUMPY ARRAY

filterWindow = 64
subsampleFreq = filterWindow * 2   # FINAL FREQUENCY IN HERTZ AFTER SUBSAMPLING
filterOrder = 5

df = raw.to_data_frame().drop(labels=['time'], axis=1)
data = df.to_numpy()

# SHOULD WE SCALE HERE OR AFTER FILTER AND SUBSAMPLE?
scaler = StandardScaler()
data = scaler.fit_transform(data)

b, a = butter(filterOrder, filterWindow, btype='lowpass', fs = sfreq)
data = filtfilt(b, a, data, axis=0)
    
dataSubsampled = data[::sfreq // subsampleFreq,:]

# SEE QUESTION ABOVE...
#scaler = StandardScaler()
#dataSubsampled = scaler.fit_transform(dataSubsampled)

pd.DataFrame(dataSubsampled, columns=df.columns)

Unnamed: 0,LTP1,LTP2,LTP3,LTP4,LTP5,LTP6,LAM1,LAM2,LAM3,LAM4,...,LOF6,LOF7,LOF8,LOF9,LOF10,LOF11,LOF12,LOF13,LOF14,F7
0,1.269132,0.611309,0.332447,0.513127,0.050132,-0.921070,-0.487324,-0.081578,-0.525729,0.334276,...,-0.779624,0.273464,0.991077,0.363811,0.290867,0.214715,0.271042,0.612997,0.733364,0.010991
1,1.132279,0.446085,0.220560,0.418795,-0.012608,-1.192597,-0.458433,0.084668,-0.519432,0.192957,...,-0.953009,0.142477,0.825526,0.181249,0.072055,-0.000309,0.048386,0.446467,0.581508,-0.226039
2,1.519371,0.780325,0.556298,0.783363,0.417908,-0.903523,-0.043603,0.561195,-0.101200,0.637068,...,-0.446855,0.709971,1.354460,0.691233,0.617481,0.572477,0.640061,1.112740,1.193733,0.324991
3,1.045590,0.179590,-0.021626,0.243580,-0.178253,-1.434499,-0.418117,0.010637,-0.648773,-0.043092,...,-1.071674,0.041591,0.706453,0.042263,-0.077453,-0.091645,-0.008984,0.540202,0.554050,-0.423745
4,1.524571,0.708372,0.451041,0.814309,0.455070,-0.785436,-0.112308,0.524834,-0.020276,0.738102,...,-0.249632,0.915618,1.582307,0.894184,0.832920,0.837768,0.884481,1.371679,1.375992,0.487048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30123,1.506721,1.439685,1.480775,1.469246,0.931538,-0.577848,2.334703,1.930697,1.702907,0.847484,...,1.384141,0.812632,0.270104,0.296400,0.236454,0.353329,0.303611,0.715184,0.200309,0.352152
30124,0.785738,0.479184,0.486986,0.662480,-0.095446,-1.590815,1.826070,1.325293,0.950986,-0.261800,...,0.344845,-0.408083,-0.941065,-0.870852,-1.011183,-0.858594,-0.884044,-0.520102,-1.079816,-0.891327
30125,1.091449,0.929328,0.882844,1.134688,0.858337,-0.053454,1.341025,1.312662,1.170604,0.894238,...,1.202606,1.126821,0.736459,0.757362,0.671112,0.587374,0.457989,0.893219,0.473900,0.558070
30126,0.171127,0.138581,0.161984,0.313157,0.326562,-0.123588,0.017988,0.158359,0.107109,0.303561,...,0.312348,0.621445,0.379219,0.399186,0.225524,-0.059595,-0.233437,0.262038,0.069737,-0.136996


In [41]:
# PARTITION TIME SERIES INTO CONTIGUOUS TRAIN AND VALIDATION BLOCKS
# OTHERWISE WHEN VALID SET IS RANDOMLY DISTRIBUTED, IMPLICIT OVERFITTING OCCURS DUE TO TEMPORAL DEPENDENCY
# EACH BLOCK (SPECIFIED IN SECONDS) IS DIVIDED INTO TRAIN (SPECIFIED BY FRACTION, COMES FIRST) AND VALIDIDATION (COMES LAST)

def timeseriesTrainValidSplit(secondsInBlock, totalSeconds, trainFraction, subsampleFreq):
    nBlock = int(totalSeconds / secondsInBlock)
    samplesPerBlock = subsampleFreq * secondsInBlock
    trainIndexProto = np.arange(0, samplesPerBlock * trainFraction, dtype=int)
    validIndexProto = np.arange(samplesPerBlock * trainFraction, samplesPerBlock, dtype=int)

    trainIndexBlocks = [(trainIndexProto + (i * samplesPerBlock)).astype(int) for i in range(nBlock)]
    validIndexBlocks = [(validIndexProto + (i * samplesPerBlock)).astype(int) for i in range(nBlock)]

    trainIndices = np.concatenate(trainIndexBlocks).astype(int)
    validationIndices = np.concatenate(validIndexBlocks).astype(int)
    #return trainIndices, validationIndices
    return trainIndexBlocks, validIndexBlocks, trainIndices, validationIndices

def timeDomainDataMake(indexBlocks, halfWindow, dataSubsampled):
    xList = []
    yList = []
    _, nChannels = dataSubsampled.shape
    for thisBlock in indexBlocks:
        thisData = np.vstack([np.zeros((halfWindow, nChannels)), 
                             dataSubsampled[thisBlock,:], 
                             np.zeros((halfWindow, nChannels))])
        
        for t in range(0, len(thisBlock)):
            thisX = thisData[t:t + (2 * halfWindow) + 1,0:-1].flatten()
            thisY = thisData[t + halfWindow, -1]
            xList.append(thisX)
            yList.append(thisY)

    xTimeDomain = np.stack(xList, axis = 0)
    yTimeDomain = np.expand_dims(np.array(yList), axis=1)
    return xTimeDomain, yTimeDomain

# INDICES FOR TRAIN/VALIDIDATION SPLIT

In [46]:
secondsInBlock = 5
totalSeconds = int(nSeconds)
trainFraction = 1.0

if trainFraction < 1.0:
    trainIndexBlocks, validIndexBlocks, trainIndices, validationIndices = timeseriesTrainValidSplit(secondsInBlock, 
                                                                                                    totalSeconds, 
                                                                                                    trainFraction, 
                                                                                                    subsampleFreq)

# TIME DOMAIN DATA

In [48]:
halfWindowSeconds = .25

halfWindowSamples = int(halfWindowSeconds * subsampleFreq)

if trainFraction < 1.0:
    xTrainTimeDomain, yTrainTimeDomain = timeDomainDataMake(trainIndexBlocks, halfWindowSamples, dataSubsampled)
    xValidTimeDomain, yValidTimeDomain = timeDomainDataMake(validIndexBlocks, halfWindowSamples, dataSubsampled)

else:
    trainIndexBlocks = [np.array(range(0, dataSubsampled.shape[0]))]
    xTrainTimeDomain, yTrainTimeDomain = timeDomainDataMake(trainIndexBlocks, halfWindowSamples, dataSubsampled)
    xValidTimeDomain, yValidTimeDomain = np.array([]), np.array([])
    
print(xTrainTimeDomain.shape, yTrainTimeDomain.shape, xValidTimeDomain.shape, yValidTimeDomain.shape)

(30128, 5655) (30128, 1) (0,) (0,)


In [49]:
arraySavePath = '/blue/gkalamangalam/jmark.ettinger/predictScalp/timeDomain_%s_%s_%s.npz' % (patient, targetScalpElectrode, mode)
np.savez(arraySavePath, 
         xTrainTimeDomain=xTrainTimeDomain, 
         xValidTimeDomain=xValidTimeDomain,
         yTrainTimeDomain=yTrainTimeDomain,
         yValidTimeDomain=yValidTimeDomain)

# STFT DATA

In [None]:
# APPLY SHORT TERM FOURIER TRANSFORM TO THE DATA AND CHECK PARAMETERS FOR INVERTABILITY

secondsInSTFTWindow = .5
nperseg = subsampleFreq * secondsInSTFTWindow
noverlap = nperseg - 1
windowType = ('tukey', .25)

f, t, S = stft(dataSubsampled, fs=subsampleFreq, window=windowType, nperseg=nperseg, noverlap=noverlap, axis=0)

print('freq, ', 'time, ', 'stft shape')
print(f.shape, t.shape, S.shape)
print('inverse ok? ',check_NOLA(windowType, nperseg, noverlap))

In [None]:
x_trainComplex = S[:, 0:-1, trainIndices].transpose([2,0,1])
y_trainComplex = S[:, -1, trainIndices].transpose()

x_validComplex = S[:, 0:-1, validationIndices].transpose([2,0,1])
y_validComplex = S[:, -1, validationIndices].transpose()

# MAKE REAL-VALUED TRAINING DATA BY CONVERTING STFT COMPLEX NUMBERS TO R,THETA
_,_,numCol = x_trainComplex.shape
x_trainRTheta = np.hstack([np.hstack([np.abs(x_trainComplex[:,:,i]), 
                                      np.angle(x_trainComplex[:,:,i])]) for i in range(numCol)])
x_validRTheta = np.hstack([np.hstack([np.abs(x_validComplex[:,:,i]), 
                                      np.angle(x_validComplex[:,:,i])]) for i in range(numCol)])

y_trainRTheta = np.hstack([np.abs(y_trainComplex), np.angle(y_trainComplex)])
y_validRTheta = np.hstack([np.abs(y_validComplex), np.angle(y_validComplex)])

_, nY = y_trainRTheta.shape
x_trainRTheta.shape, x_validRTheta.shape, y_trainRTheta.shape, y_validRTheta.shape

In [None]:
# PLOT THE STFT OF A TIME SERIES (MAGNITUDE ONLY)

index = -1 # -1 is the target
vmax = .2

plt.figure()
plt.pcolormesh(t, f, np.abs(S[:,index,:]), shading='auto', cmap='hot', vmin=0, vmax=vmax)
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [sec]')
plt.title('Index: %s' % str(index))
plt.show()

In [None]:
arraySavePath = '/blue/gkalamangalam/jmark.ettinger/predictScalp/freqRTheta_%s_%s_%s.npz' % (patient, targetScalpElectrode, mode)
np.savez(arraySavePath, 
         x_trainRTheta=x_trainRTheta, 
         x_validRTheta=x_validRTheta, 
         y_trainRTheta=y_trainRTheta, 
         y_validRTheta=y_validRTheta)

# COMBINE TIME AND FREQUENCY DOMAIN DATA

In [None]:
x_trainTimeFreq = np.hstack([xTrainTimeDomain, x_trainRTheta])
y_trainTimeFreq = np.hstack([yTrainTimeDomain, y_trainRTheta])
x_validTimeFreq = np.hstack([xValidTimeDomain, x_validRTheta])
y_validTimeFreq = np.hstack([yValidTimeDomain, y_validRTheta])

In [None]:
x_trainTimeFreq.shape, y_trainTimeFreq.shape, x_validTimeFreq.shape, y_validTimeFreq.shape

In [None]:
arraySavePath = '/blue/gkalamangalam/jmark.ettinger/predictScalp/timeFreqRTheta_%s_%s_%s.npz' % (patient, targetScalpElectrode, mode)
np.savez(arraySavePath, 
         x_trainTimeFreq=x_trainTimeFreq, 
         x_validTimeFreq=x_validTimeFreq, 
         y_trainTimeFreq=y_trainTimeFreq, 
         y_validTimeFreq=y_validTimeFreq)

# SCRATCH

In [None]:
# stft parameter tests

windowType = ('tukey', .25)

fakeData = np.random.rand(10000, 5)

f, t, S = stft(fakeData, fs=1000, window=windowType, nperseg=100, noverlap=0, axis=0, boundary=None)

f.shape, t.shape, S.shape, f, t

In [None]:
# OLD VERSION FOR TIME DOMAIN

xTrainTimeDomain = dataSubsampled[trainIndices, 0:-1]
yTrainTimeDomain = np.expand_dims(dataSubsampled[trainIndices, -1], axis=1)

xValidTimeDomain = dataSubsampled[validationIndices, 0:-1]
yValidTimeDomain = np.expand_dims(dataSubsampled[validationIndices, -1], axis=1)

xTrainTimeDomain.shape, yTrainTimeDomain.shape, xValidTimeDomain.shape, yValidTimeDomain.shape