In [1]:
%matplotlib widget
import numpy as np
import cuml
from cuml import KMeans
from cuml.cluster import KMeans
import cudf
import sys
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import pytz
# Time zone used when timestamping the filenames of saved models and predictions.
timeZone = pytz.timezone('America/Los_Angeles')

from myUtils import *

In [2]:
def _centroidsOnHost(kmeansModel):
    """Return the cluster centers of ``kmeansModel`` as a NumPy array."""
    centers = kmeansModel.cluster_centers_
    if isinstance(centers, np.ndarray):
        return centers
    # NOTE(review): depending on its output-type setting the model may hand
    # back a device-side or DataFrame-like container -- confirm which one
    # the installed cuML version returns here.
    if hasattr(centers, 'to_numpy'):
        return centers.to_numpy()
    if hasattr(centers, 'get'):
        return centers.get()
    return np.asarray(centers)


def reconstruct(toPredict, predictor, kmeansModel, nStack):
    """Rebuild a signal by quantizing stacked residuals with a k-means codebook.

    The signal is processed in consecutive groups of ``nStack`` samples,
    starting at sample 1 (sample 0 is copied through as the seed). For each
    group, the residual against the last reconstructed sample is assigned to
    a cluster via ``kmeansModel.predict``; the samples of the group are then
    rebuilt as the original sample preceding the group plus the matching
    ``nChannel``-wide slice of the selected centroid.

    Parameters
    ----------
    toPredict : np.ndarray
        2-D array indexed as (channel, sample) holding the original signal.
    predictor : np.ndarray
        Unused; kept so existing call sites keep working.
    kmeansModel : object
        Model exposing ``predict`` and ``cluster_centers_``; each centroid
        must have ``nStack * nChannel`` entries laid out sample-major
        (all channels of the first sample, then the second, ...).
    nStack : int
        Number of consecutive samples covered by one centroid (>= 1).

    Returns
    -------
    tuple
        ``(predicted, kmeansModel)``: the reconstructed array (same shape and
        dtype as ``toPredict``) and the model that was passed in. Trailing
        samples that do not fill a complete group are left at zero.

    Raises
    ------
    ValueError
        If ``nStack`` is smaller than 1 (the loop could never advance).
    """
    if nStack < 1:
        raise ValueError("nStack must be a positive integer")

    # Everything is derived from the arguments rather than notebook globals,
    # so the result does not depend on which cells happened to run before.
    nChannel, nSamples = toPredict.shape
    centroids = _centroidsOnHost(kmeansModel)

    predicted = np.zeros_like(toPredict)
    if nSamples == 0:
        return predicted, kmeansModel

    # Seed the reconstruction with the first sample; the first residual is
    # computed against predicted[:, 0].
    predicted[:, 0] = toPredict[:, 0]

    counter = 1
    while counter + nStack <= nSamples:
        if counter % 100000 == 0:
            myPrint("predicting sample: " + str(counter))

        # Column-major flatten: all channels of sample t, then sample t+1, ...
        thisToPredict = toPredict[:, counter:counter + nStack].flatten(order='F')
        # np.tile keeps the same sample-major layout as the flatten above
        # (np.repeat would interleave the channels instead).
        thisPredictor = np.tile(predicted[:, counter - 1], nStack)
        thisResidual = thisToPredict - thisPredictor

        index = int(kmeansModel.predict(np.expand_dims(thisResidual, 0))[0])
        thisCentroid = centroids[index, :]

        # NOTE(review): the residual above is taken against the *reconstructed*
        # previous sample, while the centroid is added to the *original*
        # previous sample -- confirm this asymmetry is intended.
        baseSample = toPredict[:, counter - 1]
        for j in range(nStack):
            predicted[:, counter + j] = baseSample + thisCentroid[j * nChannel:(j + 1) * nChannel]
        counter += nStack

    return predicted, kmeansModel


def prepareResiduals(nStack, toPredict, predictor):
    """Build the matrix of stacked residuals used to fit the k-means model.

    The samples are walked in steps of ``nStack``. For each start index ``i``
    the next ``nStack`` samples of ``toPredict`` (``i + 1 .. i + nStack``) are
    flattened sample-major and ``predictor[:, i]`` is subtracted from every
    one of them.

    Parameters
    ----------
    nStack : int
        Number of consecutive samples per residual vector (>= 1).
    toPredict : np.ndarray
        2-D array indexed as (channel, sample) with the values to predict.
    predictor : np.ndarray
        2-D array indexed as (channel, sample); column ``i`` is the predictor
        for the group that starts at sample ``i + 1``.

    Returns
    -------
    np.ndarray
        Float array of shape ``((nSample - 1) // nStack, nStack * nChannel)``,
        one residual vector per row.

    Raises
    ------
    ValueError
        If ``nStack`` is smaller than 1.
    """
    if nStack < 1:
        raise ValueError("nStack must be a positive integer")

    # Take the channel count from the input instead of a notebook global.
    nChannel, nSample = toPredict.shape

    # A group starting at i needs samples i+1 .. i+nStack to exist, which
    # leaves exactly (nSample - 1) // nStack complete groups.
    nRows = max((nSample - 1) // nStack, 0)
    residualStack = np.zeros((nRows, nStack * nChannel))

    for row in range(nRows):
        i = row * nStack
        # np.tile matches the sample-major layout of the column-major flatten
        # below (np.repeat would interleave the channels instead).
        thisPredictor = np.tile(predictor[:, i], nStack)
        thisToPredict = toPredict[:, i + 1:i + 1 + nStack].flatten(order='F')
        residualStack[row, :] = thisToPredict - thisPredictor

    return residualStack

# Load original and predicted data

In [3]:
# Read the archive and pull out its first positional array.
processedData = np.load('/blue/gkalamangalam/jmark.ettinger/eegCompress/processedData/origAndPredictedSVD001_block7.npz')
data = processedData['arr_0']

# Rows are channels, columns are samples.
nChannel, nSample = data.shape
print((nChannel, nSample))

(19, 1100367)


# Prepare stacked residuals

In [None]:
# Number of consecutive samples grouped into one residual vector.
nStack = 1

# The data is passed as both the target and the predictor.
residualStack = prepareResiduals(nStack=nStack, toPredict=data, predictor=data)
print(residualStack.shape)

# Fit kmeans model

In [None]:
%%time
kmeansInputData = residualStack
n_clusters = 2**16
n_init = 1

# Reuse a `kmeans` model left in the kernel by an earlier cell run when there
# is one; otherwise create it. Only a missing name triggers the rebuild, so
# real fit errors and a KeyboardInterrupt during a long fit surface instead
# of silently starting a second fit.
try:
    kmeans
except NameError:
    myPrint("Initializing kmeans model...")
    kmeans = KMeans(n_clusters=n_clusters, verbose=6, n_init=n_init)

kmeans.fit(kmeansInputData)

# Keep the fit results in notebook-level names for the cells below.
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
myPrint(kmeans.n_iter_)
myPrint(centroids.shape)
myPrint(kmeans.inertia_)

# Save KMeans model

In [None]:
# Store the current `centroids` and `labels` arrays in one .npz archive.
# They are passed positionally, so NumPy saves them under the keys
# 'arr_0' (centroids) and 'arr_1' (labels).
path = '/blue/gkalamangalam/jmark.ettinger/eegCompress/processedData/kmeansModels/kmeansModel_001_block7_1stack.npz'
np.savez(path, centroids, labels)

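# Load KMeans model (loading from disk is currently commented out; the model is initialized from the first differences of the data instead)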

In [7]:
path = '/blue/gkalamangalam/jmark.ettinger/eegCompress/processedData/kmeansModels/kmeansModel_001_block7_1stack.npz'
# Restoring previously saved centroids/labels from `path` is disabled for now:
#npzfile = np.load(path)
#centroids = npzfile['arr_0']
#labels = npzfile['arr_1']

# Use the first `n_clusters` sample-to-sample differences of the data as the
# initial centroids, one centroid per row.
n_clusters = 2 ** 16
centroids = np.diff(data)[:, :n_clusters].T

# The model is sized to the number of centroids actually obtained.
nCentroids, _ = centroids.shape
kmeans = KMeans(n_clusters=nCentroids, init=centroids, n_init=1)

# Reconstruct the data from stacked residuals and centroids

In [8]:
# Starting value that the reconstruction loop passes to reconstruct() as its
# `predictor` argument. This binds a second name to the same array as `data`;
# no copy is made.
predicted = data

In [None]:
%%time
iterations = 5
nStack = 1

# Alternate between reconstructing the signal with the current codebook and
# refitting the codebook on the resulting residuals; checkpoint every pass.
for i in range(iterations):
    myPrint("reconstruct iteration: " + str(i))
    predicted, kmeans = reconstruct(data, predicted, kmeans, nStack)

    # Mean and maximum absolute reconstruction error (computed once).
    absError = np.abs(data - predicted)
    myPrint(str((np.mean(absError), np.max(absError))))

    # One residual per row: each sample minus the previous reconstructed sample.
    residuals = (data[:,1:] - predicted[:,0:-1]).transpose()
    if i == 0:
        # Row slice, not a single-row index: fit() needs a 2-D (rows, features)
        # array, and an integer index here would collapse it to one 1-D row.
        # NOTE(review): assumes the intent is to skip the first n_clusters
        # rows on the first pass -- confirm.
        residuals = residuals[n_clusters:, :]

    kmeans.fit(residuals)

    # save kmeans model
    directory = '/blue/gkalamangalam/jmark.ettinger/eegCompress/processedData/kmeansModels/'
    time = str(datetime.datetime.now().astimezone(timeZone).strftime('%m-%d %H:%M'))
    filename = 'kmeansModel_stack' + str(nStack) + '_' +  time + '.npz'
    path = directory + filename
    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_
    np.savez(path, centroids, labels)

    # save reconstructed data
    filename = 'predicted_' +  time + '.npz'
    path = directory + filename
    np.savez(path, predicted)
    

04-12 13:14: reconstruct iteration: 0
04-12 13:17: predicting sample: 100000
04-12 13:19: predicting sample: 200000
04-12 13:22: predicting sample: 300000
04-12 13:24: predicting sample: 400000
04-12 13:27: predicting sample: 500000
04-12 13:30: predicting sample: 600000
04-12 13:32: predicting sample: 700000


In [None]:
# (mean, max) of the absolute difference between the original and predicted arrays.
tuple(stat(np.abs(data - predicted)) for stat in (np.mean, np.max))

 current best: (0.044626124, 0.6939485)

In [None]:
# Channel (row of `data` / `predicted`) to display.
channel = 0

# Overlay the reconstructed and original traces for the selected channel.
fig, ax = plt.subplots()
ax.plot(predicted[channel,:], label='predicted')
ax.plot(data[channel,:], label='original')
residual = data - predicted
#ax.plot(residual[channel,:])
ax.set_title('Channel ' + str(channel) + ': original vs. predicted')
ax.set_xlabel('sample index')
ax.set_ylabel('signal value')
ax.legend()
plt.show()

# Save original and predicted

In [None]:
# Save the first `failureIndex` samples of the original and predicted arrays
# to one compressed .npz archive. They are passed positionally, so NumPy
# saves them under the keys 'arr_0' (original) and 'arr_1' (predicted).
# NOTE(review): `failureIndex` is not assigned in any cell visible above, so
# this cell raises NameError on a fresh kernel unless it is defined elsewhere
# -- confirm where it should come from.
path = '/blue/gkalamangalam/jmark.ettinger/eegCompress/processedData/origAndPredictedLossy.npz'
dataToSaveList = [data[:,0:failureIndex], predicted[:, 0:failureIndex]]
np.savez_compressed(path, *dataToSaveList)

# Scratch