In [1]:
baseDir = "/Users/apple/Documents/MATLAB/EE 675/Willett Data/tuning-tasks-all/"
import scipy.io

fiftyWordDat = scipy.io.loadmat(baseDir+'tuningTasks/t12.2022.05.03_fiftyWordSet.mat')

In [2]:
import numpy as np

fiftyWordDat['feat'] = np.concatenate([fiftyWordDat['tx2'][:,:32].astype(np.float32), fiftyWordDat['tx2'][:,96:128].astype(np.float32), fiftyWordDat['spikePow'][:,:32].astype(np.float32), fiftyWordDat['spikePow'][:,96:128].astype(np.float32)], axis=1)
fiftyWordDat['feat'] = np.sqrt(fiftyWordDat['feat'])

In [3]:
# make an array containing the data

fiftyWordSubset = []
labels = []

for cue in fiftyWordDat['cueList'][0]:
    cueIdx = np.where(fiftyWordDat['cueList'] == cue)[1]
    cueTrials = np.where(fiftyWordDat['trialCues'] == cueIdx)[0]
    cueTrialEpochs = [fiftyWordDat['goTrialEpochs'][trialNum] for trialNum in cueTrials]
    cueTrialBins = [fiftyWordDat['feat'][epoch[1] - 50:epoch[1]] for epoch in cueTrialEpochs] # the last 50 bins were found to be the most informative

    fiftyWordSubset.append(cueTrialBins)
    labels.append(np.ones(20)*cueIdx)

fiftyWordSubset = np.concatenate(fiftyWordSubset[1:], axis=0) # (1000 trials, 50 bins, 128 channels) array
labels = np.concatenate(labels[:50]) # (1000,) array

In [4]:
LATENT_DIM = 16

In [5]:
import numpy.ma as ma
from pykalman import KalmanFilter
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import FactorAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [6]:
X_with_nans = []
for trial in fiftyWordSubset:
    X_with_nans.append(trial)
    X_with_nans.append(np.full((1, 128), np.nan))

X_stacked = np.vstack(X_with_nans)[:-1]
X_masked = ma.masked_invalid(X_stacked)

# factor analysis
fa = FactorAnalysis(n_components=LATENT_DIM)
fa.fit(fiftyWordSubset.reshape(-1, 128))
C_init = fa.components_.T
R_init = np.diag(fa.noise_variance_)

# global 'meta' KF
kf_global = KalmanFilter(
    n_dim_state=LATENT_DIM,
    n_dim_obs=128,
    observation_matrices=C_init,
    observation_covariance=R_init,
    em_vars=['transition_matrices', 'transition_covariance', 
             'initial_state_mean', 'initial_state_covariance']
)
kf_global = kf_global.em(X_masked, n_iter=1) # 10 iters is enough

print("Global Model Trained. Extracting Features...")
# Make sure you run this on CARC!!! takes 40+ minutes with 10/12 latent dimensions, takes 303 minutes with 16 dims on your local machine

KeyboardInterrupt: 

In [7]:
# feature extraction step
def extract_features(data, kf_model):
    features = []
    for trial in data:
        (smoothed, _) = kf_model.smooth(trial)
        
        # split word into windows -- could be adapted in the future to work on live data
        n_windows = 5
        window_size = 50 // n_windows
        
        trial_feats = []
        for w in range(n_windows):
            # Mean of latent state in this window
            window_mean = np.mean(smoothed[w*window_size : (w+1)*window_size], axis=0)
            trial_feats.append(window_mean)
            
        features.append(np.concatenate(trial_feats))
        
    return np.array(features)

# extract features
X_latent = extract_features(fiftyWordSubset, kf_global)
print(f"Feature Matrix Shape: {X_latent.shape}") # (1000, 5*latent_dim_size)

Feature Matrix Shape: (1000, 80)


In [None]:
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42) # 5 folds -- gets slightly better acc at 10
lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')

accuracies = []

# Ensure y is 1D
y = labels.ravel().astype(int)

for fold, (train_idx, test_idx) in enumerate(skf.split(X_latent, y)):

    X_train, X_test = X_latent[train_idx], X_latent[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    lda.fit(X_train, y_train)
    
    pred = lda.predict(X_test)
    acc = accuracy_score(y_test, pred)
    accuracies.append(acc)
    
    print(f"Fold {fold+1} Accuracy: {acc*100:.1f}%")

print("\nChance Level: 2%")
print(f"\nAverage Accuracy: {np.mean(accuracies)*100:.1f}%")

Fold 1 Accuracy: 48.0%
Fold 2 Accuracy: 44.0%
Fold 3 Accuracy: 39.5%
Fold 4 Accuracy: 43.0%
Fold 5 Accuracy: 41.5%

Chance Level: 2%

Average Accuracy: 43.2%


In [49]:
# collect the bins for yes and no trials

yesIdx = np.where(fiftyWordDat['cueList'] == 'yes')[1][0]
noIdx = np.where(fiftyWordDat['cueList'] == 'no')[1][0]

noTrials = np.where(fiftyWordDat['trialCues'] == noIdx)[0]
yesTrials = np.where(fiftyWordDat['trialCues'] == yesIdx)[0]

noTrialEpochs = [fiftyWordDat['goTrialEpochs'][trialNum] for trialNum in noTrials]
yesTrialEpochs = [fiftyWordDat['goTrialEpochs'][trialNum] for trialNum in yesTrials]

noTrialBins = [fiftyWordDat['feat'][epoch[1] - 50:epoch[1]] for epoch in noTrialEpochs] # the last 50 bins were found to be the most informative
yesTrialBins = [fiftyWordDat['feat'][epoch[1] - 50:epoch[1]] for epoch in yesTrialEpochs] # (20, 50, 256) 

yesNoData = np.concatenate((noTrialBins, yesTrialBins))
yesNoLabels = np.concatenate((np.zeros(20),np.ones(20)))

In [None]:
X_with_nans = []
for trial in yesNoData:
    X_with_nans.append(trial)
    X_with_nans.append(np.full((1, 128), np.nan))

X_stacked = np.vstack(X_with_nans)[:-1]
X_masked = ma.masked_invalid(X_stacked)

# factor analysis
fa = FactorAnalysis(n_components=LATENT_DIM)
fa.fit(yesNoData.reshape(-1, 128))
C_init = fa.components_.T
R_init = np.diag(fa.noise_variance_)

# global 'meta' KF
kf_global = KalmanFilter(
    n_dim_state=LATENT_DIM,
    n_dim_obs=128,
    observation_matrices=C_init,
    observation_covariance=R_init,
    em_vars=['transition_matrices', 'transition_covariance', 
             'initial_state_mean', 'initial_state_covariance']
)
kf_global = kf_global.em(X_masked, n_iter=10) # 10 iters is enough

print("Global Model Trained. Extracting Features...")

Global Model Trained. Extracting Features...


In [39]:
# feature extraction step
def extract_features(data, kf_model):
    features = []
    for trial in data:
        (smoothed, _) = kf_model.smooth(trial)
        
        # split word into windows -- could be adapted in the future to work on live data
        n_windows = 5
        window_size = 50 // n_windows
        
        trial_feats = []
        for w in range(n_windows):
            # Mean of latent state in this window
            window_mean = np.mean(smoothed[w*window_size : (w+1)*window_size], axis=0)
            trial_feats.append(window_mean)
            
        features.append(np.concatenate(trial_feats))
        
    return np.array(features)

# extract features
X_latent = extract_features(yesNoData, kf_global)
print(f"Feature Matrix Shape: {X_latent.shape}") # (1000, 5*latent_dim_size)

Feature Matrix Shape: (40, 60)


In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # 5 folds -- gets slightly better acc at 10
lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')

accuracies = []

y = yesNoLabels.ravel().astype(int)

for fold, (train_idx, test_idx) in enumerate(skf.split(X_latent, y)):
    X_train, X_test = X_latent[train_idx], X_latent[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    lda.fit(X_train, y_train)
    
    pred = lda.predict(X_test)
    acc = accuracy_score(y_test, pred)
    accuracies.append(acc)
    
    print(f"Fold {fold+1} Accuracy: {acc*100:.1f}%")

print("\nChance Level: 2%")
print(f"\nAverage Accuracy: {np.mean(accuracies)*100:.1f}%")

Fold 1 Accuracy: 87.5%
Fold 2 Accuracy: 100.0%
Fold 3 Accuracy: 100.0%
Fold 4 Accuracy: 87.5%
Fold 5 Accuracy: 87.5%

Chance Level: 2%

Average Accuracy: 92.5%


In [113]:
baseDir2 = "/Users/apple/Documents/MATLAB/EE 675/Willett Data/sentences/"

compDat1 = scipy.io.loadmat(baseDir2+'t12.2022.05.19_sentences.mat')

In [114]:
import numpy as np

compDat1['feat'] = np.concatenate([compDat1['tx2'][:,:32].astype(np.float32), compDat1['tx2'][:,96:128].astype(np.float32), compDat1['spikePow'][:,:32].astype(np.float32), compDat1['spikePow'][:,96:128].astype(np.float32)], axis=1)
compDat1['feat'] = np.sqrt(compDat1['feat'])

In [115]:
blockList = compDat1['blockList'][np.where(compDat1['blockTypes'] == 'OL Chang')[0]]

In [116]:
def changSentences(data, blockList):
    sentences = []
    binRange = []
    for block in blockList:
        rangeTemp = np.where(data['blockNum'] == block)[0]
        trialMin = np.where(data['goTrialEpochs'] == rangeTemp[0])[0][0] + 1
        trialMax = np.where(data['goTrialEpochs'] == rangeTemp[-1] + 1)[0][0]
        sentences.append(data['sentences'][trialMin:trialMax + 1])
        binRange.append(data['goTrialEpochs'][trialMin:trialMax + 1])
    return np.concatenate(np.squeeze(sentences)), np.concatenate(binRange)

In [117]:
def normalizeDayN(data):
    sentenceList, binRange = changSentences(data, blockList)

    yesIdx = np.where(sentenceList == 'Yes')[0][0]
    noIdx = np.where(sentenceList == 'No')[0][0]

    noTrialBins = data['feat'][binRange[noIdx, 1] - 50:binRange[noIdx, 1]] # the last 50 bins were found to be the most informative
    yesTrialBins = data['feat'][binRange[yesIdx, 1] - 50:binRange[yesIdx, 1]]

    return noTrialBins, yesTrialBins

In [118]:
noTrialBins, yesTrialBins = normalizeDayN(compDat1)
otherDayYesNo = np.array([noTrialBins, yesTrialBins])
otherDayLabels = [0, 1]

In [119]:
features = []
for trial in otherDayYesNo:
    (smoothed, _) = kf_global.smooth(trial)
    
    # split word into windows -- could be adapted in the future to work on live data
    n_windows = 5
    window_size = 50 // n_windows
    
    trial_feats = []
    for w in range(n_windows):
        # Mean of latent state in this window
        window_mean = np.mean(smoothed[w*window_size : (w+1)*window_size], axis=0)
        trial_feats.append(window_mean)
        
    features.append(np.concatenate(trial_feats))

otherDayFeatures = np.array(features)

In [120]:
pred = lda.predict(otherDayFeatures)
acc = accuracy_score(otherDayLabels, pred)
accuracies.append(acc)

print("\nChance Level: 50%")
print(f"\nAverage Accuracy: {np.mean(accuracies)*100:.1f}%")


Chance Level: 50%

Average Accuracy: 60.6%


In [None]:
# fix accuracy above