In [1]:
import csv
import os
import pickle

import numpy as np
import scipy
import scipy.io
from sklearn.decomposition import PCA
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
# Load the tokenizer and the pretrained weights of the base 'gpt2' checkpoint.
# Both objects are read as module-level globals by the cells below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [3]:
# Patient IDs were removed to protect patient anonymity.
# Fill this in before running the notebook: the processing loop iterates over
# it and concatenates each item into file paths, so the items (keys) must be
# patient-ID strings.
# NOTE(review): the name suggests a {patient_id: electrode_count} mapping, but
# only the keys are used in this notebook -- confirm the intended layout.
id_eleccount = {}

In [4]:
def segment_transcript_by_movie(patient_id, ecog1_t, ecog2_t,
                                transcript_dir='/scratch/gpfs/mayaar/GravityECoG/sourcedata/interview-transcripts/cleaned',
                                margin=2):
    """Split a patient's interview transcript by ECoG recording (movie 1 / movie 2).

    The transcript is read from
    ``<transcript_dir>/cleaned_transcript_ny<patient_id>.csv``. Column 1 of each
    row is taken as the word and column 2 as its onset time. A word is assigned
    to a recording when its onset lies strictly inside that recording's time
    axis, shrunk by ``margin`` at both ends.

    Parameters
    ----------
    patient_id : str
        Patient identifier used to build the transcript file name.
    ecog1_t, ecog2_t : sequence of float
        Time axes of the first and second recording; only the first and last
        samples are used.
    transcript_dir : str, optional
        Directory holding the cleaned transcripts.
    margin : float, optional
        Amount trimmed from the start and end of each recording (same units
        as the time axes) before matching onsets.

    Returns
    -------
    words_1, words_2 : list of str
        Words falling inside recording 1 and recording 2.
    onsets_1, onsets_2 : numpy.ndarray of float
        Onset times matching ``words_1`` and ``words_2``.

    Raises
    ------
    ValueError
        If an onset cell cannot be parsed as a float (e.g. a header row).
    """
    transcript_path = os.path.join(transcript_dir,
                                   'cleaned_transcript_ny' + patient_id + '.csv')
    with open(transcript_path, newline='') as f:
        # csv.reader yields [] for blank lines; drop them so indexing is safe.
        rows = [row for row in csv.reader(f) if row]

    # (start, stop) of the usable window for recording 1 and recording 2.
    windows = [
        (ecog1_t[0] + margin, ecog1_t[-1] - margin),
        (ecog2_t[0] + margin, ecog2_t[-1] - margin),
    ]
    words = ([], [])
    onsets = ([], [])

    for row in rows:
        word = row[1]
        onset = float(row[2])  # parse once, reuse for both windows
        # Not mutually exclusive: a word is checked against both windows.
        for k, (start, stop) in enumerate(windows):
            if start < onset < stop:
                words[k].append(word)
                onsets[k].append(onset)

    onsets_1 = np.array(onsets[0], dtype=float)
    onsets_2 = np.array(onsets[1], dtype=float)

    return words[0], words[1], onsets_1, onsets_2

In [5]:
def split_interval(onset, offset, n):
    """Cut the interval [onset, offset] into n equal, contiguous sub-intervals.

    Returns a pair of arrays ``(starts, ends)``, each of length ``n``, where
    sub-interval ``k`` runs from ``starts[k]`` to ``ends[k]``.
    """
    # n sub-intervals are delimited by n + 1 evenly spaced boundaries.
    edges = np.linspace(onset, offset, num=n + 1)
    starts = edges[:-1]
    ends = edges[1:]
    return starts, ends

In [6]:
def get_embeddings(words, onsets, n_components=50, last_word_duration=0.2,
                   random_state=0):
    """Return PCA-reduced GPT-2 token embeddings and one onset time per token.

    Each word is tokenized with the global ``tokenizer`` and every resulting
    token is looked up in the input-embedding table of the global ``model``
    (``model.transformer.wte``). No forward pass is run, so the vectors do not
    depend on surrounding words. A word that splits into several tokens has
    the interval up to the next word's onset divided evenly among its tokens.

    Parameters
    ----------
    words : sequence of str
        Transcript words, in order.
    onsets : sequence of float
        Onset time of each word; same length as ``words``.
    n_components : int, optional
        Number of principal components kept.
    last_word_duration : float, optional
        Assumed duration of the final word, which has no following onset to
        bound it.
    random_state : int or None, optional
        Seed passed to PCA so that the result is reproducible when sklearn
        selects its randomized SVD solver.

    Returns
    -------
    reduced : numpy.ndarray, shape (n_tokens, n_components)
        PCA projection of the token embeddings. The PCA is fit on the tokens
        of this call only.
    t_onsets : list of float
        Onset time of each token, aligned with the rows of ``reduced``.

    Raises
    ------
    ValueError
        If fewer than ``n_components`` tokens were produced.
    """
    # Convert the embedding table to numpy once instead of once per token.
    wte = model.transformer.wte.weight.detach().cpu().numpy()

    token_vectors = []
    t_onsets = []

    for windex, word in enumerate(words):
        token_ids = tokenizer.encode(word, add_prefix_space=True)
        if not token_ids:
            # Nothing to embed (e.g. an empty transcript cell).
            continue

        # The word lasts until the next word starts; the last word gets a
        # fixed nominal duration.
        if (windex + 1) < len(onsets):
            word_end = onsets[windex + 1]
        else:
            word_end = onsets[windex] + last_word_duration

        # With a single token this yields just the word onset itself.
        token_starts, _ = split_interval(onsets[windex], word_end, len(token_ids))

        token_vectors.append(wte[token_ids])
        t_onsets.extend(token_starts)

    n_tokens = len(t_onsets)
    if n_tokens < n_components:
        raise ValueError(
            "PCA with n_components=%d needs at least that many tokens, got %d"
            % (n_components, n_tokens)
        )

    # float64 matches the dtype of the array the PCA was previously fit on.
    emb_np = np.concatenate(token_vectors, axis=0).astype(np.float64)

    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(emb_np)
    return pca.transform(emb_np), t_onsets

In [7]:
# Root of the preprocessed ECoG derivatives; one 'sub-ny<ID>' folder per patient.
PREPROC_DIR = '/scratch/gpfs/mayaar/GravityECoG/derivatives/preprocessing'

for patient_id in id_eleccount:
    # Load the time axis of each of the two ECoG recordings
    subject_dir = PREPROC_DIR + '/sub-ny' + patient_id
    ecog1_t, ecog2_t = (
        np.array(
            scipy.io.loadmat(
                subject_dir + '/eeg' + str(run) + '_manualica_notch_time.mat'
            )['trial'][0],
            dtype=float,
        )
        for run in (1, 2)
    )

    # Separate words into corresponding ECoG recordings (1 and 2)
    words_1, words_2, onsets_1, onsets_2 = segment_transcript_by_movie(patient_id, ecog1_t, ecog2_t)

    # Generate PCA-reduced GPT-2 token embeddings for each recording.
    # Both recordings are processed before anything is written, so a failure
    # on either one leaves no partial output for this patient.
    results = [
        (run, get_embeddings(words, onsets))
        for run, words, onsets in ((1, words_1, onsets_1), (2, words_2, onsets_2))
    ]

    # Save embeddings and token onsets to the current working directory
    for run, (embeddings, t_onsets) in results:
        np.save("gpt2_emb_" + str(run) + "_" + patient_id + ".npy", embeddings)
        np.save("gpt2_onsets_" + str(run) + "_" + patient_id + ".npy", t_onsets)