In [1]:
import sys
sys.path.append("..")  # Adds /lab3-1/ to sys.path for ridge_utils

import gensim
import os
import pickle 
import numpy as np
from sklearn.model_selection import train_test_split
import gensim.downloader as api
from preprocessing import downsample_word_vectors, make_delayed

# Locations of the shared dataset and of the pickled raw text inside it
data_path = "/ocean/projects/mth240012p/shared/data"
raw_text_path = os.path.join(data_path, "raw_text.pkl")

# ----- Data split -----

# Deserialize the raw text; it is used below as a mapping keyed by story name.
# NOTE(review): pickle.load can execute arbitrary code, so only point this at trusted files.
with open(raw_text_path, mode="rb") as raw_text_file:
    raw_text = pickle.load(raw_text_file)

# Hold out 20% of the stories for testing; the fixed seed keeps the split reproducible
story_names = list(raw_text.keys())
train_stories, test_stories = train_test_split(
    story_names,
    random_state=42,
    test_size=0.2,
)



In [2]:
# ----- 1. Generate GloVe embeddings -----

# Load pre-trained GloVe (glove-wiki-gigaword-300, 300D)
glove_model = api.load("glove-wiki-gigaword-300")

# Read the embedding width from the model instead of hard-coding 300,
# so swapping in a different pre-trained model needs no other change.
embedding_dim = glove_model.vector_size

# Convert for all stories
word_vectors = {}  # dictionary: story -> (T, embedding_dim)
for story in story_names:
    words = list(raw_text[story].data)

    # Pre-allocate a float64 zero matrix: out-of-vocabulary words simply keep
    # their zero row, a story with no words yields an empty (0, embedding_dim)
    # array instead of making np.vstack fail, and every story ends up with the
    # same dtype regardless of whether it contains unknown words.
    story_matrix = np.zeros((len(words), embedding_dim))
    for row, word in enumerate(words):
        if word in glove_model:
            story_matrix[row] = glove_model[word]
    # NOTE(review): lookups are case-sensitive; if the transcripts contain
    # capitalised tokens they may be treated as out-of-vocabulary — confirm.

    word_vectors[story] = story_matrix
    print(f"{story}: GloVe shape = {word_vectors[story].shape}")  # Expected shape: (T, 300)



sweetaspie: GloVe shape = (697, 300)
thatthingonmyarm: GloVe shape = (2073, 300)
tildeath: GloVe shape = (2297, 300)
indianapolis: GloVe shape = (1554, 300)
lawsthatchokecreativity: GloVe shape = (2084, 300)
golfclubbing: GloVe shape = (1211, 300)
jugglingandjesus: GloVe shape = (887, 300)
shoppinginchina: GloVe shape = (1731, 300)
cocoonoflove: GloVe shape = (1984, 300)
hangtime: GloVe shape = (1927, 300)
beneaththemushroomcloud: GloVe shape = (1916, 300)
dialogue4: GloVe shape = (1692, 300)
thepostmanalwayscalls: GloVe shape = (2220, 300)
stumblinginthedark: GloVe shape = (2681, 300)
kiksuya: GloVe shape = (1699, 300)
haveyoumethimyet: GloVe shape = (2985, 300)
theinterview: GloVe shape = (1079, 300)
againstthewind: GloVe shape = (838, 300)
tetris: GloVe shape = (1350, 300)
canplanetearthfeedtenbillionpeoplepart2: GloVe shape = (2532, 300)
alternateithicatom: GloVe shape = (2174, 300)
goldiethegoldfish: GloVe shape = (1680, 300)
seedpotatoesofleningrad: GloVe shape = (1376, 300)
onap

In [3]:
# ----- 2. Downsample and trim -----

# Alias the raw text under the name used for word sequences, and build the
# full story list with the training stories first, followed by the test stories.
wordseqs = raw_text
stories = [*train_stories, *test_stories]

def align_embeddings_with_fmri(stories, word_vectors, wordseqs, subject_path,
                               trim_start=0, trim_end=0):
    """
    Aligns downsampled GloVe embeddings with FMRI recordings.

    For every story, the embeddings are downsampled with
    ``downsample_word_vectors``, the recording is loaded from
    ``<subject_path>/<story>.npy``, and both arrays are cut to the length of
    the shorter one (rows are kept from the start).

    Parameters
    ----------
    stories : iterable of str
        Story names to process.
    word_vectors : dict
        story -> per-word embedding matrix, passed to ``downsample_word_vectors``.
    wordseqs : dict
        story -> word sequence object, passed to ``downsample_word_vectors``.
    subject_path : str
        Directory containing one ``<story>.npy`` recording per story.
    trim_start, trim_end : int, optional
        Number of rows to drop from the beginning / end of the downsampled
        embeddings before the length matching. Both default to 0, i.e. no
        trimming is performed.
        NOTE(review): earlier comments in this notebook mentioned trimming the
        first 5 and last 10 seconds, but that code was disabled; confirm
        whether the call site should pass trim_start=5, trim_end=10.

    Returns
    -------
    (X_trimmed, Y_trimmed) : tuple of dict
        story -> embedding rows and story -> recording rows, with matching
        first dimension. Stories that could not be processed are absent from
        both dictionaries.

    Raises
    ------
    ValueError
        If ``trim_start`` or ``trim_end`` is negative.
    """
    if trim_start < 0 or trim_end < 0:
        raise ValueError("trim_start and trim_end must be non-negative")

    # Downsample the embeddings
    downsampled = downsample_word_vectors(stories, word_vectors, wordseqs)

    X_trimmed = {}
    Y_trimmed = {}

    for story in stories:
        bold_path = os.path.join(subject_path, f"{story}.npy")
        try:
            Y = np.load(bold_path)
            X = downsampled[story]

            # Optional trimming of the embedding rows (disabled by default)
            if trim_start or trim_end:
                stop = max(trim_start, X.shape[0] - trim_end)
                X = X[trim_start:stop, :]

            # Match shortest timepoints across both (in case of mismatch)
            min_len = min(X.shape[0], Y.shape[0])

            # Slice both arrays before storing either one, so a failure on one
            # of them cannot leave the two dictionaries with different keys.
            X_cut = X[:min_len, :]
            Y_cut = Y[:min_len, :]
            X_trimmed[story] = X_cut
            Y_trimmed[story] = Y_cut

            print(f"{story}: aligned length = {min_len}")
        except FileNotFoundError:
            print(f"FMRI file for '{story}' not found, skipping.")
        except Exception as e:
            # Best effort: report the problem and continue with the next story
            print(f"Error with {story}: {e}")

    return X_trimmed, Y_trimmed

# Directory from which the per-story recordings are loaded
subject_path = os.path.join(data_path, "subject2")

# Full story list: training stories first, then test stories
stories = [*train_stories, *test_stories]

aligned = align_embeddings_with_fmri(stories, word_vectors, raw_text, subject_path)
X_trimmed, Y_trimmed = aligned



theadvancedbeginner: aligned length = 318
superheroesjustforeachother: aligned length = 325
seedpotatoesofleningrad: aligned length = 281
mayorofthefreaks: aligned length = 475
adollshouse: aligned length = 241
cautioneating: aligned length = 279
haveyoumethimyet: aligned length = 496
ifthishaircouldtalk: aligned length = 249
myfathershands: aligned length = 170
thecurse: aligned length = 407
hangtime: aligned length = 324
breakingupintheageofgoogle: aligned length = 521
theclosetthatateeverything: aligned length = 314
leavingbaghdad: aligned length = 327
FMRI file for 'myfirstdaywiththeyankees' not found, skipping.
itsabox: aligned length = 355
golfclubbing: aligned length = 201
odetostepfather: aligned length = 404
gpsformylostidentity: aligned length = 326
exorcism: aligned length = 467
waitingtogo: aligned length = 349
theinterview: aligned length = 221
FMRI file for 'dialogue6' not found, skipping.
forgettingfear: aligned length = 237
FMRI file for 'dialogue5' not found, skipping.

In [4]:
# ----- 3. Create lagged versions of the features -----

# Run make_delayed on every aligned story; stories that fail are reported and skipped
X_glove_lagged = {}

for story, story_features in X_trimmed.items():
    try:
        lagged_features = make_delayed(story_features, delays=[1, 2, 3, 4])
        X_glove_lagged[story] = lagged_features
        # Expected shape: (T, 1200)
        print(f"{story}: lagged shape = {X_glove_lagged[story].shape}")
    except Exception as e:
        print(f"Error for {story}: {e}")

theadvancedbeginner: lagged shape = (318, 1200)
superheroesjustforeachother: lagged shape = (325, 1200)
seedpotatoesofleningrad: lagged shape = (281, 1200)
mayorofthefreaks: lagged shape = (475, 1200)
adollshouse: lagged shape = (241, 1200)
cautioneating: lagged shape = (279, 1200)
haveyoumethimyet: lagged shape = (496, 1200)
ifthishaircouldtalk: lagged shape = (249, 1200)
myfathershands: lagged shape = (170, 1200)
thecurse: lagged shape = (407, 1200)
hangtime: lagged shape = (324, 1200)
breakingupintheageofgoogle: lagged shape = (521, 1200)
theclosetthatateeverything: lagged shape = (314, 1200)
leavingbaghdad: lagged shape = (327, 1200)
itsabox: lagged shape = (355, 1200)
golfclubbing: lagged shape = (201, 1200)
odetostepfather: lagged shape = (404, 1200)
gpsformylostidentity: lagged shape = (326, 1200)
exorcism: lagged shape = (467, 1200)
waitingtogo: lagged shape = (349, 1200)
theinterview: lagged shape = (221, 1200)
forgettingfear: lagged shape = (237, 1200)
shoppinginchina: lagged