In [42]:
import sys
# Put the parent directory on the module search path; presumably that is where
# the local `preprocessing` module imported further down lives — TODO confirm.
sys.path.append("..")

In [43]:
# Use the Python 3 kernel if possible; for reasons we have not figured out, the env_214 environment would not let us install outside packages.

# !pip install gensim
import gensim

In [44]:
import os
import pickle 
import numpy as np
from sklearn.model_selection import train_test_split
import gensim.downloader as api
from preprocessing import downsample_word_vectors, make_delayed

In [45]:
# Root directory of the shared dataset.
data_path = "/ocean/projects/mth240012p/shared/data"

# Pickled raw text, stored directly under the data root.
raw_text_path = os.path.join(data_path, "raw_text.pkl")

In [46]:
# ----- Data split -----

# Read the pickled raw text; the loaded mapping is keyed by story name.
# NOTE(review): pickle.load can execute arbitrary code — only point this at a trusted file.
with open(raw_text_path, "rb") as f:
    raw_text = pickle.load(f)

# Hold out 20% of the stories for testing; the fixed random_state keeps the
# split identical from run to run.
story_names = list(raw_text.keys())
train_stories, test_stories = train_test_split(
    story_names,
    test_size=0.2,
    random_state=42,
)

In [47]:
# ----- 1. Generate embedding -----

# load pre-trained word2vec (GoogleNews, 300-D)
word2vec_model = api.load("word2vec-google-news-300")

# Read the dimensionality from the model rather than hard-coding 300, so the
# zero rows used for unknown words always match the real vectors.
embedding_dim = word2vec_model.vector_size

# convert for all stories
word_vectors = {}  # dictionary: story -> (T, embedding_dim)
for story in story_names:
    words = raw_text[story].data
    # One row per word, in order; out-of-vocabulary words become all-zero rows.
    vectors = [
        word2vec_model[word] if word in word2vec_model else np.zeros(embedding_dim)
        for word in words
    ]
    # np.vstack raises ValueError on an empty list, so a story without any
    # words gets an explicit empty (0, embedding_dim) array instead.
    word_vectors[story] = np.vstack(vectors) if vectors else np.zeros((0, embedding_dim))
    print(f"{story}: Word2Vec shape = {word_vectors[story].shape}") # expected shape: (T, 300)

sweetaspie: Word2Vec shape = (697, 300)
thatthingonmyarm: Word2Vec shape = (2073, 300)
tildeath: Word2Vec shape = (2297, 300)
indianapolis: Word2Vec shape = (1554, 300)
lawsthatchokecreativity: Word2Vec shape = (2084, 300)
golfclubbing: Word2Vec shape = (1211, 300)
jugglingandjesus: Word2Vec shape = (887, 300)
shoppinginchina: Word2Vec shape = (1731, 300)
cocoonoflove: Word2Vec shape = (1984, 300)
hangtime: Word2Vec shape = (1927, 300)
beneaththemushroomcloud: Word2Vec shape = (1916, 300)
dialogue4: Word2Vec shape = (1692, 300)
thepostmanalwayscalls: Word2Vec shape = (2220, 300)
stumblinginthedark: Word2Vec shape = (2681, 300)
kiksuya: Word2Vec shape = (1699, 300)
haveyoumethimyet: Word2Vec shape = (2985, 300)
theinterview: Word2Vec shape = (1079, 300)
againstthewind: Word2Vec shape = (838, 300)
tetris: Word2Vec shape = (1350, 300)
canplanetearthfeedtenbillionpeoplepart2: Word2Vec shape = (2532, 300)
alternateithicatom: Word2Vec shape = (2174, 300)
goldiethegoldfish: Word2Vec shape = (

In [48]:
# ----- 2. Downsample and trim -----

# Alias for the raw text under the name used for the word sequences.
wordseqs = raw_text
# Every story in one list: training stories first, test stories after.
stories = [*train_stories, *test_stories]

def align_embeddings_with_fmri(stories, word_vectors, wordseqs, subject_path):
    """
    Pair each story's downsampled Word2Vec features with its FMRI recording.

    For every story, `<subject_path>/<story>.npy` is loaded and both arrays are
    cut to the length of the shorter one along axis 0, keeping rows from the
    start. A story whose FMRI file is missing, or that raises any other error,
    is reported with a printed message and processing moves on to the next one.

    Parameters:
        stories: list of story names to process.
        word_vectors: per-story embeddings, handed to downsample_word_vectors.
        wordseqs: per-story word sequences, handed to downsample_word_vectors.
        subject_path: directory containing one `<story>.npy` file per story.

    Returns:
        (X_trimmed, Y_trimmed): two dicts keyed by story name holding the
        truncated features and the truncated FMRI data respectively.
    """
    downsampled_by_story = downsample_word_vectors(stories, word_vectors, wordseqs)

    # NOTE(review): no leading/trailing samples are dropped here; the arrays are
    # only truncated at the end to a common length. If the recordings were
    # trimmed at the start beforehand, rows may not line up — confirm.
    X_trimmed, Y_trimmed = {}, {}

    for story in stories:
        recording_file = os.path.join(subject_path, f"{story}.npy")
        try:
            bold = np.load(recording_file)
            features = downsampled_by_story[story]

            # number of rows available in both arrays
            min_len = min(features.shape[0], bold.shape[0])

            X_trimmed[story] = features[:min_len, :]
            Y_trimmed[story] = bold[:min_len, :]

            print(f"{story}: aligned length = {min_len}")
        except FileNotFoundError:
            # no recording for this story
            print(f"FMRI file for '{story}' not found, skipping.")
        except Exception as e:
            # best-effort: report anything else and keep going
            print(f"Error with {story}: {e}")

    return X_trimmed, Y_trimmed

In [49]:
# Recordings are read from the "subject2" folder under the data root.
subject_path = os.path.join(data_path, "subject2")
# Every story in one list: training stories first, test stories after.
stories = [*train_stories, *test_stories]

X_trimmed, Y_trimmed = align_embeddings_with_fmri(
    stories=stories,
    word_vectors=word_vectors,
    wordseqs=raw_text,
    subject_path=subject_path,
)

theadvancedbeginner: aligned length = 318
superheroesjustforeachother: aligned length = 325
seedpotatoesofleningrad: aligned length = 281
mayorofthefreaks: aligned length = 475
adollshouse: aligned length = 241
cautioneating: aligned length = 279
haveyoumethimyet: aligned length = 496
ifthishaircouldtalk: aligned length = 249
myfathershands: aligned length = 170
thecurse: aligned length = 407
hangtime: aligned length = 324
breakingupintheageofgoogle: aligned length = 521
theclosetthatateeverything: aligned length = 314
leavingbaghdad: aligned length = 327
FMRI file for 'myfirstdaywiththeyankees' not found, skipping.
itsabox: aligned length = 355
golfclubbing: aligned length = 201
odetostepfather: aligned length = 404
gpsformylostidentity: aligned length = 326
exorcism: aligned length = 467
waitingtogo: aligned length = 349
theinterview: aligned length = 221
FMRI file for 'dialogue6' not found, skipping.
forgettingfear: aligned length = 237
FMRI file for 'dialogue5' not found, skipping.

In [50]:
# ----- 3. Create lagged versions of the features -----

# apply make_delayed to all stories in X_trimmed, using delays 1 through 4
X_lagged = {}

for story in X_trimmed:
    try:
        X_lagged[story] = make_delayed(X_trimmed[story], delays=[1, 2, 3, 4])
        print(f"{story}: lagged shape = {X_lagged[story].shape}") # expected shape: (T, 1200)
    except Exception as e:
        # best-effort: report the failure and continue with the next story;
        # a story that fails in make_delayed will be missing from X_lagged
        print(f"Error for {story}: {e}")

theadvancedbeginner: lagged shape = (318, 1200)
superheroesjustforeachother: lagged shape = (325, 1200)
seedpotatoesofleningrad: lagged shape = (281, 1200)
mayorofthefreaks: lagged shape = (475, 1200)
adollshouse: lagged shape = (241, 1200)
cautioneating: lagged shape = (279, 1200)
haveyoumethimyet: lagged shape = (496, 1200)
ifthishaircouldtalk: lagged shape = (249, 1200)
myfathershands: lagged shape = (170, 1200)
thecurse: lagged shape = (407, 1200)
hangtime: lagged shape = (324, 1200)
breakingupintheageofgoogle: lagged shape = (521, 1200)
theclosetthatateeverything: lagged shape = (314, 1200)
leavingbaghdad: lagged shape = (327, 1200)
itsabox: lagged shape = (355, 1200)
golfclubbing: lagged shape = (201, 1200)
odetostepfather: lagged shape = (404, 1200)
gpsformylostidentity: lagged shape = (326, 1200)
exorcism: lagged shape = (467, 1200)
waitingtogo: lagged shape = (349, 1200)
theinterview: lagged shape = (221, 1200)
forgettingfear: lagged shape = (237, 1200)
shoppinginchina: lagged