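This notebook analyzes data from https://github.com/HuthLab/deep-fMRI-dataset: it builds text features from the eng1000 word embeddings and from saved fMRI model weights, then evaluates them on Rotten Tomatoes sentiment classification. To set up, see the instructions in the `deep-fMRI-dataset` folder.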

In [115]:
%load_ext autoreload
%autoreload 2
import datasets
import numpy as np
from os.path import join
from encoding.ridge_utils.SemanticModel import SemanticModel
from matplotlib import pyplot as plt
from typing import List
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer
from encoding.feature_spaces import em_data_dir, data_dir, results_dir
from collections import defaultdict
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load some data

In [71]:
# Training split of the Rotten Tomatoes dataset ('text' and 'label' columns).
dset = datasets.load_dataset('rotten_tomatoes')['train']
# Optional subsampling for quicker iteration:
# dset = dset.select(np.random.choice(len(dset), size=300, replace=False))
X, y = dset['text'], dset['label']

# Validation split, used further down as the held-out evaluation set.
dset_test = datasets.load_dataset('rotten_tomatoes')['validation']
# Optional subsampling for quicker iteration:
# dset_test = dset_test.select(np.random.choice(len(dset_test), size=300, replace=False))
X_test, y_test = dset_test['text'], dset_test['label']

print('shapes', len(X), len(X_test))

Using custom data configuration default
Reusing dataset rotten_tomatoes (/home/chansingh/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

Using custom data configuration default
Reusing dataset rotten_tomatoes (/home/chansingh/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

shapes 8530 1066


### Extract different features

In [109]:
def get_vecs(X: List[str], save_location) -> np.ndarray:
    """Project sentences through the `english1000sm.hf5` semantic model.

    Every sentence is split on single spaces and each resulting token is
    UTF-8 encoded before the token lists are passed to
    `SemanticModel.project_stims`.

    Params
    ------
    X: List[str]
        Sentences to featurize.
    save_location
        Unused by this function.

    Returns
    -------
    The output of `project_stims` for the tokenized sentences
    (annotated as np.ndarray; presumably one row per sentence -- TODO confirm).
    """
    model_path = join(em_data_dir, 'english1000sm.hf5')
    semantic_model = SemanticModel.load(model_path)

    # Tokenize: list of sentences -> list of lists of byte-string tokens.
    tokenized = []
    for sentence in X:
        tokenized.append([token.encode('utf-8') for token in sentence.split(' ')])

    return semantic_model.project_stims(tokenized)

def get_embs_fmri(X: List[str], save_location, perc_threshold=98, n_delays=4) -> np.ndarray:
    """Map sentences to embeddings using saved fMRI model weights.

    Features from `get_vecs` are multiplied by the delay-averaged weights stored
    in `<save_location>/weights.npz`; only the output columns whose value in
    `<save_location>/corrs.npz` exceeds the `perc_threshold`-th percentile are kept.

    Params
    ------
    X: List[str]
        Sentences to embed.
    save_location
        Directory containing `weights.npz` and `corrs.npz` (each read via key 'arr_0').
    perc_threshold
        Percentile (0-100) of the stored correlations used as the cutoff;
        columns strictly above the cutoff are retained.
    n_delays
        Number of delays the stored weights are assumed to contain (default 4,
        the value previously hard-coded here).

    Returns
    -------
    Array with one row per row of `get_vecs(X, ...)` and one column per
    retained output dimension.
    """
    feats = get_vecs(X, save_location)

    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # context managers make sure both files are closed once the arrays are read.
    with np.load(join(save_location, 'weights.npz')) as weights_npz:
        weights = weights_npz['arr_0']
    with np.load(join(save_location, 'corrs.npz')) as corrs_npz:
        corrs_val = corrs_npz['arr_0']

    # NOTE(review): the original author was "pretty sure" this axis order is right
    # but flagged that it might be switched -- delays for coefs are not stored next
    # to each other (see cell 25 of the SpeechModelTutorial pre-run notebook).
    # The alternative layout would be:
    #   weights.reshape(-1, n_delays, feats.shape[-1])
    # TODO confirm against the shape the weights were saved with.
    weights = weights.reshape(n_delays, -1, feats.shape[-1])
    weights = weights.mean(axis=0).squeeze()  # mean over the delays dimension
    embs = feats @ weights.T

    # Keep only the columns whose stored correlation is above the percentile cutoff.
    perc = np.percentile(corrs_val, perc_threshold)
    idxs = corrs_val > perc
    return embs[:, idxs]

def get_bow_vecs(X: List[str], X_test: List[str]):
    """Build bag-of-words count features for the train and test sentences.

    The vocabulary is fit on `X` only and then applied to both `X` and `X_test`.

    Params
    ------
    X: List[str]
        Training sentences (used to fit the vocabulary).
    X_test: List[str]
        Test sentences (transformed with the training vocabulary).

    Returns
    -------
    (train_counts, test_counts): dense arrays of token counts.
    """
    vectorizer = CountVectorizer().fit(X)
    # Use .toarray() rather than .todense(): the latter returns np.matrix,
    # which scikit-learn estimators deprecate/reject as input.
    return vectorizer.transform(X).toarray(), vectorizer.transform(X_test).toarray()

### Fit models

In [None]:
# Configuration for this run.
seed = 1
perc_threshold_fmri = 98  # percentile cutoff passed to get_embs_fmri
save_location = join(results_dir, 'eng1000', 'UTS03')  # presumably one subject's results dir -- verify

# Results are accumulated column-wise: one list per field, one entry per feature set.
r = defaultdict(list)
for k in ['eng1000fmri']: # ['eng1000vecs', 'eng1000fmri', 'bow']:
    if k == 'eng1000vecs':
        feats_train = get_vecs(X, save_location)
        feats_test = get_vecs(X_test, save_location)
    elif k == 'eng1000fmri':
        feats_train = get_embs_fmri(X, save_location, perc_threshold=perc_threshold_fmri)
        feats_test = get_embs_fmri(X_test, save_location, perc_threshold=perc_threshold_fmri)
    elif k == 'bow':
        feats_train, feats_test = get_bow_vecs(X, X_test)
    else:
        # Without this guard an unknown name would silently reuse the features
        # from the previous iteration (or fail with a NameError on the first one).
        raise ValueError(f'unknown feature set: {k!r}')

    # Fit a cross-validated logistic regression and score it on the held-out split.
    m = LogisticRegressionCV(random_state=seed)
    m.fit(feats_train, y)
    r['feats'].append(k)
    r['acc'].append(m.score(feats_test, y_test))
    r['feats_dim'].append(feats_train.shape[1])

In [122]:
# Tabulate the collected results, one row per feature set, indexed by its name.
pd.DataFrame.from_dict(r, orient='columns').set_index(keys='feats')

Unnamed: 0_level_0,acc,feats_dim
feats,Unnamed: 1_level_1,Unnamed: 2_level_1
eng1000fmri,0.733583,4778
