In [3]:
%load_ext autoreload
%autoreload 2
import os
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join
from tqdm import tqdm
import pandas as pd
import sys
from typing import List
import numpy as np
import joblib
from pprint import pprint
import imodelsx.util
import sasc.viz
import pickle as pkl
import json
from copy import deepcopy
from numpy.linalg import norm
from sasc.config import CACHE_DIR, RESULTS_DIR, cache_ngrams_dir, regions_idxs_dir
import sasc.modules.fmri_module
# Cached inputs for subject UTS02 / S02:
#   ngrams_list -- the n-grams used as the row index of the prediction frames below
#   rois_dict   -- indexed by ROI name; presumably each value is that ROI's
#                  voxel indices (it is exported as 'voxel_nums') -- TODO confirm
ngrams_list = joblib.load(
    join(cache_ngrams_dir, 'fmri_UTS02_ngrams.pkl')
)
rois_dict = joblib.load(
    join(regions_idxs_dir, 'rois_S02.jbl')
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Get predictions from embeddings

In [4]:
# NOTE: this cell is intentionally disabled and kept for provenance only.
# When uncommented it feeds cached n-gram embeddings through the fMRI module,
# averages the returned per-voxel predictions over each ROI's voxel indices,
# and dumps the resulting {roi: mean prediction per n-gram} dict to
# `cache_ngrams_dir`. The alternative (double-commented) lines are presumably
# the OPT-30B variant of the same run -- TODO confirm.
# # embs = joblib.load(join(cache_ngrams_dir, 'fmri_embs.pkl'))
# embs = joblib.load(join(cache_ngrams_dir, 'fmri_embs_llama.pkl'))
# mod = sasc.modules.fmri_module.fMRIModule(
#     subject="UTS02",
#     # checkpoint="facebook/opt-30b",
#     checkpoint="huggyllama/llama-30b",
#     init_model=False,
#     restrict_weights=False,
# )
# voxel_preds = mod(embs=embs, return_all=True)
# outputs_dict = {
#     k: voxel_preds[:, np.array(rois_dict[k])].mean(axis=1)
#     for k in rois_dict
# }
# joblib.dump(outputs_dict, join(
#     # cache_ngrams_dir, 'rois_ngram_outputs_dict.pkl'))
#     cache_ngrams_dir, 'rois_ngram_outputs_dict_llama.pkl'))

In [5]:
# Load the two cached {roi: per-ngram prediction} dicts and turn each into a
# frame with one row per n-gram and one column per ROI.
df_opt = pd.DataFrame(
    joblib.load(join(cache_ngrams_dir, 'rois_ngram_outputs_dict.pkl')),
    index=ngrams_list,
)
outputs_dict = joblib.load(
    join(cache_ngrams_dir, 'rois_ngram_outputs_dict_llama.pkl'))
df_llama = pd.DataFrame(outputs_dict, index=ngrams_list)

# Combined score: element-wise sum of the two frames.
df = df_opt + df_llama

# For each of these three ROIs add a "<ROI>_only" contrast column:
# the ROI's own prediction minus the mean prediction of the other two.
ROIS_LOC = ['RSC', 'OPA', 'PPA']
for frame in (df_opt, df_llama, df):
    for k in ROIS_LOC:
        others = [c for c in ROIS_LOC if c != k]
        frame[k + '_only'] = frame[k] - frame[others].mean(axis=1)

In [6]:
# Per-column agreement between the two prediction frames: the off-diagonal
# entry of the 2x2 correlation matrix of df_opt[col] vs. df_llama[col].
stability_scores = {}
for col in df.columns:
    corr_matrix = np.corrcoef(df_opt[col], df_llama[col])
    stability_scores[col] = corr_matrix[0, 1]

In [7]:
ascending = False  # should be false to get driving ngrams

# For every column of the combined frame, keep the index labels (n-grams) of
# the first 100 rows after sorting by that column.
top_ngrams_dict = {
    k: df.sort_values(k, ascending=ascending).index[:100].tolist()
    for k in df.columns
}
top_ngrams_df = pd.DataFrame(top_ngrams_dict)
top_ngrams_df.to_csv('top_ngrams_by_roi.csv')

# Show the selected columns without pandas' row truncation.
rois = [
    'RSC', 'OPA', 'PPA', 'IPS', 'pSTS', 'sPMv', 'EBA', 'OFA',
    'RSC_only', 'OPA_only', 'PPA_only',
]
with pd.option_context('display.max_rows', None):
    display(top_ngrams_df[rois])

Unnamed: 0,RSC,OPA,PPA,IPS,pSTS,sPMv,EBA,OFA,RSC_only,OPA_only,PPA_only
0,drove from vermont,onto the railing,on the railing,there were slats,said excuse me,one mississippi two,wraps his arms,of my childhood,came to florida,towards the ceiling,kind of corny
1,moved to vermont,against the railing,on a dock,onto the railing,says excuse me,said excuse me,lifted her dress,newfound self esteem,back in israel,onto the railing,of bready puns
2,drove to washington,on the railing,on the windowsill,on the railing,i stopped midstride,mississippi two mississippi,arms flailing,so my shrink,moved to london,on the ceiling,like burnt steak
3,here in manhattan,towards the river,mile of cornfields,against the railing,room went silent,said guess what,arms around her,hurtful first dates,traveled to marrakesh,against the railing,pulled a muscle
4,here in boston,onto the sidewalk,the windowsill,the back hatch,scissors someone shouted,am turning forty,arms tighten around,recall many instances,sitting in indianapolis,feet hanging over,his painting sucked
5,was in boston,towards the doors,onto the railing,four connected squares,i provoked gasps,april nineteen forty,flying arms flailing,it felt magical,went to boston,on the railing,like your shirt
6,off into vancouver,towards the door,outside the windows,in long rows,somebody then yelled,says excuse me,hands gripped the,answered many questions,was in boston,towards the doors,'s painting sucked
7,moved to chicago,outside the windows,across the parking,path that jutted,she started laughing,say one mississippi,grabbed her legs,my school days,moved to vermont,seats behind,important like pudding
8,back in manhattan,towards the ceiling,contain strip malls,the double doors,hook excuse me,two mississippi three,the chopsticks flipped,no satisfying fantasies,was in mexico,towards the door,a snake oil
9,went to boston,long hallway toward,against the railing,on the sides,i whirled around,october nineteen forty,his hands folded,my mom often,moved to chicago,lights peeking over,had some scarring


In [8]:
gpt4 = imodelsx.llm.get_llm('gpt-4-turbo-0125-spot')

# Ask the LLM for a short theme describing the first 60 top n-grams of every
# column in `top_ngrams_df`; results are collected as {column: response}.
explanations = {}
for k in top_ngrams_df.columns:

    # Bullet list of the n-grams, one "- phrase" per line.
    s = '- ' + '\n- '.join(top_ngrams_df[k].iloc[:60])
    # NOTE: the prompt text (including its leading indentation) is left exactly
    # as it was -- the LLM wrapper prints "cached!", so responses are presumably
    # cached per prompt and any edit here would trigger fresh API calls.
    prompt = f'''Here is a list of phrases:
    {s}

    What is a common theme among these phrases? Return only a concise phrase.'''

    explanations[k] = gpt4(prompt)

# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open('explanations_by_roi.json', 'w') as f:
    json.dump(explanations, f, indent=4)

cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!
cached!


### Export selected ROIs to pkl

In [9]:
# ROIs / contrasts to export. A trailing digit (e.g. 'PPA_only2') distinguishes
# alternative hand-written explanations for the same contrast; the 'PPA_only1'
# variant is currently disabled.
rois = [
    'RSC', 'OPA', 'PPA', 'IPS', 'pSTS', 'sPMv', 'EBA', 'OFA',
    'RSC_only', 'OPA_only', 'PPA_only2',  # 'PPA_only1',
]
# pprint({k: explanations[k] for k in rois})

# Hand-written explanation for every exported ROI.
explanations_clean = dict(
    EBA='Body parts',
    IPS='Descriptive elements of scenes or objects',
    OFA='Personal growth and reflection',
    OPA='Direction and location descriptions',
    OPA_only='Spatial positioning and directions',
    PPA='Scenes and settings',
    # PPA_only1='Lying and falsehoods',
    PPA_only2='Unappetizing foods',
    RSC='Travel and location names',
    RSC_only='Location names',
    pSTS='Verbal interactions',
    sPMv='Time and numbers',
)

# Text stored per ROI under 'prompt_suffix' in the export. Only two distinct
# non-empty suffixes are in use; RSC / RSC_only get no suffix at all.
AVOID_ANY_LOCATION = ' Avoid mentioning any locations.'
AVOID_LOCATION_NAMES = ' Avoid mentioning any specific location names (like "New York" or "Europe").'
explanation_avoid_suffixes = {
    'EBA': AVOID_ANY_LOCATION,
    'IPS': AVOID_ANY_LOCATION,
    'OFA': AVOID_ANY_LOCATION,
    'OPA': AVOID_LOCATION_NAMES,
    'OPA_only': AVOID_LOCATION_NAMES,
    'PPA': AVOID_LOCATION_NAMES,
    # 'PPA_only1': AVOID_LOCATION_NAMES,
    'PPA_only2': AVOID_LOCATION_NAMES,
    'RSC': '',
    'RSC_only': '',
    'pSTS': AVOID_ANY_LOCATION,
    'sPMv': AVOID_ANY_LOCATION,
}
# Print the first 50 top n-grams per exported ROI in a `"ROI": [...], ` layout.
for roi in rois:
    # Drop the variant digit so e.g. 'PPA_only2' reads the 'PPA_only' column.
    source_col = roi.replace('1', '').replace('2', '')
    top50 = top_ngrams_df[source_col].iloc[:50].values.tolist()
    print(f'"{roi}":', str(top50) + ', ')
# Hand-curated n-grams per exported ROI; stored in the export table under
# 'top_ngrams_module_correct'.
# Fixes relative to the previous version of the 'RSC_only' list:
#   * 'tenessee' -> 'tennessee' (misspelling; the printed top n-grams spell it 'tennessee')
#   * the second, duplicated 'indianapolis' entry was removed
top_ngrams_clean = {
    "RSC": ['drove from vermont', 'to washington', 'in manhattan', 'here in boston', 'off into vancouver', 'moved to chicago', 'was in mexico', 'arrived in indianapolis', 'came to florida', 'i left vermont'],
    "OPA": ['onto the railing', 'towards the river', 'onto the sidewalk', 'towards the doors', 'outside the windows', 'long hallway toward', 'to the horizon', 'towards the street', 'over the gulf', 'to my left', 'path that jutted', 'on the ceiling', 'on the windowsill', 'down this embankment', 'up those stairs', 'above the gulf', 'facing the beach'],
    "PPA": ['mile of cornfields', 'the windowsill', 'the rolling hills', 'beautiful moonlit mountains', 'giant stone cliffs', 'a strip mall', 'nondescript office buildings', 'manicured lawns', 'lakes', 'the dark driveway', 'and shimmering skyscrapers', 'a private beach', 'the leafy garden', 'our modest backyard', 'my dorm'],

    "RSC_only": ['florida', 'israel', 'london', 'marrakesh', 'indianapolis', 'paris', 'pennsylvania', 'tokyo', 'tennessee', 'boston', 'vermont', 'chicago'],
    "OPA_only": ['towards the ceiling', 'onto the railing', 'feet hanging over', 'towards the doors', 'seats behind', 'towards the door', 'lights peeking over', 'to my left', 'situated herself behind', 'you sit backward', 'to the horizon', 'maybe twelve feet', 'at the ceiling', 'towards the street', 'of seats behind', 'twenty feet above', 'his back turned', 'see the horizon', 'seats behind the', 'to my right', 'and high rafters', 'about twenty feet', 'door behind me', 'the door behind', 'toward the back', 'over his shoulder', 'feet above the', 'hands went underneath', 'towards the ground', 'his feet hanging', 'feet touch the', 'behind her and', 'stand in front', 'down one side', 'on opposite sides', 'over the ceiling', 'on either side'],
    # "PPA_only": ['kind of corny', 'his painting sucked', 'snake oil', 'liar fake', 'fake name', 'bad puns', 'as an insult', 'called baloney'],
    "PPA_only2": ['like burnt steak', 'like pudding', 'tasted pretty bad', 'stale baked goods', 'the crusts', 'baloney', 'yeast extract', 'a sandwich rejected'],

    "IPS": ['there were slats', 'four connected squares', 'in long rows', 'on the sides', 'a long narrow', 'that forms horizontal', 'long rows of', 'sixty foot wide', 'between buttered slices', 'mile thick ice', 'all four corners', 'along the top'],
    "pSTS": ['said excuse me', 'says excuse me', 'room went silent', 'someone shouted', 'i provoked gasps', 'somebody then yelled', 'she started laughing', 'excuse me', 'asked i laughed', 'exhalation someone shouted', 'retorted rather loudly', 'turned and said', 'hurry she exclaimed', 'i started yelling', 'say excuse me', 'i started laughing', 'interrupted the conversation', 'breath he yelled', 'moment she gasped', 'said guess what'],
    "sPMv": ['one', 'forty', 'april nineteen forty', 'was sixteen seventeen', 'five only twenty', 'three down', 'march twentieth nineteen', 'more time passed', 'fifteen meters fifty', "turning ninety", 'june of nineteen'],
    "EBA": ['wraps his arms', 'lifted her dress', 'arms flailing', 'hands gripped the', 'grabbed her legs', 'his hands folded', 'my feet kicking', 'navigated pushy elbows', 'elbows on knees', 'over his shoulder'],
    "OFA": ['of my childhood', 'newfound self esteem', 'so my shrink', 'hurtful first dates', 'recall many instances', 'it felt magical', 'answered many questions', 'my school days', 'no satisfying fantasies', 'my mom often', 'from our childhood', 'growing up we', 'good friends often', 'shaped their mind', 'everything my parents'],
}

"RSC": ['drove from vermont', 'moved to vermont', 'drove to washington', 'here in manhattan', 'here in boston', 'was in boston', 'off into vancouver', 'moved to chicago', 'back in manhattan', 'went to boston', 'was in mexico', 'back in boston', 'sitting in indianapolis', 'arrived in indianapolis', 'came to florida', 'i left vermont', 'here in houston', 'was in pennsylvania', 'moved to brooklyn', 'arrived in tokyo', 'moved to london', 'off in vancouver', 'traveled to marrakesh', 'moved to washington', "'m in michigan", 'back in brooklyn', 'i drove to', 'back in israel', 'in lower manhattan', 'nineties new york', 'hometown in texas', 'went to manchester', 'it was summer', 'upstate new york', 'suburbs of baltimore', 'camp in upstate', 'we were downtown', 'in nashville tennessee', 'drove out to', 'in downriver michigan', 'normal suburban pittsburgh', 'in upstate', 'were in paris', 'living in chicago', 'i drove out', 'i drove home', 'an hour south', 'go to vancouver', 'back in alabama', 'i 

In [10]:
# Base ROI for every exported entry: the text before the first underscore,
# so 'RSC_only' -> 'RSC' and 'PPA_only2' -> 'PPA'.
base_rois = [k.split('_')[0] for k in rois]

# Column-oriented table, one entry per exported ROI.
# NOTE(review): '*_only' entries take the stability score of their base ROI,
# although `stability_scores` is keyed by every df column -- confirm that
# reusing the base ROI's score is intended.
rows = {
    'roi': rois,
    'expl': [explanations_clean[k] for k in rois],
    'top_ngrams_module_correct': [top_ngrams_clean[k] for k in rois],
    'stability_score': [stability_scores[base] for base in base_rois],
    'subject': ['UTS02'] * len(rois),
    'voxel_nums': [rois_dict[base] for base in base_rois],
    'prompt_suffix': [explanation_avoid_suffixes[k] for k in rois],
}

In [11]:
# Persist the export table (one row per ROI) as a pickled DataFrame.
rows_df = pd.DataFrame(rows)
rows_df.to_pickle('rows_roi_uts02_may31.pkl')

In [12]:
# Render the export table for a visual check.
pd.DataFrame(data=rows)

Unnamed: 0,roi,expl,top_ngrams_module_correct,stability_score,subject,voxel_nums,prompt_suffix
0,RSC,Travel and location names,"[drove from vermont, to washington, in manhatt...",0.727578,UTS02,"[26313, 26368, 26369, 26370, 26423, 26424, 264...",
1,OPA,Direction and location descriptions,"[onto the railing, towards the river, onto the...",0.683813,UTS02,"[24026, 27029, 27030, 27031, 27075, 27076, 270...",Avoid mentioning any specific location names ...
2,PPA,Scenes and settings,"[mile of cornfields, the windowsill, the rolli...",0.417527,UTS02,"[9579, 9580, 9634, 11900, 11901, 11902, 11903,...",Avoid mentioning any specific location names ...
3,IPS,Descriptive elements of scenes or objects,"[there were slats, four connected squares, in ...",0.635485,UTS02,"[52728, 52729, 52730, 52781, 52782, 52783, 528...",Avoid mentioning any locations.
4,pSTS,Verbal interactions,"[said excuse me, says excuse me, room went sil...",0.601735,UTS02,"[26011, 26067, 26068, 26123, 26124, 26125, 291...",Avoid mentioning any locations.
5,sPMv,Time and numbers,"[one, forty, april nineteen forty, was sixteen...",0.564742,UTS02,"[71004, 71005, 71006, 71024, 71025, 71053, 710...",Avoid mentioning any locations.
6,EBA,Body parts,"[wraps his arms, lifted her dress, arms flaili...",0.623958,UTS02,"[10330, 10331, 10370, 10371, 10410, 12693, 126...",Avoid mentioning any locations.
7,OFA,Personal growth and reflection,"[of my childhood, newfound self esteem, so my ...",0.574794,UTS02,"[8133, 8161, 8189, 8190, 8221, 8222, 8223, 822...",Avoid mentioning any locations.
8,RSC_only,Location names,"[florida, israel, london, marrakesh, indianapo...",0.727578,UTS02,"[26313, 26368, 26369, 26370, 26423, 26424, 264...",
9,OPA_only,Spatial positioning and directions,"[towards the ceiling, onto the railing, feet h...",0.683813,UTS02,"[24026, 27029, 27030, 27031, 27075, 27076, 270...",Avoid mentioning any specific location names ...


In [30]:
# Largest voxel index referenced by any exported ROI.
# Iterates the per-ROI lists lazily instead of concatenating them with
# sum(..., []), which copies the growing list on every addition (quadratic).
max(v for voxel_list in rows['voxel_nums'] for v in voxel_list)

91536

In [13]:
# Reload the exported table from disk. From this point on `rows` is a
# DataFrame rather than the dict of lists it was built from.
rows = pd.read_pickle(filepath_or_buffer='rows_roi_uts02_may31.pkl')

In [17]:
# Number of voxel indices stored per ROI, shown next to the ROI name.
rows['lens'] = rows['voxel_nums'].map(len)
rows.loc[:, ['roi', 'lens']]

Unnamed: 0,roi,lens
0,RSC,526
1,OPA,582
2,PPA,429
3,IPS,1471
4,pSTS,800
5,sPMv,552
6,EBA,617
7,OFA,361
8,RSC_only,526
9,OPA_only,582
