# Setup

In [3]:
#### Dependencies ####
import numpy as np
import logging
import os
import sys
import time
import joblib
import matplotlib.pyplot as plt
import torch
import json
import cortex # This dependency is pycortex, which enables the plotting of flatmaps. It can be disabled.
from cvxopt import matrix, solvers # Only necessary for the stacked model.
from transformers import AutoTokenizer, AutoModelForCausalLM # Only necessary for feature extraction.
import subprocess
from tqdm.autonotebook import tqdm, trange

# Repository imports
from ridge_utils.ridge import bootstrap_ridge
import ridge_utils.npp
from ridge_utils.util import make_delayed
from ridge_utils.dsutils import make_word_ds
from ridge_utils.DataSequence import DataSequence
from ridge_utils.tokenization_helpers import generate_efficient_feat_dicts_opt
from ridge_utils.tokenization_helpers import convert_to_feature_mats_opt

# Topic model imports
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # For generating embeddings
from sklearn.decomposition import PCA # To speed up the UMAP
from sklearn.feature_extraction.text import CountVectorizer 
from bertopic.vectorizers import ClassTfidfTransformer
try:
    import en_core_web_sm
except ModuleNotFoundError:
    !python -m spacy download en_core_web_sm
    import en_core_web_sm
from bertopic.representation import PartOfSpeech, KeyBERTInspired, MaximalMarginalRelevance

## User-defined variables

In [4]:
## Paths for the story data and fMRI data
grids_path = "../data/story_data/grids_huge.jbl"
trfiles_path = "../data/story_data/trfiles_huge.jbl"

## Directories for the topic encoder's models and embeddings
model_dir = "../models"
embeddings_dir = "../embeddings"

## Name of the pre-trained model to use; model_path is its location inside model_dir
model_name = "test_model"
model_path = os.path.join(model_dir, model_name)

# Path to pre-computed embeddings, if any (None means no path has been set)
embeddings_path = None

## Download cortex data
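Here we automate the download of the brain models from OpenNeuro. The cell below runs the `curl` commands from the dataset's download script for the selected subjects, skipping files that already exist, and then points the pycortex filestore at the download directory.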

In [5]:
pycortex_download_script = "../ds003020-2.2.0.sh"
pycortex_dir = '../pycortex-db'

# Select which subjects to download
# (full list is ['UTS01', 'UTS02', 'UTS03', 'UTS04', 'UTS05', 'UTS06', 'UTS07', 'UTS08'])
subjects = ['UTS03']


def _curl_output_path(command):
    """Return the token that follows ``-o`` in a whitespace-split command line.

    Returns None when the command has no ``-o`` option or nothing follows it.
    """
    tokens = command.split()
    if '-o' not in tokens:
        return None
    target_index = tokens.index('-o') + 1
    return tokens[target_index] if target_index < len(tokens) else None


with open(pycortex_download_script, 'r') as f:
    pbar = tqdm(f)
    for line in pbar:
        # Only pycortex-db lines that mention one of the selected subjects are processed.
        if 'derivative/pycortex-db/UTS' not in line:
            continue
        if not any(subject in line for subject in subjects):
            continue

        # Rewrite the command so its output lands inside the local pycortex directory.
        output_command = line.replace(' derivative/pycortex-db/', ' ' + pycortex_dir + os.sep)
        output_file_path = _curl_output_path(output_command)

        if output_file_path is None:
            # No output path could be determined, so the command is not run
            # and the progress bar says why (instead of "File None already exists").
            pbar.set_description("No -o output path found in command. Skipping.")
        elif os.path.exists(output_file_path):
            pbar.set_description(f"File {output_file_path} already exists. Skipping download.")
        else:
            # NOTE(review): the command text comes straight from the script file and is
            # executed through the shell; only use a download script from a trusted source.
            result = subprocess.run(output_command, shell=True)
            if result.returncode != 0:
                logging.warning(
                    "Download command exited with status %s: %s",
                    result.returncode, output_command.strip(),
                )
                # The file did not exist before this attempt, so anything at that path
                # now is a leftover from the failed command. Remove it so that a re-run
                # retries the download instead of skipping it as "already exists".
                if os.path.isfile(output_file_path):
                    os.remove(output_file_path)

# Store the absolute path of the download directory as pycortex's filestore option...
new_filestore_path = os.path.join(os.getcwd(), pycortex_dir)
cortex.options.config.set('basic', 'filestore', new_filestore_path)
# ...then apply that option to the live database object and reload its subjects.
cortex.db.filestore = cortex.options.config.get('basic', 'filestore')
cortex.db.reload_subjects()
cortex.db

0it [00:00, ?it/s]

Pycortex database
  Subjects:
   UTS01
   UTS02
   UTS03

## GPU Setup

Selects the compute device: Apple's MPS backend if it is available, otherwise CUDA, otherwise the CPU. The biggest benefit will be on CUDA systems; some benefits exist on macOS.

In [6]:
# Choose the compute device in priority order: MPS, then CUDA, then CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Load a model

In [7]:
# Load the saved BERTopic model from model_path, then display its topic info table
# as the cell output.
topic_model = BERTopic.load(model_path)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,66961,-1_lived_lucky_bedroom_like little,"[lived, lucky, bedroom, like little, girls, la...",
1,0,8265,0_proud_mike_uh uh_um uh,"[proud, mike, uh uh, um uh, michael, uh um, fr...",
2,1,6293,1_channel_love_english_loves,"[channel, love, english, loves, loving, sight,...",
3,2,2463,2_said yes_said yeah_uh said_said uh,"[said yes, said yeah, uh said, said uh, said s...",
4,3,1869,3_patients_doctor_dr_patient,"[patients, doctor, dr, patient, hospital, brig...",
...,...,...,...,...,...
459,458,11,458_needs_wants_desire_things,"[needs, wants, desire, things, sentence, neede...",
460,459,11,459_shelter_slept_history_mean,"[shelter, slept, history, mean, women, months,...",
461,460,11,460_cared_care_don care_don worry,"[cared, care, don care, don worry, worry, real...",
462,461,10,461_office_deliver_notes_rolling,"[office, deliver, notes, rolling, note, write,...",


In [9]:
# Call the loaded model's topic visualization; the result is shown as the cell output.
topic_model.visualize_topics()

In [11]:
# Example sentences used to try out the loaded topic model.
test_strings = [
    "The ocean's vast expanse stretches endlessly, meeting the sky at a distant horizon.",
    "Waves crash rhythmically against the sandy shore, creating a soothing symphony of sounds.",
    "Sunlight sparkles on the water's surface, creating a tapestry of shimmering light.",
    "Seagulls glide gracefully overhead, their calls echoing the mystery of the sea.",
    "The salty breeze brushes against your skin, invigorating your senses with its freshness.",
    "Children build elaborate sandcastles near the water’s edge, their laughter mingling with the ocean's roar.",
    "The beach at sunset offers a breathtaking view, with hues of orange and pink painting the sky.",
    "Collecting seashells along the shore, each one tells a unique story from beneath the waves.",
    "Surfers challenge the mighty waves, skillfully maneuvering their boards in sync with the ocean’s rhythm.",
    "The smell of the ocean, a mix of salt and mystery, fills the air, instantly relaxing anyone who breathes it in."
]

# Run the strings through the model; the result is unpacked into the topic
# assignments (`topics`) and a second value (`probs`) that is not used in this cell.
topics, probs = topic_model.transform(test_strings)

In [17]:
# Display the topic ids returned by transform for the test strings.
topics

array([ 83, 220,  81,  47,  79,  16,  81,   7, 220,  79])

In [29]:
# Check a single topic: print one test string and show the info row for the topic
# the model assigned to it.
example_idx = 5  # not named `id`, which would shadow the builtin for the rest of the session
print(test_strings[example_idx])
topic_model.get_topic_info(topics[example_idx])

Children build elaborate sandcastles near the water’s edge, their laughter mingling with the ocean's roar.


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,16,983,16_child_daughter_children_year old,"[child, daughter, children, year old, infant, ...",
