<a href="https://colab.research.google.com/github/nsdumont/encoding-model-semantic-modularity/blob/main/BerTopic_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
  from google.colab import drive
  in_colab = True
except:
  in_colab = False

if in_colab:
  !pip install bertopic
  import os
  drive.mount('/content/drive')
  os.chdir('/content/drive/My Drive/encoding-model-semantic-modularity')

Collecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting cython<3,>=0.27 (from hdbscan>=0.8.29->bertopic)
  Dow

In [16]:
import numpy as np
import joblib
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from bertopic import BERTopic
from bertopic.representation import ZeroShotClassification
from ridge_utils.dsutils import make_word_ds

In [21]:
# Both .jbl files live in the story_data folder of the Box.
# NOTE(review): joblib.load unpickles its input, so only load files from a trusted source.
grids = joblib.load("grids_huge.jbl")      # TextGrids containing story annotations
trfiles = joblib.load("trfiles_huge.jbl")  # TRFiles containing TR information

# Stories used in this tutorial, split into training and held-out test sets.
train_stories = [
    'adollshouse', 'adventuresinsayingyes', 'alternateithicatom', 'avatar',
    'buck', 'exorcism', 'eyespy', 'fromboyhoodtofatherhood',
    'hangtime', 'haveyoumethimyet', 'howtodraw', 'inamoment',
    'itsabox', 'legacy', 'naked', 'odetostepfather',
    'sloth', 'souls', 'stagefright', 'swimmingwithastronauts',
    'thatthingonmyarm', 'theclosetthatateeverything', 'tildeath', 'undertheinfluence',
]
test_stories = ["wheretheressmoke"]
all_stories = train_stories + test_stories

# Remove every story outside the tutorial subset from both dictionaries.
# Iterate over a snapshot of the keys because entries are deleted in the loop.
for story in tuple(grids.keys()):
    if story in all_stories:
        continue
    del grids[story]
    del trfiles[story]

# Build the per-story word data sequences from the remaining annotations.
wordseqs = make_word_ds(grids, trfiles)

# Load the OPT tokenizer and causal language model from the same checkpoint.
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def tokenize_text(text_list, tokenizer):
    """Tokenize every text in ``text_list``.

    Args:
        text_list: Iterable of strings to tokenize.
        tokenizer: Object exposing a ``tokenize(text)`` method.

    Returns:
        A list with one ``tokenizer.tokenize(text)`` result per input text,
        in input order.
    """
    per_text_tokens = []
    for text in text_list:
        per_text_tokens.append(tokenizer.tokenize(text))
    return per_text_tokens

# One space-joined string per story, ordered as in all_stories.
# NOTE(review): presumably `.data` holds the story's word sequence — confirm against make_word_ds.
text_data = [" ".join(wordseqs[story_name].data) for story_name in all_stories]

# Split every story transcript into tokenizer tokens.
tokenized_texts = tokenize_text(text_list=text_data, tokenizer=tokenizer)

# Create sliding windows of tokens
def sliding_windows(tokens, window_length, stride):
    """Cut each token sequence into overlapping, space-joined windows.

    Args:
        tokens: Iterable of token sequences (e.g. one list of string tokens
            per story).
        window_length: Number of tokens in every window; must be >= 1.
        stride: Offset in tokens between the starts of consecutive windows;
            must be >= 1.

    Returns:
        A flat list of strings, one per window, in input order. Each window
        is its tokens joined by single spaces. A sequence shorter than
        ``window_length`` contributes no windows.

    Raises:
        ValueError: If ``window_length`` or ``stride`` is smaller than 1.
    """
    # Reject parameters that would otherwise silently produce empty or
    # meaningless windows (window_length <= 0) or no windows (stride < 0).
    if window_length < 1:
        raise ValueError(f"window_length must be >= 1, got {window_length}")
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    return [
        " ".join(token_list[start:start + window_length])
        for token_list in tokens
        # The last valid start leaves exactly window_length tokens remaining.
        for start in range(0, len(token_list) - window_length + 1, stride)
    ]

# Windowing parameters: tokens per window and step between window starts.
window_length, stride = 64, 1

# Flatten all stories into one list of space-joined token windows.
windows = sliding_windows(
    tokens=tokenized_texts,
    window_length=window_length,
    stride=stride,
)

In [26]:
# Fit a default BERTopic model on the first 100 windows only.
# The default pipeline is clustering-based and did not give good topics
# (at least when applied to only one story).
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(documents=windows[:100])

# Disabled alternative: label topics with a zero-shot classifier on top of the
# default pipeline.
# NOTE(review): this passes model_name (the OPT checkpoint) as the zero-shot
# model; confirm it is suitable for zero-shot classification before enabling.
# # Create representation model
# candidate_topics = ["people", "places", "numbers", "visual", "touch", "concepts", "mental", "social", "actions", "animals", "objects", "nature"]
# representation_model = ZeroShotClassification(candidate_topics, model=model_name)

# # Use the representation model in BERTopic on top of the default pipeline
# topic_model = BERTopic(representation_model=representation_model)

In [27]:
# Tabulate the fitted model's topics (Topic, Count, Name, Representation,
# Representative_Docs). Leaving `df` as the last expression renders the table
# as the cell output.
df = topic_model.get_topic_info()
df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8,-1_never_like_ia_met,"[never, like, ia, met, anybody, brought, home,...",[Ġgiving Ġme Ġshit Ġlike Ġhow Ġcome Ġi Ġhaven ...
1,0,21,0_like_meet_you_didn,"[like, meet, you, didn, never, to, home, their...",[Ġwas Ġlike Ġthirty Ġyears Ġold Ġnever Ġmet Ġb...
2,1,20,1_at_like_sitting_you,"[at, like, sitting, you, meet, shit, staring, ...",[Ġhome Ġi Ġnever Ġi Ġdidn 't Ġever Ġlike Ġmeet...
3,2,19,2_never_parents_ia_she,"[never, parents, ia, she, my, was, and, met, b...",[Ġwanted Ġto Ġmeet Ġmy Ġparents Ġshe Ġwas Ġrea...
4,3,17,3_never_you_always_like,"[never, you, always, like, know, anybody, brou...",[Ġhow Ġcome Ġi Ġhaven Ġmet Ġyour Ġparents Ġa i...
5,4,15,4_dating_this_uh_she,"[dating, this, uh, she, and, like, ia, my, was...",[Ġthis Ġgirl Ġlike Ġuh Ġlike Ġtwo Ġyears Ġago ...


In [28]:
# Show BERTopic's topic visualization for the fitted model; the returned
# figure is displayed because it is the cell's last expression.
topic_model.visualize_topics()


In [29]:
# Show BERTopic's heatmap visualization for the fitted model; the returned
# figure is displayed because it is the cell's last expression.
topic_model.visualize_heatmap()
