https://huggingface.co/jxm/cde-small-v1


In [1]:
import transformers
import polars as pl
import os

tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "mps"

model = transformers.AutoModel.from_pretrained(
    "jxm/cde-small-v1", trust_remote_code=True
)

_ = model.to(device)

<All keys matched successfully>
<All keys matched successfully>


Disabled 37 dropout modules from model type <class 'transformers_modules.jxm.cde-small-v1.9e2ed1d8d569d34458913d2d246935c1b2324d11.model.BiEncoder'>
modified 12 rotary modules – set rotary_start_pos to 512
Disabled 74 dropout modules from model type <class 'transformers_modules.jxm.cde-small-v1.9e2ed1d8d569d34458913d2d246935c1b2324d11.model.DatasetTransformer'>


In [3]:
model.device

device(type='mps', index=0)

In [None]:
data_dir = "/Users/maxwoolf/Downloads"

df = pl.read_parquet(os.path.join(data_dir, "test_movie_json_input.parquet"))
df = df.sample(fraction=1.0, shuffle=True, seed=42)
df

tconst,startYear,averageRating,json
str,i64,f64,str
"""tt27447581""",2023,6.0,"""{""title"":""All About the Little…"
"""tt0126449""",1972,8.0,"""{""title"":""Then I Sentenced The…"
"""tt2460440""",2012,8.9,"""{""title"":""A Film About Kids an…"
"""tt13358878""",2024,3.5,"""{""title"":""Succubus"",""genres"":[…"
"""tt28128599""",2024,4.5,"""{""title"":""Midas"",""genres"":[""Ac…"
…,…,…,…
"""tt2007409""",2010,6.0,"""{""title"":""Retribution"",""genres…"
"""tt4211840""",2015,5.8,"""{""title"":""Seven Dorms of Death…"
"""tt0205844""",1950,7.6,"""{""title"":""Coriolan"",""genres"":n…"
"""tt2177629""",2013,4.5,"""{""title"":""Pursuit of Love"",""ge…"


## First Stage


In [7]:
import torch
from tqdm.autonotebook import tqdm
import random

In [8]:
df_subset = df.sample(n=1600, shuffle=True, seed=42)
json_docs = df_subset["json"].to_list()

json_docs[:5]

['{"title":"Le business du bonheur","genres":["Documentary"],"is_adult":false,"release_year":2022,"runtime_minutes":52,"directors":["Jean-Christophe Ribot"],"writers":["Claire Alet","Jean-Christophe Ribot"],"producers":["Ga\\u00eblle Guyader"],"actors":["Lison Riess"],"principals":[{"Jean-Christophe Cheneval":"composer"},{"Nathana\\u00ebl Louvet":"cinematographer"},{"C\\u00e9dric Defert":"editor"}]}',
 '{"title":"Excuses!","genres":["Comedy","Drama"],"is_adult":false,"release_year":2003,"runtime_minutes":null,"directors":["Pep Anton G\\u00f3mez"],"writers":["Pep Anton G\\u00f3mez","Joel Joan","Jordi S\\u00e1nchez"],"producers":[],"actors":["M\\u00f2nica Glaenzel","Teresa S\\u00e1nchez","Joel Joan","Jordi S\\u00e1nchez"],"principals":[]}',
 '{"title":"The Secret","genres":["Biography","Documentary"],"is_adult":false,"release_year":2009,"runtime_minutes":63,"directors":["Pontus Hjorth\\u00e9n","Martin J\\u00f6nsson"],"writers":[],"producers":["Kalle Gustafsson Jerneholm"],"actors":[],"pr

In [9]:
document_prefix = "search_document: "

minicorpus_size = model.config.transductive_corpus_size
minicorpus_docs = random.sample(json_docs, minicorpus_size)
len(minicorpus_docs)

512

In [10]:
minicorpus_tokens = tokenizer(
    [document_prefix + doc for doc in minicorpus_docs],
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
).to(device)

minicorpus_tokens["input_ids"].size()

torch.Size([512, 378])

In [11]:
dataloader = torch.utils.data.DataLoader(minicorpus_docs, batch_size=32, shuffle=False)

dataset_embeddings = []
for batch in tqdm(dataloader, smoothing=0):
    minicorpus_tokens = tokenizer(
        [document_prefix + doc for doc in batch],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        dataset_embeddings.append(model.first_stage_model(**minicorpus_tokens))

dataset_embeddings = torch.cat(dataset_embeddings)
dataset_embeddings.size()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 16/16 [00:24<00:00,  1.51s/it]


torch.Size([512, 768])

In [12]:
dataloader = torch.utils.data.DataLoader(json_docs, batch_size=32, shuffle=False)

doc_embeddings = []
for batch in tqdm(dataloader, smoothing=0):
    docs_batch = tokenizer(
        [document_prefix + doc for doc in batch],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        doc_embeddings_batch = model.second_stage_model(
            input_ids=docs_batch["input_ids"],
            attention_mask=docs_batch["attention_mask"],
            dataset_embeddings=dataset_embeddings,
        )
        doc_embeddings.append(doc_embeddings_batch)

doc_embeddings = torch.cat(doc_embeddings)
doc_embeddings /= doc_embeddings.norm(p=2, dim=1, keepdim=True)
doc_embeddings.size()

100%|██████████| 50/50 [03:11<00:00,  3.83s/it]


torch.Size([1600, 768])

In [13]:
df_subset_2 = df_subset.with_columns(embeds=doc_embeddings.cpu().numpy())
df_subset_2

tconst,startYear,averageRating,json,embeds
str,i64,f64,str,"array[f32, 768]"
"""tt21937348""",2022,6.8,"""{""title"":""Le business du bonhe…","[-0.012958, 0.03246, … -0.037463]"
"""tt0425976""",2003,5.0,"""{""title"":""Excuses!"",""genres"":[…","[-0.009819, -0.025217, … -0.069853]"
"""tt1581629""",2009,6.8,"""{""title"":""The Secret"",""genres""…","[-0.013838, 0.008304, … -0.051282]"
"""tt1707240""",2010,6.2,"""{""title"":""Lys"",""genres"":[""Dram…","[0.022719, 0.042718, … -0.061914]"
"""tt32378615""",2024,7.7,"""{""title"":""We Should Make Movie…","[0.041895, 0.016391, … -0.022129]"
…,…,…,…,…
"""tt0463960""",2013,3.3,"""{""title"":""The Devil You Know"",…","[0.01918, 0.006489, … -0.03445]"
"""tt5865148""",2016,6.1,"""{""title"":""Brett Gelman's Dinne…","[0.015193, 0.030122, … -0.061412]"
"""tt0185883""",1949,6.7,"""{""title"":""Aoi sanmyaku"",""genre…","[-0.020236, 0.034688, … -0.058068]"
"""tt27436518""",2022,4.8,"""{""title"":""The Legacy"",""genres""…","[-0.019275, 0.007522, … -0.029518]"


In [14]:
df_subset_2.write_parquet(os.path.join(data_dir, "movie_embeds.parquet"))