# Setup

Building `flash_attn` from source is very slow, so install a precompiled wheel instead. Pick the wheel that matches your PyTorch and Python versions.

In [1]:
# Uncomment to install the dependencies in a fresh environment.
# The flash_attn wheel below is the one whose filename is tagged cu12 / torch2.6 /
# cp310 / linux_x86_64; swap it for the build that matches your setup.
# !uv pip install polars transformers einops torch accelerate
# !pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl

In [2]:
# For Colab: uncomment to authenticate the current user.
# NOTE(review): presumably needed for the gsutil upload at the end of the notebook — confirm.

# from google.colab import auth
# auth.authenticate_user()

The preprocessed JSON parquet is stored in a private bucket; replace it with your own.

In [1]:
import transformers
import polars as pl
import os
import torch
from tqdm import tqdm

import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

In [2]:
# Load the preprocessed movie records and shuffle every row with a fixed seed,
# so the row order is reproducible from run to run.
df = pl.read_parquet("test_movie_json_input.parquet").sample(
    fraction=1.0, shuffle=True, seed=42
)
df

tconst,startYear,numVotes,averageRating,json
str,i64,i64,f64,str
"""tt2147199""",2011,193,6.4,"""{  ""title"": ""The Gathering! 2…"
"""tt16287710""",2024,93,5.8,"""{  ""title"": ""Crypto Shadows"",…"
"""tt0050919""",1957,633,7.9,"""{  ""title"": ""Rio, Zona Norte""…"
"""tt10262346""",2019,44,5.2,"""{  ""title"": ""Pitkä perjantai""…"
"""tt0057525""",1963,545,7.9,"""{  ""title"": ""Mountain of Fear…"
…,…,…,…,…
"""tt3982768""",2016,83,4.9,"""{  ""title"": ""Revenge"",  ""gen…"
"""tt0480765""",2004,42,7.5,"""{  ""title"": ""The Making of 't…"
"""tt2118609""",2012,117,7.5,"""{  ""title"": ""Death Metal Ango…"
"""tt2051941""",2011,4802,7.0,"""{  ""title"": ""Men in Hope"",  …"


In [3]:
# Checkpoint to embed with, and the device the encoder runs on.
model_path = "Alibaba-NLP/gte-modernbert-base"
device = "cuda:0"

# Load the tokenizer and the encoder, then move the encoder onto `device`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path).to(device)

# Let float32 matmuls use the faster "high" precision setting.
torch.set_float32_matmul_precision("high")

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [4]:
# Pull the JSON document strings out of the frame as a plain Python list.
docs = df.get_column("json").to_list()

# print() rather than a bare expression, so the multi-line JSON renders readably.
print(docs[0])

{
  "title": "The Gathering! 2",
  "genres": [
    "Comedy",
    "Drama"
  ],
  "is_adult": false,
  "release_year": 2011,
  "runtime_minutes": 119,
  "directors": [
    "Nia Di Nata"
  ],
  "writers": [
    "Nia Di Nata"
  ],
  "producers": [
    "Nia Di Nata"
  ],
  "actors": [
    "Cut Mini Theo",
    "Tora Sudiro",
    "Surya Saputra",
    "Aida Nurmala",
    "Rachel Maryam Sayidina",
    "Atiqah Hasiholan",
    "Rio Dewanto",
    "Edward Gunawan",
    "Adinia Wirasti",
    "Keiko Marwan"
  ],
  "principals": [
    {
      "Mondo Gascaro": "composer"
    },
    {
      "Bembi Gusti": "composer"
    },
    {
      "Aghi Narottama": "composer"
    },
    {
      "Yudi Datau": "cinematographer"
    },
    {
      "Lucky Kuswandi": "editor"
    }
  ]
}


In [5]:
# Sanity check: tokenize only the first document and inspect the encoding.
tokenized_docs = tokenizer(
    docs[0],
    max_length=8192,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
# Move the encoded tensors to the model's device before displaying them.
tokenized_docs = tokenized_docs.to(device)

tokenized_docs

{'input_ids': tensor([[50281,    92,   187, 50276,     3,  5564,  1381,   346,   510,   443,
         44627,     2,   374,   995,   187, 50276,     3,  1541,   373,  1381,
           544,   187, 50274,     3,  2115,  6368,   995,   187, 50274,     3,
          9034,  2902,     3,   187, 50276,  1092,   187, 50276,     3,   261,
            64, 50006,  1381,  3221,    13,   187, 50276,     3, 16690,    64,
          2913,  1381,  4332,    13,   187, 50276,     3, 21005,    64, 32117,
          1381, 12035,    13,   187, 50276,     3, 18711,   641,  1381,   544,
           187, 50274,     3,    47,   571,  6129,   427,   682,     3,   187,
         50276,  1092,   187, 50276,     3, 34782,  1381,   544,   187, 50274,
             3,    47,   571,  6129,   427,   682,     3,   187, 50276,  1092,
           187, 50276,     3,  5551,  8964,  1381,   544,   187, 50274,     3,
            47,   571,  6129,   427,   682,     3,   187, 50276,  1092,   187,
         50276,     3, 46435,  1381,  

In [6]:
# Embed every document in fixed-size batches and collect the vectors on the CPU.
BATCH_SIZE = 32
MAX_TOKENS = 8192  # longer documents are truncated by the tokenizer

# `docs` is a list of Python strings, so there are no tensors for the loader to
# pin; it is only used to chunk the documents in their existing order.
dataloader = torch.utils.data.DataLoader(docs, batch_size=BATCH_SIZE, shuffle=False)

embedding_batches = []
for batch in tqdm(dataloader, smoothing=0):
    tokenized_batch = tokenizer(
        batch, max_length=MAX_TOKENS, padding=True, truncation=True, return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**tokenized_batch)
        # Keep the first-token hidden state of each sequence as its embedding
        # and move it off the GPU right away so GPU memory does not accumulate.
        embedding_batches.append(outputs.last_hidden_state[:, 0].cpu())

# Stack the per-batch results and L2-normalize each row.
dataset_embeddings = torch.cat(embedding_batches)
dataset_embeddings = F.normalize(dataset_embeddings, p=2, dim=1)

# The embeddings are later attached to the frame row by row, so the counts must match.
assert dataset_embeddings.size(0) == len(docs), "expected one embedding per document"
dataset_embeddings.size()

  0%|          | 0/7580 [00:00<?, ?it/s]W0510 17:33:56.160000 23614 site-packages/torch/_inductor/utils.py:1250] [1/0] Not enough SMs to use max_autotune_gemm mode
100%|██████████| 7580/7580 [21:07<00:00,  5.98it/s]


torch.Size([242552, 768])

In [7]:
# Attach one embedding row per movie. This relies on `dataset_embeddings` being
# in the same row order as `df`; the frame is sorted by id only afterwards.
embedding_matrix = dataset_embeddings.cpu().numpy()
df_2 = df.with_columns(embedding=embedding_matrix).sort("tconst")

df_2

tconst,startYear,numVotes,averageRating,json,embedding
str,i64,i64,f64,str,"array[f32, 768]"
"""tt0000009""",1894,224,5.4,"""{  ""title"": ""Miss Jerry"",  ""…","[-0.007815, -0.022642, … 0.005391]"
"""tt0000147""",1897,558,5.3,"""{  ""title"": ""The Corbett-Fitz…","[0.012021, 0.014255, … -0.015754]"
"""tt0000574""",1906,985,6.0,"""{  ""title"": ""The Story of the…","[-0.010052, -0.015825, … 0.040161]"
"""tt0000591""",1907,31,5.6,"""{  ""title"": ""The Prodigal Son…","[0.00765, 0.019661, … -0.010763]"
"""tt0000630""",1908,33,3.2,"""{  ""title"": ""Hamlet"",  ""genr…","[0.03492, 0.00301, … 0.027586]"
…,…,…,…,…,…
"""tt9915790""",2019,45,7.0,"""{  ""title"": ""Bobbyr Bondhura""…","[-0.008241, -0.024547, … -0.014563]"
"""tt9916160""",2019,52,6.2,"""{  ""title"": ""Drømmeland"",  ""…","[-0.014737, -0.035892, … 0.027569]"
"""tt9916190""",2020,263,3.6,"""{  ""title"": ""Safeguard"",  ""g…","[0.014303, -0.018036, … -0.008043]"
"""tt9916270""",2020,1507,5.8,"""{  ""title"": ""Il talento del c…","[0.02358, -0.024546, … 0.017486]"


In [8]:
# Persist the movie metadata together with its embedding column.
df_2.write_parquet("movie_data_plus_embeds_all.parquet")

In [9]:
# Copy the parquet file to a Google Cloud Storage bucket
# (replace the bucket name with your own).
!gsutil cp movie_data_plus_embeds_all.parquet gs://maxw-imdb-embeddings/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Copying file://movie_data_plus_embeds_all.parquet [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [1 files][702.0 MiB/702.0 MiB]                                                
Operation completed over 1 objects/702.0 MiB.                                    
