In [15]:
import polars as pl
from transformers import BertTokenizer, BertModel
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [16]:
# Check if GPU is available and move the model to GPU if it is
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [17]:
def get_mean_bert_embedding(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # Move input tensors to the same device as the model
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    
    # Use no_grad to prevent gradient computation and save memory
    with torch.no_grad():
        output = model(**encoded_input)
        sentence_embedding = output.last_hidden_state.mean(dim=1)
    
    # Move back to CPU for numpy conversion and clear GPU cache
    result = sentence_embedding.cpu().detach().numpy()
    torch.cuda.empty_cache()
    return result

In [18]:
df = pl.scan_csv('../../data/clean/rotten_tomatoes_movie_details_clean.csv')

In [19]:
df.collect().shape

(17661, 4)

In [20]:
def embed_batch(df: pl.DataFrame) -> pl.DataFrame:
    # Ensure pure Python strings
    texts = [str(x) for x in df["description"].to_list()]
    
    # Process in smaller chunks to avoid OOM
    chunk_size = 32  # Adjust this based on your GPU memory
    embeddings = []
    
    for i in range(0, len(texts), chunk_size):
        chunk = texts[i:i+chunk_size]
        emb = get_mean_bert_embedding(chunk)
        embeddings.append(emb)
    
    # Concatenate all embeddings
    import numpy as np
    all_embeddings = np.vstack(embeddings)
    
    return pl.DataFrame({"embedding": [row.tolist() for row in all_embeddings]}, schema={"embedding": pl.List(pl.Float64)})

In [21]:
out = df.map_batches(embed_batch, schema={"embedding": pl.List(pl.Float64)})


In [22]:
out.head(1000).collect()

embedding
list[f64]
"[-0.147802, -0.075259, … 0.18019]"
"[-0.448375, -0.116364, … -0.276394]"
"[-0.050962, -0.149426, … -0.045833]"
"[0.023433, -0.272838, … -0.095659]"
"[-0.078904, 0.054318, … -0.25677]"
…
"[0.065718, 0.039937, … -0.165047]"
"[-0.139544, 0.046323, … -0.067661]"
"[0.355277, 0.034353, … -0.106431]"
"[0.089642, -0.320632, … -0.22597]"


In [23]:
df_e = out.collect()

KeyboardInterrupt: 

In [24]:
out = df.select(pl.col("description").map_batches(embed_batch, return_dtype=pl.List(pl.Float64))).head(10)

In [25]:
print("type(out):", type(out))
print("type(df):", type(df))
import polars as pl
print("polars version:", pl.__version__)

type(out): <class 'polars.lazyframe.frame.LazyFrame'>
type(df): <class 'polars.lazyframe.frame.LazyFrame'>
polars version: 1.35.2


In [176]:
df_e = out.collect()  

In [139]:
out = df.select(pl.col('description').map_batches(get_mean_bert_embedding)).head(100).collect()

In [26]:
out.estimated_size("mb") * (17661/100)

AttributeError: 'LazyFrame' object has no attribute 'estimated_size'