In [192]:
import polars as pl
from transformers import BertTokenizer, BertModel
# Single source of truth for the checkpoint, so the tokenizer's vocabulary and
# the model's weights cannot drift apart when the checkpoint is changed.
MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)



In [136]:
def get_mean_bert_embedding(texts, batch_size=32):
    """Embed texts as the mean of BERT's last hidden states over real tokens.

    Parameters
    ----------
    texts : str or iterable of str
        A single string is treated as a batch of one; any other iterable
        (list, tuple, polars Series, ...) is materialised with ``list()``.
    batch_size : int, default 32
        Number of texts sent through the model per forward pass, so that a
        large input does not have to fit in memory as one padded batch.

    Returns
    -------
    numpy.ndarray
        One row per input text and ``model.config.hidden_size`` columns.
        An empty input yields an array with zero rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.

    Notes
    -----
    Padding positions are excluded from the mean via the tokenizer's
    attention mask, so a text's embedding does not depend on how long the
    other texts in the same batch are.
    """
    # Local import: torch is not imported by the notebook's import cell.
    import torch

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = [texts] if isinstance(texts, str) else list(texts)

    pooled_chunks = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            encoded_input = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
            output = model(**encoded_input)
            hidden = output.last_hidden_state

            # attention_mask is 1 for real tokens and 0 for padding; broadcast
            # it over the hidden dimension so padded positions contribute nothing.
            mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            pooled_chunks.append(token_sum / token_count)

    if not pooled_chunks:
        return torch.empty((0, model.config.hidden_size)).numpy()
    return torch.cat(pooled_chunks, dim=0).cpu().numpy()

In [30]:
# scan_csv builds a LazyFrame over the cleaned movie-details CSV, so later
# cells have to call .collect() to materialise any rows.
df = pl.scan_csv('../../data/clean/rotten_tomatoes_movie_details_clean.csv')

In [164]:
# Materialise the frame just to inspect its (rows, columns) shape.
df.collect().shape

(17661, 4)

In [45]:
# Back-of-envelope arithmetic. 17661 is the row count of the dataset;
# presumably this estimates total minutes at ~1.9 s per batch of 5 rows — TODO confirm.
((17661/5)*1.9)/60

111.853

In [71]:
# Back-of-envelope arithmetic. 17661 is the row count of the dataset;
# presumably this estimates total minutes at ~30.3 s per batch of 100 rows — TODO confirm.
(30.3 * (17661/100))/60

89.18805

In [152]:
import polars as pl
import numpy as np

def embed_batch(df: pl.DataFrame) -> pl.DataFrame:
    """Embed the ``description`` column and return it as a fixed-width array column.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing a ``description`` column of texts.

    Returns
    -------
    pl.DataFrame
        A single column ``embedding`` of dtype ``pl.Array(pl.Float64, width)``,
        one row per input row, where ``width`` is the number of columns
        returned by ``get_mean_bert_embedding``.

    Notes
    -----
    A later cell in this notebook defines ``embed_batch`` again; whichever
    definition is executed last is the one other cells will call.
    """
    texts = df["description"].to_list()
    emb = get_mean_bert_embedding(texts)  # numpy array, one row per text

    # Convert numpy rows -> Python lists (Polars can store these)
    emb_as_lists = [row.tolist() for row in emb]

    # pl.Array takes the element dtype first and the fixed width second.
    # The width is read from the array instead of hard-coding 768 so the
    # schema always matches what the model actually produced.
    return pl.DataFrame(
        {"embedding": emb_as_lists},
        schema={"embedding": pl.Array(pl.Float64, emb.shape[1])},
    )


In [186]:
def embed_batch(df: pl.DataFrame) -> pl.DataFrame:
    """Embed the ``description`` column of ``df``.

    Every value is passed through ``str()`` first, so a missing value becomes
    the literal text ``"None"`` before it is embedded.

    Returns a one-column frame named ``embedding`` holding, for each input
    row, the embedding vector as a list of Python floats.
    """
    descriptions = list(map(str, df["description"].to_list()))
    vectors = get_mean_bert_embedding(descriptions)  # one row per description

    embedding_rows = []
    for vector in vectors:
        embedding_rows.append(vector.tolist())

    return pl.DataFrame({"embedding": embedding_rows})


In [187]:
# LazyFrame.map_batches assumes the function leaves the schema unchanged unless
# told otherwise. embed_batch replaces every input column with a single
# "embedding" column of float lists, so the output schema is declared here.
# Projection and predicate pushdown are switched off because "embedding" does
# not exist upstream of embed_batch. Slice pushdown stays on: embed_batch
# returns exactly one row per input row, so slicing first is equivalent.
out = df.map_batches(
    embed_batch,
    schema={"embedding": pl.List(pl.Float64)},
    predicate_pushdown=False,
    projection_pushdown=False,
)


In [191]:
# Materialise only the first row of `out` as a quick look at the result.
out.head(1).collect()

In [189]:
# Execute the lazy pipeline behind `out` and keep the materialised result.
# Note: another cell also assigns df_e; whichever runs last wins.
df_e = out.collect()

In [175]:
# Expr.map_batches hands the function a Series and expects a Series back,
# whereas embed_batch takes a DataFrame with a "description" column and
# returns a DataFrame. The lambda adapts between the two.
# head(10) is applied *before* the select so only those 10 rows are embedded
# instead of embedding the whole column and discarding the rest.
out = (
    df.head(10)
    .select(
        pl.col("description")
        .map_batches(
            lambda s: embed_batch(s.to_frame("description"))["embedding"],
            return_dtype=pl.List(pl.Float64),
        )
        .alias("embedding")
    )
)

In [177]:
import polars as pl

# Debug printout: which frame types `out` and `df` currently are, and which
# Polars version is installed.
print(f"type(out): {type(out)}")
print(f"type(df): {type(df)}")
print(f"polars version: {pl.__version__}")

type(out): <class 'polars.lazyframe.frame.LazyFrame'>
type(df): <class 'polars.lazyframe.frame.LazyFrame'>
polars version: 1.35.2


In [176]:
# Execute the lazy pipeline behind `out` and keep the materialised result.
# Note: another cell also assigns df_e; whichever runs last wins.
df_e = out.collect()

In [139]:
# Take the 100-row sample *before* embedding so the model only runs on those
# rows rather than on the whole column.
# Expr.map_batches passes a Series, so it is converted to a plain list of
# strings for the tokenizer, and the returned 2-D array is wrapped back into
# a Series under the original column name.
# return_dtype assumes float32 output with 768 columns (bert-base) — adjust
# if a different checkpoint or precision is used.
out = (
    df.head(100)
    .select(
        pl.col('description').map_batches(
            lambda s: pl.Series(s.name, get_mean_bert_embedding(s.to_list())),
            return_dtype=pl.Array(pl.Float32, 768),
        )
    )
    .collect()
)

In [69]:
# Scale the estimated in-memory size of `out` (in MB) by 17661/100;
# presumably `out` is a 100-row sample and this extrapolates to the full
# 17661-row dataset — TODO confirm `out` holds the 100-row result when run.
out.estimated_size("mb") * (17661/100)

50.843319025039676