In [1]:
# One-time environment setup, left commented out so "Run All" does not reinstall.
# Installs the cu12 build of cuML, using NVIDIA's package index as an extra source.
# NOTE(review): PEP 440 only allows the `.*` wildcard with `==` / `!=`, so the
# lower bound is written as a plain version here.
#! uv pip install --extra-index-url=https://pypi.nvidia.com "cuml-cu12>=25.4"

In [1]:
import polars as pl
import numpy as np
import cuml

In [2]:
# Load the movie table; per the displayed schema it carries IMDb ids, ratings,
# a JSON metadata string and a fixed-width float32 embedding per title.
df = pl.read_parquet("movie_data_plus_embeds_all.parquet")

df

tconst,startYear,numVotes,averageRating,json,embedding
str,i64,i64,f64,str,"array[f32, 768]"
"""tt0000009""",1894,224,5.4,"""{  ""title"": ""Miss Jerry"",  ""…","[-0.007815, -0.022642, … 0.005391]"
"""tt0000147""",1897,558,5.3,"""{  ""title"": ""The Corbett-Fitz…","[0.012021, 0.014255, … -0.015754]"
"""tt0000574""",1906,985,6.0,"""{  ""title"": ""The Story of the…","[-0.010052, -0.015825, … 0.040161]"
"""tt0000591""",1907,31,5.6,"""{  ""title"": ""The Prodigal Son…","[0.00765, 0.019661, … -0.010763]"
"""tt0000630""",1908,33,3.2,"""{  ""title"": ""Hamlet"",  ""genr…","[0.03492, 0.00301, … 0.027586]"
…,…,…,…,…,…
"""tt9915790""",2019,45,7.0,"""{  ""title"": ""Bobbyr Bondhura""…","[-0.008241, -0.024547, … -0.014563]"
"""tt9916160""",2019,52,6.2,"""{  ""title"": ""Drømmeland"",  ""…","[-0.014737, -0.035892, … 0.027569]"
"""tt9916190""",2020,263,3.6,"""{  ""title"": ""Safeguard"",  ""g…","[0.014303, -0.018036, … -0.008043]"
"""tt9916270""",2020,1507,5.8,"""{  ""title"": ""Il talento del c…","[0.02358, -0.024546, … 0.017486]"


In [3]:
# Pull the embedding column out as a NumPy matrix (rows = titles) to feed UMAP.
embeds = df.get_column("embedding").to_numpy()
embeds.shape

(242552, 768)

In [4]:
%%time
output_dims = 2

umap_fitted = cuml.UMAP(random_state=None,
                        init="random",
                        n_components=output_dims,
                        n_neighbors=100,
                        min_dist=0.0,
                        build_algo="brute_force_knn",  # required for high n_neighbors
                        n_epochs=100_000)

embeds_np = umap_fitted.fit_transform(embeds)

CPU times: user 6min 30s, sys: 3min 17s, total: 9min 48s
Wall time: 9min 46s


In [5]:
# Sanity check: the projection should keep one row per input embedding and have
# `output_dims` columns.
embeds_np.shape

(242552, 2)

In [6]:
# Show the raw (min, max) extent of each of the two projected axes; this reveals
# how far the layout sits from the origin before centering.
for axis in (0, 1):
    print(embeds_np[:, axis].min(), embeds_np[:, axis].max())

-5.356537 5.2130203
32.809303 52.773674


In [7]:
# Shift each axis so its mean sits at zero; the spread of the layout is unchanged.
x_raw, y_raw = embeds_np[:, 0], embeds_np[:, 1]
x_centered = x_raw - x_raw.mean()
y_centered = y_raw - y_raw.mean()

# Report the new (min, max) extent of each centered axis.
for centered_axis in (x_centered, y_centered):
    print(centered_axis.min(), centered_axis.max())

-5.342183 5.227374
-6.227566 13.736805


In [8]:
# Keep only the identifier and rating, then attach the centered 2-D coordinates
# as two new columns aligned row-for-row with the source frame.
id_and_rating = df.select("tconst", "averageRating")
df_embeds = id_and_rating.with_columns(x_2d=x_centered, y_2d=y_centered)

df_embeds

tconst,averageRating,x_2d,y_2d
str,f64,f32,f32
"""tt0000009""",5.4,0.985534,4.925121
"""tt0000147""",5.3,-0.46465,-0.505939
"""tt0000574""",6.0,1.141387,5.097034
"""tt0000591""",5.6,0.540034,4.332634
"""tt0000630""",3.2,-0.526112,1.901875
…,…,…,…
"""tt9915790""",7.0,-3.226949,-5.247654
"""tt9916160""",6.2,-0.256684,-1.698399
"""tt9916190""",3.6,0.991565,-1.813526
"""tt9916270""",5.8,1.471797,-1.534901


In [9]:
# Persist the id / rating / 2-D coordinate table so downstream work can reuse
# the projection without re-running the expensive UMAP cell.
df_embeds.write_parquet("imdb_embeddings_2d.parquet")