In [1]:
#! uv pip install --extra-index-url=https://pypi.nvidia.com "cudf-cu12==24.12.*" "cuml-cu12==24.12.*"

In [2]:
import polars as pl
import numpy as np
import cuml

In [3]:
# Read the movie metadata together with the precomputed embedding vectors.
df = pl.read_parquet("movie_data_plus_embeds_all.parquet")

# Display the frame as the cell output.
df

tconst,startYear,numVotes,averageRating,json,embedding
str,i64,i64,f64,str,"array[f32, 768]"
"""tt0000009""",1894,220,5.3,"""{  ""title"": ""Miss Jerry"",  ""…","[-0.007815, -0.022642, … 0.005391]"
"""tt0000147""",1897,549,5.3,"""{  ""title"": ""The Corbett-Fitz…","[0.012021, 0.014255, … -0.015754]"
"""tt0000574""",1906,971,6.0,"""{  ""title"": ""The Story of the…","[-0.010052, -0.015825, … 0.040161]"
"""tt0000591""",1907,30,5.6,"""{  ""title"": ""The Prodigal Son…","[0.00765, 0.019661, … -0.010763]"
"""tt0000630""",1908,30,3.1,"""{  ""title"": ""Hamlet"",  ""genr…","[0.03492, 0.00301, … 0.027586]"
…,…,…,…,…,…
"""tt9915790""",2019,44,7.0,"""{  ""title"": ""Bobbyr Bondhura""…","[-0.008241, -0.024547, … -0.014563]"
"""tt9916160""",2019,52,6.2,"""{  ""title"": ""Drømmeland"",  ""…","[-0.014737, -0.035892, … 0.027569]"
"""tt9916190""",2020,262,3.6,"""{  ""title"": ""Safeguard"",  ""g…","[0.017858, -0.018587, … -0.007212]"
"""tt9916270""",2020,1501,5.8,"""{  ""title"": ""Il talento del c…","[0.02358, -0.024546, … 0.017486]"


In [4]:
# Materialize the embedding column as a NumPy array for the UMAP step.
embeds = df.get_column("embedding").to_numpy()
embeds.shape

(238628, 768)

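Workaround for a cuML UMAP bug (see https://github.com/rapidsai/cuml/issues/5707#issuecomment-2618102235): the `UMAP` class below assembles the embedding from cuML's lower-level functions and umap-learn's `spectral_layout` instead of calling `cuml.UMAP` directly.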

In [5]:
from cuml.manifold.umap import fuzzy_simplicial_set, simplicial_set_embedding
from umap.spectral import spectral_layout
from cuml.manifold.umap_utils import find_ab_params

class UMAP:
    """Small UMAP pipeline assembled from lower-level cuML / umap-learn functions.

    ``fit_transform`` builds the fuzzy simplicial set, computes a spectral
    layout to use as the initial positions, and then optimizes the embedding
    starting from that layout.

    Parameters
    ----------
    n_epochs : int
        Number of optimization epochs forwarded to ``simplicial_set_embedding``.
    n_neighbors : int
        Neighborhood size forwarded to ``fuzzy_simplicial_set``.
    n_components : int
        Dimensionality of the output embedding.
    min_dist : float, default 0.1
        Used together with ``spread`` to derive the ``a``/``b`` curve
        parameters via ``find_ab_params``.
    random_state : int or None, default None
        Seed forwarded to every stage. When ``None`` a seed in ``[1, 1000)``
        is drawn for this instance and stored on ``self.random_state`` so the
        run can be repeated afterwards.
    verbose : bool, default False
        Verbosity flag forwarded to the cuML calls.
    spread : float, default 1.0
        Passed to ``find_ab_params`` alongside ``min_dist``.
    """

    def __init__(self, n_epochs, n_neighbors, n_components, min_dist=0.1,
                 random_state=None, verbose=False, spread=1.0):
        # A default of ``np.random.randint(...)`` in the signature would be
        # evaluated only once, when the class is defined, so every instance
        # would silently share the same seed. Draw it per instance instead.
        if random_state is None:
            random_state = int(np.random.randint(1, 1000))

        self.n_epochs = n_epochs
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.min_dist = min_dist
        self.random_state = random_state
        self.verbose = verbose
        self.spread = spread

    def fit_transform(self, X):
        """Embed ``X`` into ``n_components`` dimensions.

        Returns the object produced by ``simplicial_set_embedding``.
        """
        graph = fuzzy_simplicial_set(
            X,
            n_neighbors=self.n_neighbors,
            random_state=self.random_state,
            metric='euclidean',
            verbose=self.verbose,
        )

        # ``spectral_layout`` comes from umap-learn, so the graph is converted
        # to CSR and fetched with ``.get()`` (presumably a device-to-host copy
        # of a CuPy sparse matrix -- TODO confirm).
        # NOTE(review): umap-learn documents ``random_state`` here as a NumPy
        # RandomState-like object; an int seed is passed through unchanged to
        # preserve the existing behavior -- confirm against the installed version.
        layout = spectral_layout(
            X,
            graph.tocsr().get(),
            dim=self.n_components,
            random_state=self.random_state,
        )

        # Curve parameters derived from spread / min_dist.
        a, b = find_ab_params(self.spread, self.min_dist)

        # Optimize the embedding starting from the spectral layout.
        embedding = simplicial_set_embedding(
            X,
            graph,
            init=layout,
            a=a,
            b=b,
            n_epochs=self.n_epochs,
            n_components=self.n_components,
            random_state=self.random_state,
            verbose=self.verbose,
        )

        return embedding


In [6]:
%%time
# Dimensionality of the projected embedding.
output_dims = 2

# Explicit seed so that re-running the notebook feeds the same seed to every
# stage of the UMAP pipeline instead of an unrecorded random one.
RANDOM_SEED = 42

umap_fitted = UMAP(
    n_components=output_dims,
    n_neighbors=30,
    min_dist=0.0,
    n_epochs=10_000,
    random_state=RANDOM_SEED,
)

# Fit on the full embedding matrix and keep the projected coordinates.
embeds_t = umap_fitted.fit_transform(embeds)

CPU times: user 4min 54s, sys: 1min 10s, total: 6min 4s
Wall time: 2min 12s


In [7]:
# Convert the UMAP result and fetch it with .get() so that the rest of the
# notebook works on a plain array.
embeds_np = (
    embeds_t
    .to_output()
    .get()
)
embeds_np.shape

(238628, 2)

In [8]:
# Raw extent of each projected axis (x on the first line, y on the second).
print(np.min(embeds_np[:, 0]), np.max(embeds_np[:, 0]))
print(np.min(embeds_np[:, 1]), np.max(embeds_np[:, 1]))

-13.643234 1.1678486
4.845169 19.638477


In [9]:
# Shift each axis by its own mean so the point cloud is centered on the origin.
x_centered = np.subtract(embeds_np[:, 0], np.mean(embeds_np[:, 0]))
y_centered = np.subtract(embeds_np[:, 1], np.mean(embeds_np[:, 1]))

# Extent of each axis after centering.
print(np.min(x_centered), np.max(x_centered))
print(np.min(y_centered), np.max(y_centered))

-8.503075 6.3080077
-9.060875 5.7324333


In [10]:
# Keep the identifier and rating columns and attach the centered 2-D coordinates.
df_embeds = df.select("tconst", "averageRating").with_columns(
    x_2d=x_centered,
    y_2d=y_centered,
)

df_embeds

tconst,averageRating,x_2d,y_2d
str,f64,f32,f32
"""tt0000009""",5.3,5.200502,1.923615
"""tt0000147""",5.3,5.155415,1.243568
"""tt0000574""",6.0,5.139497,1.886518
"""tt0000591""",5.6,2.645668,1.936699
"""tt0000630""",3.1,3.535955,-0.061966
…,…,…,…
"""tt9915790""",7.0,-5.629279,-4.07938
"""tt9916160""",6.2,-1.883669,0.135609
"""tt9916190""",3.6,0.422619,0.121901
"""tt9916270""",5.8,-2.568947,-0.260567


In [11]:
# Persist the 2-D coordinates together with the id and rating columns.
df_embeds.write_parquet(file="imdb_embeddings_2d.parquet")