# CC100 1M GPU Pipeline Notebook
End-to-end pipeline that streams up to 1M phrases from CC100 shards on S3, embeds them, reduces dimensionality (SVD + UMAP), clusters with HDBSCAN, scores the clustering with DBCV, and selects cluster medoids.

In [None]:
import os, sys, logging
import pandas as pd
import numpy as np

from phrasely.data_loading.s3_loader import CC100S3Loader
from phrasely.embeddings.phrase_embedder import PhraseEmbedder
from phrasely.reduction.two_stage_reducer import TwoStageReducer
from phrasely.clustering.hdbscan_clusterer import HDBSCANClusterer
from phrasely.evaluation.dbcv_score import compute_dbcv
from phrasely.medoids.medoid_selector import MedoidSelector
from phrasely.utils.gpu_utils import is_gpu_available, get_device_info

# Route INFO-level log records to the notebook output so the pipeline
# stages below can report progress through the logging module.
logging.basicConfig(level=logging.INFO)
# Print what the project's GPU helpers report before the GPU-enabled
# stages (use_gpu=True / prefer_gpu=True) run further down.
print('GPU available:', is_gpu_available())
print('GPU info:', get_device_info())

## Load CC100 from S3

In [None]:
# Single source of truth for the phrase cap: it is passed to the loader and
# also enforced locally, so the two limits can never drift apart.
MAX_PHRASES = 1_000_000

# Streaming loader over the CC100 shards stored under s3://<bucket>/<prefix>.
loader = CC100S3Loader(
    bucket='phrasely-data-mastroianni',
    prefix='cc100/',
    language=None,
    max_files=None,
    batch_size=50_000,
    max_phrases=MAX_PHRASES,
)

# Accumulate phrases batch by batch until the cap is reached.
phrases = []
for df in loader.stream_load():
    # Validate with an explicit exception: `assert` statements are removed
    # when Python runs with -O, which would let a malformed batch through.
    if 'phrase' not in df.columns:
        raise KeyError(
            f"Expected a 'phrase' column in loader batch; got {list(df.columns)}"
        )
    # Trim the batch to the remaining budget before extending, instead of
    # over-filling the list and re-slicing the whole thing afterwards.
    remaining = MAX_PHRASES - len(phrases)
    batch = df['phrase'].tolist()
    phrases.extend(batch[:remaining])
    if len(phrases) >= MAX_PHRASES:
        break

# Cell output: number of phrases actually loaded (at most MAX_PHRASES).
len(phrases)

## Embedding

In [None]:
# Configure the project's phrase embedder with the MiniLM sentence-transformers
# checkpoint, a 4096-phrase batch size, and fp16 preferred.
embedder = PhraseEmbedder(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    batch_size=4096,
    prefer_fp16=True,
)

# Embed every loaded phrase in one call.
# NOTE(review): dataset_name presumably tags/caches the run — confirm in PhraseEmbedder.
embeddings = embedder.embed(phrases, dataset_name='cc100_1m')
# Cell output: shape of the embedding result.
embeddings.shape

## Dimensionality Reduction (SVD + UMAP)

In [None]:
# Configure the two-stage reducer: 320 SVD components and 48 UMAP components,
# with UMAP neighbourhood settings (n_neighbors, min_dist, cosine metric) and
# GPU execution requested.
# NOTE(review): svd_components=320 presumably must not exceed the embedding
# width — confirm against embeddings.shape from the previous cell.
reducer = TwoStageReducer(
    svd_components=320,
    umap_components=48,
    n_neighbors=30,
    min_dist=0.05,
    metric='cosine',
    use_gpu=True,
)

# Reduce the full embedding set; `reduced` feeds clustering and DBCV below.
reduced = reducer.reduce(embeddings)
# Cell output: shape of the reduced result.
reduced.shape

## Clustering (HDBSCAN GPU)

In [None]:
# HDBSCAN over the reduced vectors, with GPU execution requested.
clusterer = HDBSCANClusterer(use_gpu=True, min_cluster_size=30, min_samples=10)
labels = clusterer.cluster(reduced)

# Count distinct labels, leaving out -1 (the label counted as noise below).
distinct_labels = set(labels)
distinct_labels.discard(-1)
num_clusters = len(distinct_labels)

# Fraction of points whose label is -1.
noise_pct = float(np.mean(labels == -1))

# Cell output: (number of clusters, noise fraction).
(num_clusters, noise_pct)

## Cluster Quality (DBCV)

In [None]:
# Score the clustering with the project's DBCV helper, using the same reduced
# vectors that were clustered together with their labels.
score = compute_dbcv(reduced, labels)
# Cell output: the DBCV score.
score

## Medoid Selection

In [None]:
# Configure medoid selection with cosine distance and GPU preferred.
# NOTE(review): exact_threshold presumably is the cluster size up to which an
# exact medoid search is used — confirm in MedoidSelector.
# NOTE(review): return_indices=False presumably yields phrases rather than
# row indices — confirm in MedoidSelector.
selector = MedoidSelector(
    metric='cosine',
    exact_threshold=1500,
    prefer_gpu=True,
    return_indices=False,
)

# Selection is given the original embeddings, not the reduced vectors that
# were used for clustering, alongside the phrases and cluster labels.
medoids = selector.select(phrases, embeddings, labels)
# Cell output: number of medoids returned.
len(medoids)