In [1]:
import os, sys

# Hugging Face configuration. Kept ahead of the library imports further down
# so the values are already in the environment when those libraries load.
os.environ["HF_DATASETS_OFFLINE"] = "0"
os.environ["HF_DATASETS_CACHE"] = "/tmp"     # points to temp dir
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_DATASETS_DISABLE_CACHING"] = "1"  # 👈 full off switch

# Location of the phrasely checkout. Set PHRASELY_ROOT to run this notebook on
# another machine; the original author's path remains the default.
project_root = os.environ.get("PHRASELY_ROOT") or "/home/michael/workspace/phrasely"
src_dir = os.path.join(project_root, "src")
# Keep the checkout's src/ first on sys.path, without stacking duplicate
# entries every time this cell is re-run.
if sys.path[:1] != [src_dir]:
    sys.path.insert(0, src_dir)
# Relative paths used later (e.g. "data_cache/...") resolve against the project root.
os.chdir(project_root)


import logging
import random
import warnings

from tqdm import TqdmWarning, tqdm

# Register the filters BEFORE the heavier imports below. A warning filter only
# affects warnings issued after it is installed, so filters placed after the
# imports cannot silence warnings raised while those modules are loading.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=TqdmWarning)

import matplotlib.pyplot as plt
import pandas as pd

from phrasely.pipeline import run_pipeline
from phrasely.data_loading.cc100_loader import CC100Loader
from phrasely.embeddings.phrase_embedder import PhraseEmbedder
from phrasely.data_loading.cc100_offline_loader import CC100OfflineLoader
from phrasely.reduction.visualization_reducer import VisualizationReducer

# Root logger at INFO so loggers without their own level pass INFO messages.
logging.getLogger().setLevel(logging.INFO)

  from tqdm.autonotebook import tqdm, trange


In [2]:
def setup_logger(name: str) -> logging.Logger:
    """Return the logger called ``name``, wiring it to stdout on first use.

    If the logger has no handlers yet, a ``StreamHandler`` writing to
    ``sys.stdout`` is attached with a bare ``"%(message)s"`` format. A logger
    that already has at least one handler is returned untouched, so calling
    this repeatedly does not add further handlers.

    Args:
        name: Name passed to ``logging.getLogger``.

    Returns:
        The ``logging.Logger`` instance for ``name``. Its level is not modified.
    """
    log = logging.getLogger(name)
    if log.handlers:
        # Already configured (e.g. the cell was re-run): nothing to add.
        return log

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(logging.Formatter("%(message)s"))
    log.addHandler(stdout_handler)
    return log

In [3]:
# Give the "phrasely" logger a stdout handler; the returned logger is the
# cell's last expression, so its repr is displayed.
setup_logger(name="phrasely")

<Logger phrasely (INFO)>

In [6]:
# Run the Phrasely pipeline over the locally cached CC100 Arrow shards.
result = run_pipeline(
    CC100OfflineLoader,
    loader_kwargs={
        "arrow_dir": "data_cache/cc100",
        # NOTE(review): empty language string — presumably "no language filter"; confirm against the loader.
        "language": "",
        "max_files": 20,
        "max_phrases": 100_000,
        "batch_size": 50_000,
    },
    stream=True,
    use_gpu=True,
    min_cluster_size=5,   # 👈 adjust here
    min_samples=2,        # 👈 optional fine-tuning
)

# Persist the run, then show its summary as the cell's displayed value.
result.save("data_cache/run_cc100_100k")
print("\n✅ End-to-end pipeline finished successfully!")
result.summary()

🚀 Starting Phrasely pipeline...
Detected GPU VRAM: 3.8 GB
Adaptive GPU limits — SVD: 190,000 rows, HDBSCAN: 190,000 rows.
▶️  Loading and embedding phrases...
PhraseEmbedder using model=paraphrase-MiniLM-L6-v2, device=cuda, VRAM≈3.8 GB, batch_size=8
Converted model to fp16 for reduced VRAM usage.
⚠️  Found 236 shards; limiting to first 20 to avoid memory overflow.
Streaming 20 chunks from data_cache/cc100
Yielding 50,000 rows from cc100-train-00000-00000-of-NNNNN.arrow (1/62)
Loading cached embeddings from data_cache/embeddings_paraphrase-MiniLM-L6-v2_f7bf034570992803f5b1fb054a25ccfd.npy
⚠️ Embedding size mismatch in batch 1: 20000 embeddings vs 50000 phrases. Truncating to smallest length.
Streamed batch 1: 20,000 phrases
Yielding 50,000 rows from cc100-train-00000-00000-of-NNNNN.arrow (2/62)
Loading cached embeddings from data_cache/embeddings_paraphrase-MiniLM-L6-v2_6d7e846e588113ccb2b999071a7f046f.npy
Streamed batch 2: 50,000 phrases
Yielding 50,000 rows from cc1

{'n_phrases': 100000,
 'n_clusters': 8,
 'n_medoids': 8,
 'embedding_dim': 384,
 'reduced_dim': 100}

In [9]:
from phrasely.evaluation.visualizer import plot_clusters_2d

# Project the pipeline's reduced vectors down to two dimensions for plotting.
viz_reducer = VisualizationReducer(
    method="umap",
    n_components=2,
    use_gpu=True,
    random_state=42,
)
viz_2d = viz_reducer.reduce(result.reduced)

# Standardise each axis for stable plotting; the small epsilon guards against
# dividing by zero when an axis has no spread.
axis_mean = viz_2d.mean(axis=0)
axis_std = viz_2d.std(axis=0) + 1e-9
viz_2d = (viz_2d - axis_mean) / axis_std

plot_clusters_2d(
    viz_2d,
    result.labels,
    texts=result.medoids,
    phrases=result.phrases,
    # NOTE(review): hard-coded value, not computed anywhere in this notebook —
    # replace with an evaluator's score (or drop it) before sharing the figure.
    dbcv_score=0.71,
    savepath="data_cache/clusters.png",
)

VisualizationReducer: method=umap, GPU=True, seed=42
⚠️  GPU UMAP may be nondeterministic even with a fixed random_state.
VisualizationReducer: reduced 100 → 2 dims, mean=[ 3.8703678e-08 -1.7868042e-07], std=[0.9999806 0.999987 ]
Clipped outliers: kept 99994 / 100000 points
Saved cluster plot to data_cache/clusters.png


In [10]:
from phrasely.medoids.medoid_selector import MedoidSelector
import numpy as np

# Toy sanity check: three 2-D vectors in a single cluster. "b" sits at roughly
# 45 degrees between "a" and "c", so it is the natural medoid under cosine distance.
phrases = list("abc")
embeddings = np.array(
    [
        [1.0, 0.0],
        [0.7071, 0.7071],
        [0.0, 1.0],
    ]
)
labels = np.array([0, 0, 0])

selector = MedoidSelector(metric="cosine", exact_threshold=10)
out = selector.select(phrases, embeddings, labels)

# Show both the container type and the value to document the return shape.
print(f"Type: {type(out)}")
print(f"Output: {out}")


Selected 1 medoids across 1 clusters.
Type: <class 'tuple'>
Output: ([1], ['b'])
