In [None]:
import os, sys

# Hugging Face settings, exported before the phrasely imports that follow.
os.environ["HF_DATASETS_OFFLINE"] = "0"
os.environ["HF_DATASETS_CACHE"] = "/tmp"  # points the dataset cache at the temp dir
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
# NOTE(review): intended as a full off switch for dataset caching -- confirm
# that the installed `datasets` release actually reads this variable.
os.environ["HF_DATASETS_DISABLE_CACHING"] = "1"

# Location of the phrasely checkout. Set PHRASELY_ROOT to run this notebook
# from another machine; the previous hard-coded path remains the default.
project_root = os.environ.get("PHRASELY_ROOT", "/home/michael/workspace/phrasely")
src_dir = os.path.join(project_root, "src")

# Keep src_dir first on sys.path without stacking a duplicate entry every
# time this cell is re-run.
if src_dir in sys.path:
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)

# Relative paths used later in the notebook are resolved from the project root.
os.chdir(project_root)


import random
import warnings
from tqdm import TqdmWarning, tqdm
import pandas as pd
from phrasely.pipeline import run_pipeline
from phrasely.data_loading.cc100_loader import CC100Loader
from phrasely.embeddings.phrase_embedder import PhraseEmbedder
from phrasely.data_loading.cc100_offline_loader import CC100OfflineLoader
import logging
from phrasely.reduction.visualization_reducer import VisualizationReducer
import matplotlib.pyplot as plt

# Let INFO-level records through the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Silence the warning categories this notebook does not want to see.
for ignored_category in (FutureWarning, TqdmWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

In [None]:
def setup_logger(name: str) -> logging.Logger:
    """Return the logger called ``name``, wired to stdout on first use.

    If the logger has no handlers yet, a ``StreamHandler`` writing to
    ``sys.stdout`` is attached with a message-only format. A logger that
    already has at least one handler is returned untouched, so calling this
    repeatedly does not add further handlers.

    Args:
        name: Logger name passed to ``logging.getLogger``.

    Returns:
        The ``logging.Logger`` registered under ``name``.
    """
    named_logger = logging.getLogger(name)

    # Already configured: nothing to attach.
    if named_logger.handlers:
        return named_logger

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(logging.Formatter("%(message)s"))
    named_logger.addHandler(stdout_handler)
    return named_logger

In [None]:
# Attach the stdout handler to the "phrasely" logger.
setup_logger(name="phrasely")

In [None]:
# Run the pipeline on locally stored CC-100 data, then save and summarise
# the result.
result = run_pipeline(
    CC100OfflineLoader,
    loader_kwargs={
        "arrow_dir": "data_cache/cc100",
        # NOTE(review): empty string -- presumably "no language filter";
        # confirm against CC100OfflineLoader.
        "language": "",
        "max_files": 20,
        "max_phrases": 100_000,
        "batch_size": 50_000,
    },
    stream=True,
    use_gpu=True,
    min_cluster_size=5,  # adjust here
    min_samples=2,  # optional fine-tuning
)

result.save("data_cache/run_cc100_100k")
# The check mark was stored as mis-decoded bytes; restored to the intended
# character.
print("\n✅ End-to-end pipeline finished successfully!")
result.summary()

In [None]:
from phrasely.evaluation.visualizer import plot_clusters_2d

# Project the pipeline's reduced vectors down to two components with UMAP.
viz_reducer = VisualizationReducer(
    method="umap",
    n_components=2,
    use_gpu=True,
    random_state=42,
)
viz_raw = viz_reducer.reduce(result.reduced)

# Normalize for stable plotting: subtract the mean and divide by the standard
# deviation taken along axis 0; the epsilon guards against a zero deviation.
axis_mean = viz_raw.mean(axis=0)
axis_scale = viz_raw.std(axis=0) + 1e-9
viz_2d = (viz_raw - axis_mean) / axis_scale

plot_options = {
    "texts": result.medoids,
    "phrases": result.phrases,
    # NOTE(review): hard-coded literal, not computed anywhere in this
    # notebook -- replace with a real evaluator score when one is available.
    "dbcv_score": 0.71,
    "savepath": "data_cache/clusters.png",
}
plot_clusters_2d(viz_2d, result.labels, **plot_options)

In [None]:
from phrasely.medoids.medoid_selector import MedoidSelector
import numpy as np

# Tiny hand-made input: three phrases, three 2-D vectors, all labelled 0.
phrases = list("abc")
embeddings = np.array(
    [
        [1.0, 0.0],
        [0.7071, 0.7071],
        [0.0, 1.0],
    ]
)
labels = np.zeros(3, dtype=int)

selector = MedoidSelector(metric="cosine", exact_threshold=10)
out = selector.select(phrases, embeddings, labels)

# Show the type of the returned object, then the object itself.
for caption, value in (("Type:", type(out)), ("Output:", out)):
    print(caption, value)
