In [None]:
# === Cell 1: imports & paths ===
import numpy as np
import pandas as pd
from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoModel
import umap
import hdbscan

# Root folders for input data and model artifacts (relative paths).
DATA_DIR, MODEL_DIR = Path("../data"), Path("../models")

# Parquet inputs located under DATA_DIR.
CORPUS_PATH = DATA_DIR.joinpath("df_corpus.parquet")
LABELED_PATH = DATA_DIR.joinpath("df_corpus_labeled.parquet")

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device


In [None]:
# === Cell 2: data and model ===
# Read the corpus and its labeled counterpart from their parquet files.
df_corpus, df_labeled = (pd.read_parquet(path) for path in (CORPUS_PATH, LABELED_PATH))

# Tokenizer and encoder are both loaded from the same SciBERT checkpoint name.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoder = AutoModel.from_pretrained(model_name)
encoder = encoder.to(device)

# Put the encoder in evaluation mode (last expression of the cell, so its value is displayed).
encoder.eval()


In [None]:
# === Cell 3: embedding function (CLS pooling) ===
@torch.no_grad()
def encode_texts(texts, batch_size=16, max_length=256):
    """Embed texts with the module-level ``tokenizer``/``encoder`` using CLS pooling.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, generator, ...).
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit passed to the tokenizer; longer inputs are truncated.

    Returns:
        A NumPy array with one row per input text, taken from sequence
        position 0 of the encoder's ``last_hidden_state``. Empty input yields a
        float32 array of shape ``(0, encoder.config.hidden_size)`` instead of
        raising.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialize once so len() and positional slicing work for any iterable.
    texts = list(texts)
    if not texts:
        # torch.cat() rejects an empty list, so return an empty matrix explicitly.
        return np.empty((0, encoder.config.hidden_size), dtype=np.float32)

    all_vecs = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)
        out = encoder(**enc)
        # CLS pooling: keep only the hidden state at the first sequence position.
        cls_vec = out.last_hidden_state[:, 0, :]
        # Move each batch to the CPU right away so results do not pile up on the device.
        all_vecs.append(cls_vec.cpu())
    return torch.cat(all_vecs, dim=0).numpy()

# Smoke test on a small subset of the corpus
sample_texts = df_corpus["text"].head(8)
test_emb = encode_texts(sample_texts)
test_emb.shape


In [None]:
# === Cell 4: embeddings for the whole corpus (cached on disk) ===
emb_path = DATA_DIR / "embeddings_scibert.npy"

# Reuse the cached array only when it has exactly one row per corpus document;
# a cache written for a different corpus cannot be aligned row-by-row with df_corpus.
embeddings = np.load(emb_path) if emb_path.exists() else None
if embeddings is not None and len(embeddings) != len(df_corpus):
    print(f"Ignoring stale cache {emb_path}: {len(embeddings)} rows vs {len(df_corpus)} documents")
    embeddings = None

if embeddings is None:
    # Expensive step: encode every document, then persist the result for later runs.
    embeddings = encode_texts(df_corpus["text"].tolist(), batch_size=16, max_length=256)
    np.save(emb_path, embeddings)

assert len(embeddings) == len(df_corpus), "embeddings must have one row per document"
embeddings.shape


In [None]:
# === Cell 5: UMAP dimensionality reduction ===
# Reducer settings gathered in one mapping; random_state is fixed at 42.
umap_params = {
    "n_neighbors": 15,
    "min_dist": 0.1,
    "n_components": 15,
    "metric": "cosine",
    "random_state": 42,
}
umap_reducer = umap.UMAP(**umap_params)

# Fit the reducer on the corpus embeddings and keep the reduced coordinates.
emb_umap = umap_reducer.fit_transform(embeddings)
emb_umap.shape


In [None]:
# === Cell 6: topic clustering with HDBSCAN ===
hdbscan_params = {
    "min_cluster_size": 50,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)

# Cluster the UMAP coordinates and store one label per document as topic_id.
cluster_labels = clusterer.fit_predict(emb_umap)
df_corpus["topic_id"] = cluster_labels

# Number of documents per topic_id.
df_corpus["topic_id"].value_counts()


In [None]:
# === Cell 7: import the seed technology list (tech_names) and anchor seeds to topics ===
# tech_names.py is expected to contain something like:
# TECH_SEEDS = ["quantum sensor", "autonomous driving", "solid-state battery", ...]
from tech_names import TECH_SEEDS
from sklearn.metrics.pairwise import euclidean_distances

# Embed the seed phrases and project them with the already-fitted UMAP reducer.
seed_emb = encode_texts(TECH_SEEDS, batch_size=8, max_length=32)
seed_umap = umap_reducer.transform(seed_emb)

# Simple anchoring: each seed -> nearest topic centroid.
# HDBSCAN labels noise points -1; they are not a topic, so they are excluded
# from the centroids and a seed can never be anchored to "noise".
NOISE_LABEL = -1
doc_topic_ids = df_corpus["topic_id"].to_numpy()
is_clustered = doc_topic_ids != NOISE_LABEL

topic_centers = (
    pd.DataFrame(np.asarray(emb_umap)[is_clustered])
    .assign(topic_id=doc_topic_ids[is_clustered])
    .groupby("topic_id")
    .mean()
)

topic_ids = topic_centers.index.values
topic_vecs = topic_centers.values

seed_to_topic = {}
if len(topic_ids) > 0:
    # Euclidean distance, i.e. the same metric HDBSCAN used to cluster emb_umap.
    # One (n_seeds, n_topics) matrix replaces the per-seed distance calls.
    dists = euclidean_distances(seed_umap, topic_vecs)
    nearest = dists.argmin(axis=1)
    seed_to_topic = {seed: int(topic_ids[j]) for seed, j in zip(TECH_SEEDS, nearest)}
else:
    # Every document was labeled as noise, so there is nothing to anchor to.
    print("No clusters found (all documents are noise); seed_to_topic is empty.")

seed_to_topic


In [None]:
# === Cell 8: assign names to topics ===
# Group the seeds anchored to each topic, preserving seed_to_topic's order.
seeds_by_topic = {}
for seed, tid in seed_to_topic.items():
    seeds_by_topic.setdefault(tid, []).append(seed)

# Mapping such as {topic_id: "Quantum Sensors"}; seeds that landed on the same
# topic are merged into one name separated by " | ".
topic_name_map = {tid: " | ".join(seeds) for tid, seeds in seeds_by_topic.items()}

# Topics without any anchored seed fall back to "UNKNOWN_TOPIC".
topic_names = df_corpus["topic_id"].map(topic_name_map)
df_corpus["topic_name"] = topic_names.fillna("UNKNOWN_TOPIC")

preview_cols = ["title", "source_type", "topic_id", "topic_name"]
df_corpus[preview_cols].head(20)


In [None]:
# === Cell 9: save ===
# Build the output path once and reuse it for both the write and the message.
topics_path = DATA_DIR / "df_corpus_with_topics.parquet"
df_corpus.to_parquet(topics_path, index=False)
print("Saved:", topics_path)
