## Part I.

In [None]:
import os
import json
import pickle
import numpy as np
import torch
from sentence_transformers import SentenceTransformer


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
def load_and_cache_jsonl(file_path, fields, cache_path):
    """Extract selected fields from a JSON Lines file, caching the result.

    If ``cache_path`` already exists it is unpickled and returned as-is. The
    cache is not validated against ``file_path`` or ``fields``, so delete it
    after changing either of them.

    Args:
        file_path: Path to a UTF-8 JSON Lines file (one JSON object per line).
        fields: Names of the keys to pull out of every object.
        cache_path: Path of the pickle file used as the cache.

    Returns:
        A dict mapping each field name to a list holding one entry per
        record, in file order. A record lacking a field contributes ``None``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if os.path.exists(cache_path):
        print(f"Loading cached data from '{cache_path}'...")
        # NOTE: pickle can execute arbitrary code on load; only point this at
        # cache files written by this notebook.
        with open(cache_path, 'rb') as cache_in:
            return pickle.load(cache_in)

    print(f"Reading '{file_path}' and extracting fields {fields}...")
    data = {field: [] for field in fields}
    with open(file_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = json.loads(line)
            for field in fields:
                data[field].append(obj.get(field))

    # Close the handle explicitly so the cache is fully flushed to disk
    # before anything tries to read it back.
    with open(cache_path, 'wb') as cache_out:
        pickle.dump(data, cache_out)
    print(f"Cached extracted data to '{cache_path}'")
    return data


# --- Training corpus: read (or load from cache) the fields used for fitting ---
train_fields = ["text", "label"]
train_cache = "train_data.pkl"
train_data = load_and_cache_jsonl("corpus.jsonl", train_fields, train_cache)
texts, labels = train_data["text"], train_data["label"]
print(f"Loaded {len(texts)} training documents.")


# --- Query documents: same loader, with the extra "da_label" field ---
new_fields = ["text", "da_label", "label"]
new_cache = "new_data.pkl"
new_data = load_and_cache_jsonl("query.jsonl", new_fields, new_cache)
# Unpack the columns in the same order as they are listed in new_fields.
new_texts, new_da_labels, new_labels = (new_data[key] for key in new_fields)
print(f"Loaded {len(new_texts)} new documents.")

In [None]:
from keybert import KeyBERT

# Build (or load from cache) the keyword vocabulary that is later handed to
# the CountVectorizer as a fixed vocabulary.
vocab_cache = "vocabulary.pkl"

if os.path.exists(vocab_cache):
    with open(vocab_cache, 'rb') as f:
        vocabulary = pickle.load(f)
    print(f"Loaded vocabulary ({len(vocabulary)} terms) from cache.")
else:
    kw_model = KeyBERT(model="all-MiniLM-L6-v2")
    # With a list of documents, extract_keywords yields one list of
    # (term, score) pairs per document.
    keyword_results = kw_model.extract_keywords(texts, stop_words='english')
    # De-duplicate through a set, then sort: plain set iteration order depends
    # on string hashing, which would give the vectorizer a different column
    # order on every fresh run.
    vocabulary = sorted({term for doc_kw in keyword_results for term, _ in doc_kw})
    with open(vocab_cache, 'wb') as f:
        pickle.dump(vocabulary, f)
    print(f"Extracted and cached {len(vocabulary)} vocabulary terms.")

In [None]:
# Sentence-embedding model shared by the embedding helper and BERTopic.
model_name = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(model_name)
# Only move the model explicitly when a CUDA device was detected.
embedding_model = embedding_model.to(device) if device == "cuda" else embedding_model

def get_embeddings(texts, cache_path=None, batch_size=512):
    """Return sentence embeddings for ``texts``, optionally cached on disk.

    Encoding is delegated to the module-level ``embedding_model``.

    Args:
        texts: Sized sequence of documents to encode.
        cache_path: Optional path of a NumPy ``.npy`` file. When the file
            exists and holds one row per text it is loaded instead of
            encoding; otherwise freshly computed embeddings are written to
            exactly this path.
        batch_size: Batch size forwarded to ``embedding_model.encode``.

    Returns:
        The cached array, or whatever ``embedding_model.encode`` returns.
    """
    if cache_path and os.path.exists(cache_path):
        print(f"Loading embeddings from {cache_path}...")
        cached = np.load(cache_path)
        # A cache built from a different corpus would silently misalign
        # documents and vectors downstream, so only trust a matching row count.
        if cached.shape[:1] == (len(texts),):
            return cached
        print(
            f"Cached embeddings have shape {cached.shape} but {len(texts)} "
            "texts were given; recomputing..."
        )

    print(f"Computing embeddings for {len(texts)} texts...")
    embs = embedding_model.encode(
        texts, batch_size=batch_size, show_progress_bar=True
    )
    if cache_path:
        print(f"Saving embeddings to {cache_path}...")
        # Write through an open handle: np.save appends ".npy" to string
        # paths lacking it, which would make the exists() check above miss
        # the cache forever for an extension-less cache_path.
        with open(cache_path, "wb") as cache_out:
            np.save(cache_out, embs)
    return embs

# Embed the training corpus once; later runs reuse the .npy cache.
embeddings_file = "tdt_embeddings.npy"
train_embeddings = get_embeddings(
    texts,
    cache_path=embeddings_file,
    batch_size=512,
)

In [None]:
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

# Dimensionality reduction applied to the embeddings before clustering.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    metric="cosine",
    min_dist=0.0,
    random_state=42,
)

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_samples=50,
    min_cluster_size=100,
    prediction_data=True,
    gen_min_span_tree=True,
)

# Bag-of-words over the fixed keyword vocabulary built earlier.
# NOTE(review): scikit-learn documents `min_df` as ignored when a fixed
# `vocabulary` is supplied, so min_df=20 presumably has no effect here --
# confirm whether frequency filtering was intended.
vectorizer_model = CountVectorizer(
    vocabulary=vocabulary,
    ngram_range=(1, 3),
    stop_words="english",
    min_df=20,
)

# Class-based TF-IDF used to weight terms per topic.
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=True,
    bm25_weighting=True,
)

In [37]:
# Re-create the sentence-embedding model that is handed to BERTopic below.
# NOTE(review): this rebinds the `embedding_model` created in the embedding
# cell and skips that cell's explicit `.to(device)` step -- confirm the
# re-instantiation is intentional (e.g. to run this cell after a restart).
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
from bertopic import BERTopic

print("Training BERTopic model...")
# Assemble the topic model from the components configured in earlier cells.
topic_model = BERTopic(
    verbose=True,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
)
# Pass the precomputed embeddings alongside the raw documents.
topics, probs = topic_model.fit_transform(
    texts,
    embeddings=train_embeddings,
)
print("Model training completed.")

In [None]:
# Persist the per-document topic assignments so Part II can reload them
# without refitting the model.
with open("train_topics.pkl", mode="wb") as topics_file:
    pickle.dump(topics, topics_file)

print("Saved topic assignments to 'train_topics.pkl'")

In [None]:
# Export the topic overview table and report how many real topics were found.
topic_info = topic_model.get_topic_info()
topic_info[['Topic', 'Count', 'Name']].to_csv("topic_info.csv", index=False)
# Count rows whose id is not the outlier topic (-1) instead of assuming an
# outlier row always exists; `len(topic_info) - 1` undercounts by one when
# no document was labelled as an outlier.
n_topics = int((topic_info["Topic"] != -1).sum())
print(f"Generated {n_topics} topics (excluding outliers).")

In [None]:
# Write the fitted model to the "tdt_topic_model" path in safetensors format,
# including the c-TF-IDF data; the embedding model is recorded by its name.
topic_model.save(
    "tdt_topic_model",
    save_ctfidf=True,
    save_embedding_model="all-MiniLM-L6-v2",
    serialization="safetensors",
)
print("Saved BERTopic model to 'tdt_topic_model'")

## Part II.

In [None]:
# Reload the artifacts persisted in Part I so this part can run on its own.
topic_model = BERTopic.load("tdt_topic_model")
print("Loaded BERTopic model.")

# NOTE: pickle can execute arbitrary code on load; these files are expected
# to be the caches written by Part I of this notebook.
# Context managers close each handle instead of leaking it.
with open("train_data.pkl", "rb") as f:
    train_data = pickle.load(f)
train_texts = train_data["text"]
train_labels = train_data["label"]

with open("new_data.pkl", "rb") as f:
    new_data = pickle.load(f)
new_texts = new_data["text"]
new_da_labels = new_data["da_label"]
new_labels = new_data["label"]

train_embeddings = np.load("tdt_embeddings.npy")
with open("train_topics.pkl", "rb") as f:
    train_topics = pickle.load(f)

print(f"Training set: {len(train_texts)} docs, " f"New set: {len(new_texts)} docs")
print(f"train_embeddings.shape = {train_embeddings.shape}")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Encode the query documents without caching, then let the fitted model
# assign a topic to each of them from those embeddings.
new_embeddings = get_embeddings(
    new_texts,
    cache_path=None,
    batch_size=512,
)
new_topics, new_probs = topic_model.transform(
    new_texts,
    embeddings=new_embeddings,
)
print("Predicted topics for new documents.")
print(f"new_embeddings.shape = {new_embeddings.shape}")

# Dense matrix of cosine similarities: one row per new document, one column
# per training document.
sim_matrix = cosine_similarity(new_embeddings, train_embeddings)
print(f"Computed similarity matrix of shape {sim_matrix.shape}")

In [None]:
# Sanity check: print the sizes of every array involved in the retrieval step.
for caption, value in (
    ("len(train_texts) =", len(train_texts)),
    ("len(new_texts)   =", len(new_texts)),
    ("train_embeddings.shape =", train_embeddings.shape),
    ("new_embeddings.shape   =", new_embeddings.shape),
    ("sim_matrix.shape       =", sim_matrix.shape),
):
    print(caption, value)

In [None]:
# Lookup table from integer topic id to the model's topic name.
topic_info = topic_model.get_topic_info()[["Topic", "Name"]]
topic_to_name = {
    int(topic_id): topic_name
    for topic_id, topic_name in zip(topic_info["Topic"], topic_info["Name"])
}

In [None]:
# For every new document, record its predicted topic and the TOP_K most
# similar training documents that were assigned the same topic.
results = []
TOP_K = 10

# `train_topics` is unpickled as-is and may be a plain Python list; convert it
# once so that `== topic` is an elementwise comparison regardless of whether
# the topic id is a Python int or a NumPy scalar.
train_topic_ids = np.asarray(train_topics)

for i, (topic, prob) in enumerate(zip(new_topics, new_probs)):
    topic = int(topic)
    rec = {
        "id": i,
        "theme_name": None,
        "theme_prob": float(prob),
        "da_label": new_da_labels[i],
        "label": new_labels[i],
        "text": new_texts[i],
        # Key name kept as "top_10" because downstream cells read it.
        "top_10": []
    }
    # Topic -1 is the outlier bucket: no neighbours are recommended for it.
    if topic == -1:
        rec["theme_name"] = "Outlier"
        results.append(rec)
        continue

    rec["theme_name"] = topic_to_name.get(topic, str(topic))
    same_idxs = np.flatnonzero(train_topic_ids == topic)
    if same_idxs.size:
        sims = sim_matrix[i, same_idxs]
        # Descending by similarity, then truncate. Slicing after the reversal
        # also behaves correctly for TOP_K == 0, whereas `[-0:]` would select
        # every candidate.
        order = np.argsort(sims)[::-1][:TOP_K]
        for idx, sim in zip(same_idxs[order], sims[order]):
            rec["top_10"].append({
                "train_doc_id": int(idx),
                "similarity": float(sim),
                "label": train_labels[idx],
                "text": train_texts[idx]
            })
    results.append(rec)

print(f"Built top-{TOP_K} recommendations for {len(results)} new docs.")

In [None]:
# Human-readable dump of every record, neighbours included.
with open("new2train_top10_pretty.json", "w", encoding="utf-8") as pretty_file:
    json.dump(results, pretty_file, ensure_ascii=False, indent=2)


# Flat JSON Lines file: one line per (new document, retrieved training
# document) pair, carrying the new document's da_label and the training
# document's label and text.
with open("new2train_pairs.jsonl", "w", encoding="utf-8") as fout:
    fout.writelines(
        json.dumps(
            {
                "da_label": rec["da_label"],
                "score": top["similarity"],
                "label": top["label"],
                "text": top["text"],
            },
            ensure_ascii=False,
        ) + "\n"
        for rec in results
        for top in rec["top_10"]
    )

print("Saved 'new2train_top10_pretty.json' and 'new2train_pairs.jsonl'.")

In [None]:
# Keep only the pairs whose similarity score is strictly above THRESHOLD,
# copying the matching lines verbatim into a second JSON Lines file.
THRESHOLD = 0.8
kept = 0
with open("new2train_pairs.jsonl", "r", encoding="utf-8") as fin, \
     open("new2train_filtered.jsonl", "w", encoding="utf-8") as fout:
    for raw_line in fin:
        score = json.loads(raw_line)["score"]
        # Written as `not (score > T)` so a NaN score is skipped, exactly as
        # a plain `score > T` test would skip it.
        if not (score > THRESHOLD):
            continue
        fout.write(raw_line)
        kept += 1

print(f"Filtered pairs with score > {THRESHOLD}: kept {kept} records.")