<a href="https://colab.research.google.com/github/meiyee1010/gdp-dashboard/blob/main/To_achieve_RO_Include_structured_noise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 0) Force-reinstall NumPy, Gensim, scikit-learn, and NLTK to fix the dtype mismatch
!pip install --upgrade --force-reinstall --no-cache-dir numpy
!pip install --upgrade --force-reinstall --no-cache-dir gensim scikit-learn nltk

# 1) Imports (after reinstall)
import pandas as pd
import nltk

from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
import numpy as np
import time
import tracemalloc

# 2) Download NLTK data
#    'stopwords' provides the English stop-word list loaded via stopwords.words('english').
#    NOTE(review): 'punkt' is not referenced by any code visible in this notebook
#    (tokenization goes through gensim's simple_preprocess) — confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')


Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m258.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# English stop words held in a set for fast membership tests.
# Read as a module-level global by preprocess_texts().
stop_words = set(stopwords.words('english'))

In [None]:
# ========================
# 1) Preprocessing
# ========================
def preprocess_texts(texts):
    """Turn raw documents into stopword-free token lists.

    Every document is tokenized with gensim's ``simple_preprocess``
    (called with ``deacc=True``); tokens present in the module-level
    ``stop_words`` collection are then discarded.

    Args:
        texts: Iterable of document strings.

    Returns:
        A list containing one token list per input document, in input order.
    """
    return [
        [token for token in simple_preprocess(document, deacc=True)
         if token not in stop_words]
        for document in texts
    ]

def build_corpus(texts_tokens):
    """Create a Gensim ``Dictionary`` and the matching bag-of-words corpus.

    Args:
        texts_tokens: List of token lists, one per document.

    Returns:
        A ``(dictionary, corpus)`` tuple where ``corpus`` holds one
        ``dictionary.doc2bow(...)`` result per document, in input order.
    """
    vocab = Dictionary(texts_tokens)
    bow_corpus = list(map(vocab.doc2bow, texts_tokens))
    return vocab, bow_corpus


In [None]:
# ========================
# 2) Baseline LDA Runner
# ========================
from gensim.models import LdaMulticore

def run_baseline_lda(dictionary, corpus, texts_tokens, labels, num_topics=4, passes=10, iterations=50):
    """Train a plain ``LdaMulticore`` model and score it.

    The model is trained with a symmetric alpha, the library-default eta,
    ``chunksize=2000``, 4 workers and ``random_state=42``. Each document is
    then assigned to its highest-probability topic and those assignments are
    compared against ``labels``.

    Args:
        dictionary: Gensim dictionary passed as ``id2word``.
        corpus: Bag-of-words corpus (one entry per document).
        texts_tokens: Tokenized documents, used for ``c_v`` coherence.
        labels: Ground-truth label per document, aligned with ``corpus``.
        num_topics: Number of LDA topics.
        passes: Training passes over the corpus.
        iterations: Per-document inference iterations.

    Returns:
        dict with keys ``model``, ``coherence``, ``homogeneity``,
        ``completeness``, ``v_measure``, ``runtime_s`` (training time only)
        and ``mem_peak_mb`` (peak traced memory during training, in MB).

    NOTE(review): tracemalloc traces Python allocations of the current process;
    memory used inside LdaMulticore worker processes is presumably not
    included in ``mem_peak_mb`` — confirm before comparing memory figures.
    NOTE(review): run_hybrid_lda in this notebook trains with
    ``chunksize=len(corpus)`` while this runner uses 2000, so the two runs
    differ in more than the eta prior.
    """
    tracemalloc.start()
    try:
        # perf_counter is a monotonic clock, so the interval cannot be skewed
        # by system clock adjustments.
        start = time.perf_counter()
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=passes,
            iterations=iterations,
            chunksize=2000,
            workers=4,
            alpha='symmetric',
            random_state=42,
            eval_every=None
        )
        runtime = time.perf_counter() - start
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even if training raises, so a failed run does
        # not leave tracemalloc enabled for the rest of the session.
        tracemalloc.stop()

    # Assign topics to docs: pick the most probable topic per document.
    # minimum_probability=0.0 keeps every topic in the returned list, so max()
    # never sees an empty sequence (default filtering can drop all topics when
    # num_topics is large); the winning topic is unchanged otherwise.
    assigned = [
        max(model.get_document_topics(doc, minimum_probability=0.0),
            key=lambda x: x[1])[0]
        for doc in corpus
    ]

    # Coherence
    coh = CoherenceModel(model=model,
                         texts=texts_tokens,
                         dictionary=dictionary,
                         coherence='c_v').get_coherence()

    # Clustering metrics
    hom = homogeneity_score(labels, assigned)
    com = completeness_score(labels, assigned)
    vm  = v_measure_score(labels, assigned)

    return {
        'model': model,
        'coherence': coh,
        'homogeneity': hom,
        'completeness': com,
        'v_measure': vm,
        'runtime_s': runtime,
        'mem_peak_mb': peak / 1e6
    }

In [None]:
# ========================
# 3) Hybrid LDA Runner
# ========================
def run_hybrid_lda(dictionary,
                   corpus,
                   texts_tokens,
                   labels,
                   num_topics=4,
                   emb_size=100,
                   base_eta=0.01,
                   high_eta=0.1,
                   passes=5,
                   iterations=30):
    """
    Train an LDA model whose topic-word prior (eta) is seeded by Word2Vec clusters.

    1) Train Word2Vec on texts_tokens
    2) KMeans cluster the resulting embeddings into num_topics clusters
    3) Build a (num_topics x vocab_size) eta matrix: base_eta everywhere,
       high_eta at [cluster_of_word, word]
    4) Train LdaMulticore with that eta
    5) Compute and return all metrics

    Args:
        dictionary: Gensim dictionary passed as ``id2word``; its ids index the
            columns of the eta matrix.
        corpus: Bag-of-words corpus (one entry per document).
        texts_tokens: Tokenized documents; used for Word2Vec training and
            ``c_v`` coherence.
        labels: Ground-truth label per document, aligned with ``corpus``.
        num_topics: Number of LDA topics and of KMeans clusters.
        emb_size: Word2Vec vector size.
        base_eta: Prior weight for words outside a topic's cluster.
        high_eta: Prior weight for words inside a topic's cluster.
        passes: Training passes over the corpus.
        iterations: Per-document inference iterations.

    Returns:
        dict with keys ``model``, ``coherence``, ``homogeneity``,
        ``completeness``, ``v_measure``, ``runtime_s`` and ``mem_peak_mb``.

    NOTE(review): ``runtime_s`` and ``mem_peak_mb`` cover only the LDA training
    step; Word2Vec training and KMeans clustering run before measurement starts.
    NOTE(review): tracemalloc traces Python allocations of the current process;
    memory used inside LdaMulticore worker processes is presumably not
    included — confirm before comparing memory figures.
    """

    # --- a) Train Word2Vec ---
    # min_count=1 (was 2) so that every dictionary token has an embedding for
    # the lookup in step b).
    w2v = Word2Vec(
        sentences=texts_tokens,
        vector_size=emb_size,
        window=5,
        min_count=1,
        workers=4,
        seed=42
    )

    # --- b) Extract word vectors for every token in your dictionary ---
    vocab_size = len(dictionary)
    word_vecs = np.vstack([w2v.wv[dictionary[i]] for i in range(vocab_size)])

    # --- c) Cluster those vectors into num_topics clusters ---
    kmeans = KMeans(n_clusters=num_topics, random_state=0, n_init=10)
    kmeans.fit(word_vecs)
    word_to_topic = kmeans.labels_   # array of length vocab_size

    # --- d) Build eta matrix ---
    # Start from base_eta everywhere, then raise the single entry
    # (cluster_of_word, word) of every column to high_eta.
    eta = np.full((num_topics, vocab_size), base_eta, dtype=float)
    eta[word_to_topic, np.arange(vocab_size)] = high_eta

    # --- e) Train LdaMulticore with that custom eta ---
    tracemalloc.start()
    try:
        # perf_counter is a monotonic clock, so the interval cannot be skewed
        # by system clock adjustments.
        t0 = time.perf_counter()
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=passes,
            iterations=iterations,
            chunksize=len(corpus),
            workers=4,
            alpha='symmetric',  # multicore does NOT support 'auto'
            eta=eta,
            random_state=42,
            eval_every=None
        )
        runtime = time.perf_counter() - t0
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even if training raises, so a failed run does
        # not leave tracemalloc enabled for the rest of the session.
        tracemalloc.stop()

    # --- f) Assign each document to its top topic ---
    # minimum_probability=0.0 keeps every topic in the returned list, so max()
    # never sees an empty sequence (default filtering can drop all topics when
    # num_topics is large); the winning topic is unchanged otherwise.
    assigned = [
        max(model.get_document_topics(doc, minimum_probability=0.0),
            key=lambda x: x[1])[0]
        for doc in corpus
    ]

    # --- g) Compute coherence and clustering metrics ---
    coherence = CoherenceModel(
        model=model,
        texts=texts_tokens,
        dictionary=dictionary,
        coherence='c_v'
    ).get_coherence()
    homogeneity   = homogeneity_score(labels, assigned)
    completeness  = completeness_score(labels, assigned)
    v_measure     = v_measure_score(labels, assigned)

    return {
        'model': model,
        'coherence': coherence,
        'homogeneity': homogeneity,
        'completeness': completeness,
        'v_measure': v_measure,
        'runtime_s': runtime,
        'mem_peak_mb': peak / 1e6
    }

In [None]:
# ========================
# 4) Load & Prepare Datasets
# ========================
# 4a) Synthetic email corpus, generated here from fixed per-category templates.
#     Produces `data`: a list of {"subject", "text", "label"} records.

import random

# --- 1) Configuration & Templates ---

categories = {
    "work": [
        "Meeting scheduled for next Monday.",
        "Please send your project update.",
        "Reminder: Submit your timesheet by Friday.",
        "Team lunch on Wednesday at noon.",
        "Follow up on Q2 marketing results."
    ],
    "promotion": [
        "Exclusive offer: 50% off on all items!",
        "Summer sale starts now - don't miss it!",
        "Buy one get one free - limited time only.",
        "New arrivals in our store this week.",
        "Free shipping on orders over $50."
    ],
    "scam": [
        "Your account has been suspended. Click here to verify.",
        "Congratulations! You've won a lottery prize.",
        "Urgent: Update your banking details now.",
        "This is your final warning before account closure.",
        "You've received a secure message - view now."
    ],
    "news": [
        "Local football team wins championship.",
        "Weather alert: Heavy rain expected tomorrow.",
        "Community meeting scheduled for next week.",
        "Mayor announces new green initiative.",
        "City library to host book fair this Saturday."
    ]
}

TOTAL = 5000
PER_CAT = TOTAL // len(categories)    # 1250 per label

# Fixed seed so the generated dataset is the same on every run.
random.seed(42)

# --- 2) Precompute which indices get structural noise (~5%) ---
N_STRUCT = int(0.05 * TOTAL)        # 250
struct_noise_idxs = set(random.sample(range(TOTAL), k=N_STRUCT))

data = []
idx = 0
for label, templates in categories.items():
    for _ in range(PER_CAT):
        # a) Base subject & body
        subj = random.choice(templates)
        body = f"{subj} Please read the details and act accordingly."

        # b) 10% token-level noise: overwrite randomly chosen token positions
        #    with a token drawn from the same (partially overwritten) message.
        tokens = body.split()
        n_tok_noise = int(0.1 * len(tokens))
        for j in random.sample(range(len(tokens)), k=n_tok_noise):
            tokens[j] = random.choice(tokens)  # could also choose from full vocab
        text_noisy = " ".join(tokens)

        # c) **Structural noise** on exactly 5% of samples.
        #    The marker character is inserted into BOTH the subject and the
        #    text. Previously only `subj` was modified, after `text_noisy` had
        #    already been built, so the noise never reached the 'text' column
        #    that the topic models are trained on.
        #    The random draws (position, then marker) happen in the same order
        #    as before, so the 'subject' column and all later draws are unchanged.
        if idx in struct_noise_idxs:
            pos = random.randint(0, len(subj) - 1)
            marker = random.choice(['#', '@', '%'])
            subj = subj[:pos] + marker + subj[pos:]
            text_noisy = text_noisy[:pos] + marker + text_noisy[pos:]

        data.append({
            "subject": subj,
            "text":    text_noisy,
            "label":   label
        })
        idx += 1

# --- 3) Save to CSV ---
# Turn the generated records into a DataFrame and write it to disk (no index column).
# NOTE(review): "/content/..." is an absolute path — presumably the Colab working
# directory; confirm or make configurable before running elsewhere.
df_synth = pd.DataFrame(data)
synth_path = "/content/synthetic_email_dataset_5000.csv"
df_synth.to_csv(synth_path, index=False)
print(f"Saved synthetic CSV with {len(df_synth)} rows to {synth_path}")

# ========================
# 4b) Load & Preprocess
# ========================
def _load_and_prepare(csv_path):
    """Read one dataset CSV and derive everything the LDA runners need.

    Returns ``(frame, texts, tokens, dictionary, corpus, labels)`` where
    ``labels`` are the integer category codes of the 'label' column.
    """
    frame = pd.read_csv(csv_path)
    texts = frame['text'].tolist()
    tokens = preprocess_texts(texts)
    dictionary, corpus = build_corpus(tokens)
    labels = frame['label'].astype('category').cat.codes.tolist()
    return frame, texts, tokens, dictionary, corpus, labels


# Synthetic
df_synth, texts_s, tokens_s, dict_s, corpus_s, labels_s = _load_and_prepare(synth_path)

# Real (Kaggle)
df_kag, texts_k, tokens_k, dict_k, corpus_k, labels_k = _load_and_prepare(
    '/content/spam_ham_dataset.csv'
)

# Quick sanity check: first ten tokens of the first document in each dataset.
print("Synthetic tokens example:", tokens_s[0][:10])
print("Kaggle tokens example:   ", tokens_k[0][:10])

Saved synthetic CSV with 5000 rows to /content/synthetic_email_dataset_5000.csv
Synthetic tokens example: ['please', 'send', 'project', 'update', 'please', 'please', 'details', 'act', 'accordingly']
Kaggle tokens example:    ['subject', 'enron', 'methanol', 'meter', 'follow', 'note', 'gave', 'monday', 'preliminary', 'flow']


In [None]:
# preprocessing & corpus creation
# Rebuilds tokens, dictionary, bag-of-words corpus and integer label codes for
# both datasets from the already-loaded DataFrames (df_synth, df_kag).
# NOTE(review): this repeats the preprocessing already done in the
# "4b) Load & Preprocess" cell and overwrites the same variables with
# recomputed values — confirm whether this cell is still needed.
tokens_s = preprocess_texts(df_synth['text'].tolist())
dict_s, corpus_s = build_corpus(tokens_s)
labels_s = df_synth['label'].astype('category').cat.codes.tolist()

tokens_k = preprocess_texts(df_kag['text'].tolist())
dict_k, corpus_k = build_corpus(tokens_k)
labels_k = df_kag['label'].astype('category').cat.codes.tolist()


In [None]:
# ========================
# 5) Run Experiments
# ========================
# Both runners are executed on every dataset with the same topic count, number
# of passes and iteration budget; one summary row is recorded per run.
datasets = [
    ('Synthetic', (dict_s, corpus_s, tokens_s, labels_s)),
    ('Kaggle',    (dict_k, corpus_k, tokens_k, labels_k)),
]
shared_args = dict(num_topics=4, passes=10, iterations=100)

results = []
for name, (dictionary, corpus, tokens, labels) in datasets:
    inputs = dict(
        dictionary=dictionary,
        corpus=corpus,
        texts_tokens=tokens,
        labels=labels,
    )

    base = run_baseline_lda(**inputs, **shared_args)
    hyb = run_hybrid_lda(
        **inputs,
        emb_size=50,
        base_eta=0.01,
        high_eta=0.1,
        **shared_args,
    )

    # One row per (dataset, model) pair, baseline first.
    for tag, res in (('Baseline', base), ('Hybrid', hyb)):
        row = {'Dataset': name, 'Model': tag}
        row['Coherence'] = res['coherence']
        row['Homogeneity'] = res['homogeneity']
        row['Completeness'] = res['completeness']
        row['V-measure'] = res['v_measure']
        row['Runtime (s)'] = res['runtime_s']
        row['Memory Peak (MB)'] = res['mem_peak_mb']
        results.append(row)

df_results = pd.DataFrame(results)
print(df_results)

Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7fbe10136980>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1187, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen() error


     Dataset     Model  Coherence  Homogeneity  Completeness  V-measure  \
0  Synthetic  Baseline   0.309325     0.113982      0.165399   0.134959   
1  Synthetic    Hybrid   0.341003     0.277661      0.292826   0.285042   
2     Kaggle  Baseline   0.489284     0.441085      0.227000   0.299741   
3     Kaggle    Hybrid   0.506551     0.576864      0.337584   0.425918   

   Runtime (s)  Memory Peak (MB)  
0   184.044011          0.944616  
1   223.002790          2.948337  
2   414.240446         12.282879  
3   417.952264         17.009860  
