In [None]:
import os
import json
import logging
import pandas as pd
from typing import Dict, List
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.cluster import KMeans
from bertopic import BERTopic
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [None]:
class ModelComparison:
  def __init__(self, input_file="processed_data.json", output_dir="model_comparison"):
    self.input_file = input_file
    self.output_dir = output_dir
    self.viz_dir = os.path.join(output_dir, "visualizations")
    os.makedirs(self.viz_dir, exist_ok=True)
    self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    self.keybert = KeyBERT(model=self.sentence_model)
    
  def load_data(self) -> Dict[str, List[str]]:
    with open(self.input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    return {
        'darkweb': df[df['source'] == 'darkweb']['text'].tolist(),
        'reddit': df[df['source'] == 'reddit']['text'].tolist()
    }
    


In [16]:
modelcompare = ModelComparison()
data = modelcompare.load_data()


In [17]:
darkweb_data = data['darkweb']
reddit_data = data['reddit']

In [18]:
def run_BERTopic(texts, n_topics=5):
  if not texts:
    return None, [], [], None, None
  model = BERTopic()
  topics, probs = model.fit_transform(texts)
  return model, topics, probs

In [19]:
# model, topics, probs = run_BERTopic(darkweb_data)

In [20]:
from bertopic.representation import KeyBERTInspired

# Fine-tune your topic representations
representation_model = KeyBERTInspired()
topic_model = BERTopic(nr_topics="auto",representation_model=representation_model)
topics, probs = topic_model.fit_transform(darkweb_data)


embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("model_comparison/best_models/BERTopic_darkweb", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()
words = vectorizer.get_feature_names_out()
tokens = [tokenizer(doc) for doc in darkweb_data]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
print(coherence)

0.5096610903724429


In [21]:
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()
words = vectorizer.get_feature_names_out()
tokens = [tokenizer(doc) for doc in darkweb_data]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words_dark = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words_dark, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
print(coherence)

0.5096610903724429


In [22]:
representation_model = KeyBERTInspired()
topic_model = BERTopic(nr_topics="auto",representation_model=representation_model)
topics, probs = topic_model.fit_transform(reddit_data)

embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("model_comparison/best_models/BERTopic_reddit", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)

In [23]:
vectorizer = topic_model.vectorizer_model
tokenizer = vectorizer.build_tokenizer()
words = vectorizer.get_feature_names_out()
tokens = [tokenizer(doc) for doc in reddit_data]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words_reddit = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words_reddit, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence = coherence_model.get_coherence()
print(coherence)

0.6232101023586243


In [24]:
topic_words_reddit[0]

['hacking',
 'hacker',
 'airplay',
 'hacked',
 'apple',
 'vulnerability',
 'wormable',
 'leaked',
 'breach',
 'device']

In [29]:
topic_words_dark[4]

['copyright',
 'infringement',
 'copyrighted',
 'lawsuit',
 'piracy',
 'notice',
 'claim',
 'sue',
 'content',
 'court']

In [31]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model_st = SentenceTransformer("all-MiniLM-L6-v2")

# Embed the topic strings
embeddings_reddit = model_st.encode(topic_words_reddit, normalize_embeddings=True)
embeddings_darkweb = model_st.encode(topic_words_dark, normalize_embeddings=True)

In [36]:
# Compute cosine similarities between each Reddit and Darkweb topic
similarity_matrix = cosine_similarity(embeddings_reddit, embeddings_darkweb)

# For each Reddit topic, find the best-matching Darkweb topic
threshold = 0.5  # Adjust this threshold as needed
common_topics = []
unique_reddit = []
unique_darkweb = set(range(len(embeddings_darkweb)))  # Keep track of unmatched darkweb topics

reddit_topic_ids = list(range(len(embeddings_reddit)))
darkweb_topic_ids = list(range(len(embeddings_darkweb)))

for i, reddit_topic in enumerate(reddit_topic_ids):
    sims = similarity_matrix[i]
    max_sim_idx = np.argmax(sims)
    max_sim_val = sims[max_sim_idx]

    if max_sim_val >= threshold:
        darkweb_topic = darkweb_topic_ids[max_sim_idx]
        common_topics.append((reddit_topic, darkweb_topic, max_sim_val))
        unique_darkweb.discard(darkweb_topic)
    else:
        unique_reddit.append(reddit_topic)


In [37]:
print("Common Topics (Reddit ID, Darkweb ID, Similarity):")
for r, d, sim in common_topics:
    print(f"{r} <--> {d} | Sim: {sim:.2f}")
    print(f"Reddit: {topic_words_reddit[r]}")
    print(f"Darkweb: {topic_words_dark[d]}")
    print("----")

print("\nUnique Reddit Topics:")
for r in unique_reddit:
    print(f"{r}: {topic_words_reddit[r]}")

print("\nUnique Darkweb Topics:")
for d in unique_darkweb:
    print(f"{d}: {topic_words_dark[d]}")


Common Topics (Reddit ID, Darkweb ID, Similarity):
3 <--> 1 | Sim: 0.72
Reddit: ['tor', 'privacy', 'vpn', 'anonymity', 'surveillance', 'protect', 'torbased', 'onionbrowser', 'onion', 'traffic']
Darkweb: ['enforcement', 'privacy', 'analytics', 'google', 'safeguard', 'data', 'security', 'information', 'garante', 'leaked']
----
7 <--> 1 | Sim: 0.74
Reddit: ['privacy', 'privacyrelated', 'privacyguides', 'privacytools', 'privacytoolsio', 'freedom', 'cryptopartybln', 'intersection', 'transparent', 'technology']
Darkweb: ['enforcement', 'privacy', 'analytics', 'google', 'safeguard', 'data', 'security', 'information', 'garante', 'leaked']
----
9 <--> 2 | Sim: 0.76
Reddit: ['pirate', 'domain', 'torrent', 'thepiratebay', 'redirect', 'qbittorrent', 'tpb', 'website', 'bay', 'btguard']
Darkweb: ['proxy', 'piratebay', 'pirate', 'haproxy', 'thepiratebay', 'cloudflare', 'bay', 'load', 'website', 'cache']
----
11 <--> 13 | Sim: 0.64
Reddit: ['linux', 'linux4noobs', 'linuxmemes', 'gnu', 'linuxquestions'

## Model comparison

In [61]:
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [None]:
new_data = [list(str.split()) for str in reddit_data]
common_dictionary = Dictionary(new_data)
common_corpus = [common_dictionary.doc2bow(text) for text in new_data]
model = LdaModel(common_corpus, id2word=common_dictionary)
cm = CoherenceModel(model=model, texts=new_data, corpus=common_corpus, dictionary=common_dictionary, coherence='c_v')
coherence = cm.get_coherence()  # get coherence value
print(f"Reddit Coherence Score: {coherence}")

In [98]:
common_dictionary = Dictionary(new_data)
common_corpus = [common_dictionary.doc2bow(text) for text in new_data]

In [None]:
model = LdaModel(common_corpus, id2word=common_dictionary)

In [107]:
cm = CoherenceModel(model=model, texts=new_data, corpus=common_corpus, dictionary=common_dictionary, coherence='c_v')
coherence = cm.get_coherence()  # get coherence value
coherence

0.4477480710029231