In [1]:
from bertopic import BERTopic
import hdbscan 
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import ParameterGrid
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import itertools
import pandas as pd
import json
import os

# Move the working directory up one level before importing the project-local
# helper. Presumably `process_comments` lives in the parent directory and is
# found via the current working directory — verify against the repo layout.
# NOTE(review): this is relative to the kernel's current directory, so
# re-running the cell without a restart moves up one more level each time.
os.chdir('..')
from process_comments import preprocess_comment

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
# Sanity check: show the CUDA version reported by the installed PyTorch build.
print(torch.version.cuda)

11.8


In [3]:
def calculate_coherence_score(topic_words):
    """Return the gensim ``c_v`` coherence for a collection of topics.

    Parameters
    ----------
    topic_words : list of list of str
        One inner list of representative terms per topic.

    Returns
    -------
    The value produced by ``CoherenceModel.get_coherence()``.

    Notes
    -----
    The topic term lists are supplied both as ``topics`` and as the reference
    ``texts``, so the score is computed against the topic words themselves
    rather than against the original comments.
    """
    # Vocabulary built from the topic terms only.
    vocabulary = Dictionary(topic_words)

    return CoherenceModel(
        topics=topic_words,
        texts=topic_words,
        dictionary=vocabulary,
        coherence='c_v',
    ).get_coherence()

def calculate_jaccard_similarity(topic1, topic2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two term collections.

    Both arguments are converted to sets first, so duplicates and ordering are
    ignored. When both collections are empty the result is ``0``.
    """
    words_a = set(topic1)
    words_b = set(topic2)
    combined = words_a | words_b

    # Guard against dividing by zero when neither topic has any terms.
    if not combined:
        return 0

    shared = words_a & words_b
    return len(shared) / len(combined)

def calculate_average_jaccard_similarity(topics):
    """Return the mean pairwise Jaccard similarity across all topics.

    Every unordered pair of topics is scored with
    ``calculate_jaccard_similarity`` and the scores are averaged. With fewer
    than two topics there is nothing to compare and ``0`` is returned.
    """
    running_total = 0
    pair_count = 0

    # enumerate(..., start=1) leaves pair_count equal to the number of pairs seen.
    for pair_count, (left, right) in enumerate(itertools.combinations(topics, 2), start=1):
        running_total += calculate_jaccard_similarity(left, right)

    if pair_count == 0:
        return 0

    return running_total / pair_count

# Loaded sentence-embedding models, keyed by model name. The grid search calls
# topic_modeling once per parameter combination; caching avoids reloading the
# same weights on every call.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name='all-MiniLM-L6-v2'):
    """Return a cached SentenceTransformer, on CUDA when available, else on CPU."""
    if model_name not in _EMBEDDING_MODELS:
        # Imported locally so this helper does not rely on another cell
        # having imported torch first.
        import torch

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name, device=device)
    return _EMBEDDING_MODELS[model_name]


def topic_modeling(comments, min_cluster_size, min_samples, max_features, ngram_range,
                   top_n_comments=25):
    """Fit BERTopic on ``comments`` and score the resulting topics.

    Parameters
    ----------
    comments : sequence
        Raw comments. Each one is passed through ``preprocess_comment`` before
        fitting; the raw versions are what is returned in ``top_comments``.
    min_cluster_size, min_samples : int
        Forwarded to ``hdbscan.HDBSCAN``.
    max_features : int
        Forwarded to ``CountVectorizer``.
    ngram_range : tuple of (int, int)
        Forwarded to ``CountVectorizer``.
    top_n_comments : int, optional
        Maximum number of comments kept per topic (default 25).

    Returns
    -------
    tuple
        ``(coherence, average_jaccard, topics_list, top_comments)`` where
        ``topics_list`` maps ``"Topic N"`` to its non-empty terms and
        ``top_comments`` maps ``"Topic N"`` to the raw comments assigned to
        that topic with the highest probabilities. When no topic is found,
        ``coherence`` and ``average_jaccard`` are ``None`` and both dicts are
        empty.
    """
    processed_comments = [preprocess_comment(comment) for comment in comments]

    # HDBSCAN model for clustering
    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=0.1,
    )

    # BERTopic model with adjusted parameters; the embedding model is shared
    # across calls instead of being re-instantiated each time.
    topic_model = BERTopic(
        hdbscan_model=hdbscan_model,
        embedding_model=_get_embedding_model(),
        vectorizer_model=CountVectorizer(max_features=max_features, ngram_range=ngram_range),
    )

    # fit-transform the model
    topics, probs = topic_model.fit_transform(processed_comments)

    topics_list = {}
    topic_words = []
    top_comments = {}

    # Walk topic ids 0, 1, 2, ... until the model reports no such topic.
    topic_id = 0
    while True:
        topic = topic_model.get_topic(topic_id)
        if not topic:
            break

        # Drop empty-string terms from the topic representation.
        terms = [word for word, _ in topic if word != '']
        if not terms:
            break

        label = f"Topic {topic_id + 1}"
        topics_list[label] = terms
        topic_words.append(terms)

        # Rank the documents assigned to this topic by probability. sorted()
        # is stable, so ties keep their original document order.
        member_indices = [index for index, assigned in enumerate(topics) if assigned == topic_id]
        ranked_indices = sorted(member_indices, key=lambda index: probs[index], reverse=True)
        top_comments[label] = [comments[index] for index in ranked_indices[:top_n_comments]]

        topic_id += 1

    if not topics_list:
        return None, None, topics_list, top_comments

    return (
        calculate_coherence_score(topic_words),
        calculate_average_jaccard_similarity(topic_words),
        topics_list,
        top_comments,
    )

def find_best_topics(comments):
    """Grid-search topic-model settings and return the best-scoring result.

    Each parameter combination is run through ``topic_modeling`` and scored as
    ``coherence - jaccard`` (higher is better). The combination with the
    highest score wins; on ties the first one encountered is kept.

    Parameters
    ----------
    comments : sequence
        Raw comments, forwarded unchanged to ``topic_modeling``.

    Returns
    -------
    tuple
        ``(best_coherence, best_jaccard, best_params, best_topics,
        best_comments)``. If no combination produces any topics the result is
        ``(0, 0, None, {}, {})``.
    """
    param_grid = {
        'min_cluster_size': [2, 5],
        'min_samples': [2, 5],
        'max_features': [200, 500, 1000],
        'ngram_range': [(1, 3), (2, 4), (3, 5)]
    }

    # Start below any reachable score so the first valid result is accepted.
    best_score = float('-inf')
    best_coherence = 0
    best_jaccard = 0
    # Defined up front so the return below cannot hit unbound names when no
    # combination yields topics.
    best_params = None
    best_topics = {}
    best_comments = {}

    for params in ParameterGrid(param_grid):
        # The grid keys match topic_modeling's keyword names exactly.
        coherence, jaccard, topics_list, top_comments = topic_modeling(comments, **params)

        # topic_modeling signals "no topics" with None. Test for None
        # explicitly: a Jaccard of 0 (no overlap between topics) is a valid,
        # desirable value and must not be skipped for being falsy.
        if coherence is None or jaccard is None:
            continue

        score = coherence - jaccard
        if score > best_score:
            best_score = score
            best_coherence = coherence
            best_jaccard = jaccard
            best_topics = topics_list
            best_comments = top_comments
            best_params = params

    return best_coherence, best_jaccard, best_params, best_topics, best_comments

In [4]:
# NOTE: relative to the kernel's current directory; re-running this cell
# without restarting the kernel moves up one more level each time.
os.chdir('..')

# Comments by state
folder = 'csv_files/classified_comments_by_state'
# Sorted so states are processed (and metrics written) in a stable order;
# os.listdir makes no ordering guarantee.
files = sorted(os.listdir(folder))

# Make sure the output directories exist before any results are written.
os.makedirs('topic_modeling/BERTopic_method/topics_by_state', exist_ok=True)
os.makedirs('topic_modeling/BERTopic_method/comments_by_state', exist_ok=True)

coherence_jaccard = []

# Iterate through the files and find the best topics for each state
for file in files:

    # Skip anything that is not a per-state CSV (other file types,
    # sub-directories) as well as the explicit exclusion list.
    stem, extension = os.path.splitext(file)
    if extension.lower() != '.csv' or file == 'excluded_states.csv':
        continue

    state = stem
    state_data = pd.read_csv(f'{folder}/{file}')
    state_comment = state_data['Comment'].tolist()

    best_coherence, best_jaccard, best_params, best_topics, best_comments = find_best_topics(state_comment)
    coherence_jaccard.append(f'{state}: Coherence - {best_coherence}, Jaccard - {best_jaccard}, Params - {best_params}')

    # Write the results to a file
    with open(f'topic_modeling/BERTopic_method/topics_by_state/{state}.json', 'w') as f:
        json.dump(best_topics, f)

    # Write the top comments for each topic to a file
    with open(f'topic_modeling/BERTopic_method/comments_by_state/{state}.json', 'w') as f:
        json.dump(best_comments, f)

# Save the coherence and jaccard scores to file
with open('topic_modeling/BERTopic_method/detailed_metrics.txt', 'w') as f:
    f.write('\n'.join(coherence_jaccard))