# In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def train_nmf_model(data, num_topics):
    """Fit a bag-of-words NMF topic model on the lemmatized text column.

    Args:
        data: Table-like object whose ``'Text_Lemmatized'`` column holds one
            document string per row.
        num_topics: Number of NMF components (topics) to extract.

    Returns:
        tuple: ``(fitted NMF model, vocabulary as a plain list, document-term
        count matrix)``. The vocabulary list is in the column order of the
        returned matrix.
    """
    documents = data['Text_Lemmatized'].tolist()

    bag_of_words = CountVectorizer()
    doc_term_matrix = bag_of_words.fit_transform(documents)

    # Fixed seed keeps the factorization reproducible between runs.
    factorizer = NMF(n_components=num_topics, random_state=42)
    factorizer.fit(doc_term_matrix)

    # Hand back a list rather than an ndarray so callers can use list methods.
    vocabulary = bag_of_words.get_feature_names_out().tolist()
    return factorizer, vocabulary, doc_term_matrix


def calculate_jaccard_similarity(topic_words1, topic_words2):
    """Return the Jaccard index (|A ∩ B| / |A ∪ B|) of two word collections.

    Duplicates within either collection are ignored because both are
    converted to sets first.

    Args:
        topic_words1: Iterable of hashable items (e.g. words).
        topic_words2: Iterable of hashable items (e.g. words).

    Returns:
        float: Overlap ratio in ``[0, 1]``; ``0.0`` when both inputs are empty.
    """
    first, second = set(topic_words1), set(topic_words2)
    combined = first | second
    if not combined:
        # Both collections empty: define the similarity as 0 instead of 0/0.
        return 0.0
    shared = first & second
    return len(shared) / len(combined)

def calculate_topic_diversity(lda_model):
    """Return the mean pairwise Jaccard similarity of the topics' top-10 words.

    NOTE: despite the function name, the value is an *overlap* measure:
    0.0 means no two topics share a top word, and larger values mean the
    topics share more of their top words.

    Args:
        lda_model: Model object exposing ``num_topics``,
            ``get_topic_terms(topic, topn=...)`` yielding
            ``(word_id, weight)`` pairs, and an ``id2word`` mapping from
            word id to word (presumably a gensim LDA model -- confirm
            against the caller).

    Returns:
        float: Average Jaccard similarity over all unordered topic pairs,
        or ``0.0`` when there are fewer than two topics.
    """
    # Resolve every topic's top words once, rather than once per topic pair.
    top_words_per_topic = [
        [lda_model.id2word[word_id]
         for word_id, _ in lda_model.get_topic_terms(topic, topn=10)]
        for topic in range(lda_model.num_topics)
    ]

    pair_scores = []
    for position, words_a in enumerate(top_words_per_topic):
        # Only compare with later topics so each unordered pair is seen once.
        for words_b in top_words_per_topic[position + 1:]:
            pair_scores.append(calculate_jaccard_similarity(words_a, words_b))

    if not pair_scores:
        return 0.0
    return sum(pair_scores) / len(pair_scores)

def calculate_topic_coherence_and_dominance(nmf_model, df, feature_names, top_n=10):
    """Score each NMF topic's coherence and its share of dominant documents.

    Coherence of a topic is the sum, over every unordered pair of its
    ``top_n`` highest-weighted terms, of the cosine similarity between the
    two terms' document-occurrence vectors (columns of the document-term
    count matrix). Dominance of a topic is the fraction of documents whose
    largest NMF activation belongs to that topic.

    Args:
        nmf_model: Fitted NMF model whose ``components_`` columns follow the
            order of ``feature_names``.
        df: Table with a ``'Text_Lemmatized'`` column of document strings.
        feature_names: Vocabulary list in the model's column order.
        top_n: Number of top terms per topic used for coherence
            (default 10, the value previously hard-coded).

    Returns:
        tuple: ``(coherence_scores, topic_dominance_scores)`` where
        ``coherence_scores`` is a list with one float per topic and
        ``topic_dominance_scores`` maps topic index to the fraction of
        documents dominated by that topic.

    Raises:
        ValueError: If ``df`` contains no documents.
    """
    documents = df['Text_Lemmatized'].tolist()
    if not documents:
        raise ValueError('df contains no documents to score')
    n_docs = len(documents)

    topic_terms = nmf_model.components_
    n_topics = topic_terms.shape[0]

    # Pin the vocabulary so the matrix columns are guaranteed to line up with
    # the model's components, even if df is not the exact training corpus.
    vectorizer = CountVectorizer(vocabulary=feature_names)
    dtm = vectorizer.fit_transform(documents)
    dtm_normalized = normalize(dtm, norm='l1', axis=1)

    coherence_scores = []
    for topic_idx in range(n_topics):
        top_term_ids = topic_terms[topic_idx].argsort()[::-1][:top_n]
        # cosine_similarity compares ROWS, so put one term per row
        # (terms x documents). Passing the raw (documents x 1) columns would
        # compare individual documents instead of the two term vectors.
        term_vectors = dtm[:, top_term_ids].T
        pairwise = cosine_similarity(term_vectors)
        # Strict upper triangle = each unordered term pair exactly once.
        upper = np.triu_indices_from(pairwise, k=1)
        # Plain float keeps the value readable when the list is serialized.
        coherence_scores.append(float(pairwise[upper].sum()))

    # Project all documents in one call instead of one transform per row.
    doc_topic = nmf_model.transform(dtm_normalized)
    dominant_topics = np.argmax(doc_topic, axis=1)
    dominant_counts = np.bincount(dominant_topics, minlength=n_topics)

    topic_dominance_scores = {
        topic_idx: float(dominant_counts[topic_idx]) / n_docs
        for topic_idx in range(n_topics)
    }

    return coherence_scores, topic_dominance_scores

def main(input_folder=r'D:\Research\Python\Data\WIP\Spike Data',
         output_folder=r'D:\Research\Python\Data\WIP\Topic modelling',
         num_topics_nmf=5):
    """Fit an NMF topic model per CSV file and write the metrics to one CSV.

    For every ``.csv`` file in ``input_folder`` an NMF model is trained on the
    file's ``'Text_Lemmatized'`` column, and its per-topic coherence and
    dominance scores are collected. All rows are written to
    ``topic_model_metrics_NMF.csv`` inside ``output_folder``.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.csv`` files.
        output_folder: Directory for the result file; created if missing.
        num_topics_nmf: Number of NMF topics fitted per file.
    """
    # os.listdir gives no ordering guarantee; sort so the output rows are
    # in a reproducible order.
    files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))

    all_results = []

    for filename in files:
        df = pd.read_csv(os.path.join(input_folder, filename))
        # Empty text cells are loaded as NaN, which is not a string the
        # vectorizer can tokenize; drop those rows before modelling.
        df = df.dropna(subset=['Text_Lemmatized'])

        # The document-term matrix returned by training is not needed here.
        nmf_model, feature_names, _ = train_nmf_model(df, num_topics_nmf)

        coherence_nmf, topic_dominance_scores_nmf = (
            calculate_topic_coherence_and_dominance(nmf_model, df, feature_names)
        )

        all_results.append({
            'Dataset': filename,
            'Model': 'NMF',
            'Coherence': coherence_nmf,
            'Topic_Dominance_Scores': topic_dominance_scores_nmf,
        })

    # to_csv does not create missing directories.
    os.makedirs(output_folder, exist_ok=True)
    result_file = os.path.join(output_folder, 'topic_model_metrics_NMF.csv')
    pd.DataFrame(all_results).to_csv(result_file, index=False)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
