# In [1]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from sklearn.decomposition import TruncatedSVD, NMF

def train_lda_model(data, num_topics=5, random_state=42):
    """Train a gensim LDA topic model on whitespace-tokenised documents.

    Args:
        data: Table-like object (e.g. a pandas DataFrame) whose
            ``'Text_Lemmatized'`` column holds one document per row as a
            whitespace-separated string.
        num_topics: Number of latent topics to fit. Defaults to 5.
        random_state: Seed forwarded to ``LdaModel``. Defaults to 42.

    Returns:
        The fitted ``LdaModel``. Its ``id2word`` attribute is the
        ``Dictionary`` built here from the same documents.
    """
    # Tokenise once and reuse the tokens for both the vocabulary and the
    # bag-of-words corpus.
    tokenized_docs = [doc.split() for doc in data['Text_Lemmatized'].tolist()]
    id2word = Dictionary(tokenized_docs)
    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]
    return LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=random_state,
    )

def calculate_jaccard_similarity(topic_words1, topic_words2):
    """Return the Jaccard index of two word collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``len(intersection) / len(union)``; when both
    collections are empty the union is empty and ``0.0`` is returned.
    """
    first, second = set(topic_words1), set(topic_words2)
    combined = first | second
    if not combined:
        return 0.0
    shared = first & second
    return len(shared) / len(combined)

def calculate_topic_diversity(lda_model, topn=10):
    """Return the mean pairwise Jaccard similarity of the topics' top words.

    For every unordered pair of topics, the ``topn`` highest-weighted words of
    each topic are compared with ``calculate_jaccard_similarity`` and the
    scores are averaged.

    NOTE(review): the value is a *similarity*: 0.0 means no two topics share
    a top word and 1.0 means every topic has the same top-word set, so a
    higher score indicates less diverse topics. Confirm that downstream
    consumers of the "diversity" figure read it that way.

    Args:
        lda_model: Model exposing ``num_topics``, ``get_topic_terms`` and an
            ``id2word`` mapping from word id to word.
        topn: Number of top words taken per topic. Defaults to 10.

    Returns:
        The average pairwise score as a float, or ``0.0`` when the model has
        fewer than two topics (no pairs to compare).
    """
    # Resolve each topic's top words once, not once per pair.
    top_words_per_topic = [
        [lda_model.id2word[word_id] for word_id, _ in lda_model.get_topic_terms(topic, topn=topn)]
        for topic in range(lda_model.num_topics)
    ]

    # Each unordered pair (i, j) with i < j is scored exactly once.
    pair_scores = [
        calculate_jaccard_similarity(words_a, words_b)
        for idx, words_a in enumerate(top_words_per_topic)
        for words_b in top_words_per_topic[idx + 1:]
    ]

    if not pair_scores:
        return 0.0
    return sum(pair_scores) / len(pair_scores)


def calculate_topic_coherence(lda_model, df):
    """Build c_v, u_mass and c_uci coherence evaluators for an LDA model.

    The reference texts, dictionary and bag-of-words corpus are derived from
    the ``'Text_Lemmatized'`` column of ``df`` (whitespace-separated
    documents). Topics are represented by the model's top 10 words each.

    Args:
        lda_model: Model exposing ``num_topics``, ``get_topic_terms`` and an
            ``id2word`` mapping from word id to word.
        df: Table-like object with a ``'Text_Lemmatized'`` column.

    Returns:
        A tuple ``(c_v_model, u_mass_model, c_uci_model)`` of
        ``CoherenceModel`` objects. No score is computed here; callers obtain
        each score with ``.get_coherence()``.
    """
    # Tokenise once; the same token lists feed the dictionary, the corpus
    # and the text-based coherence measures.
    tokenized_docs = [doc.split() for doc in df['Text_Lemmatized'].tolist()]
    id2word = Dictionary(tokenized_docs)
    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

    # Represent every topic as a list of its top words (strings, not ids).
    topics_words = [
        [lda_model.id2word[word_id] for word_id, _ in lda_model.get_topic_terms(topic, topn=10)]
        for topic in range(lda_model.num_topics)
    ]

    # c_v and c_uci are estimated from the raw texts, u_mass from the corpus.
    coherence_model_c_v = CoherenceModel(
        topics=topics_words, texts=tokenized_docs, dictionary=id2word, coherence='c_v'
    )
    coherence_model_u_mass = CoherenceModel(
        topics=topics_words, corpus=corpus, dictionary=id2word, coherence='u_mass'
    )
    coherence_model_c_uci = CoherenceModel(
        topics=topics_words, texts=tokenized_docs, dictionary=id2word, coherence='c_uci'
    )

    return coherence_model_c_v, coherence_model_u_mass, coherence_model_c_uci




def calculate_topic_dominance(lda_model, df):
    """Return each topic's average probability across the documents in ``df``.

    Every document in the ``'Text_Lemmatized'`` column is split on whitespace,
    encoded as a bag of words and passed to ``get_document_topics``; the
    per-topic probabilities are then averaged over all documents.

    Args:
        lda_model: Model exposing ``num_topics``, ``get_document_topics`` and
            an ``id2word`` dictionary with a ``doc2bow`` method.
        df: Table-like object with a ``'Text_Lemmatized'`` column.

    Returns:
        Dict mapping topic index to its mean probability. Every score is
        ``0.0`` when ``df`` contains no documents.
    """
    topic_dominance_scores = {topic_idx: 0.0 for topic_idx in range(lda_model.num_topics)}

    documents = df['Text_Lemmatized'].tolist()
    if not documents:
        # Nothing to average; avoid dividing by zero.
        return topic_dominance_scores

    # Encode with the model's own dictionary so that word ids match the ones
    # the model was trained with, even when ``df`` is not the training data.
    id2word = lda_model.id2word
    for doc in documents:
        doc_bow = id2word.doc2bow(doc.split())
        # minimum_probability=0 asks for every topic, not only the likely ones.
        for topic_idx, prob in lda_model.get_document_topics(doc_bow, minimum_probability=0):
            topic_dominance_scores[topic_idx] += prob

    total_documents = len(documents)
    for topic_idx in topic_dominance_scores:
        topic_dominance_scores[topic_idx] /= total_documents

    return topic_dominance_scores



def main():
    """Fit an LDA model per input CSV and write all metrics to one CSV file.

    For every ``.csv`` file in the input folder this trains a model with
    ``train_lda_model``, evaluates the likelihood bound, three coherence
    measures, the top-word overlap score and per-topic dominance, and
    collects one result row per file. The rows are saved as
    ``topic_model_metrics_LDA.csv`` in the output folder.
    """
    input_folder = r'D:\Research\Python\Data\WIP\Spike Data'
    output_folder = r'D:\Research\Python\Data\WIP\Topic modelling'

    # Sorted so that the row order of the result file is reproducible.
    files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))

    all_results = []

    for filename in files:
        file_path = os.path.join(input_folder, filename)

        # Load data from CSV
        df = pd.read_csv(file_path)

        # LDA
        lda_model = train_lda_model(df)

        # Encode the documents with the dictionary the model was trained on.
        corpus = [lda_model.id2word.doc2bow(doc.split()) for doc in df['Text_Lemmatized'].tolist()]

        # NOTE: gensim's log_perplexity returns the per-word likelihood bound
        # (perplexity itself is 2 ** -bound); the bound is stored unchanged.
        perplexity_score_lda = lda_model.log_perplexity(corpus)

        coherence_model_c_v, coherence_model_u_mass, coherence_model_c_uci = calculate_topic_coherence(lda_model, df)

        # Create a dictionary to store the results
        all_results.append({
            'Dataset': filename,
            'Perplexity_Score': perplexity_score_lda,
            'Coherence_CV': coherence_model_c_v.get_coherence(),
            'Coherence_U_Mass': coherence_model_u_mass.get_coherence(),
            'Coherence_C_UCI': coherence_model_c_uci.get_coherence(),
            'Topic_Diversity': calculate_topic_diversity(lda_model),
            'Topic_Dominance_Scores': calculate_topic_dominance(lda_model, df),
        })

    # Save all results to a single CSV file, creating the folder if needed.
    os.makedirs(output_folder, exist_ok=True)
    result_df = pd.DataFrame(all_results)
    result_file = os.path.join(output_folder, 'topic_model_metrics_LDA.csv')
    result_df.to_csv(result_file, index=False)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()