# In [7]:  (Jupyter notebook cell prompt left over from the export)
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def train_lsa_model(data, num_topics):
    """Fit an LSA (truncated SVD) model on raw term counts.

    Args:
        data: Table-like object whose ``'Text_Lemmatized'`` column supports
            ``.tolist()`` and yields one text document per row.
        num_topics: Number of SVD components (topics) to extract.

    Returns:
        A ``(model, feature_names, dtm)`` tuple: the fitted ``TruncatedSVD``,
        the vectorizer's vocabulary as returned by
        ``get_feature_names_out()``, and the document-term count matrix the
        model was fitted on.
    """
    documents = data['Text_Lemmatized'].tolist()

    # Bag-of-words counts; vocabulary is learned from these documents.
    bow_vectorizer = CountVectorizer()
    doc_term_matrix = bow_vectorizer.fit_transform(documents)

    # Fixed seed keeps the decomposition reproducible between runs.
    svd = TruncatedSVD(n_components=num_topics, random_state=42)
    svd.fit(doc_term_matrix)

    vocabulary = bow_vectorizer.get_feature_names_out()
    return svd, vocabulary, doc_term_matrix

def calculate_jaccard_similarity(topic_words1, topic_words2):
    """Return the Jaccard similarity of two word collections.

    Both arguments are reduced to sets, so duplicates and ordering are
    ignored. The result is ``|A & B| / |A | B|``, or ``0.0`` when both
    collections are empty.
    """
    first, second = set(topic_words1), set(topic_words2)
    combined = first | second
    if not combined:
        # Two empty collections: define the similarity as 0 instead of 0/0.
        return 0.0
    return len(first & second) / len(combined)
    

def calculate_topic_diversity(lda_model):
    """Return the mean pairwise Jaccard similarity of the topics' top-10 words.

    NOTE: despite the function name, the value returned is an average
    *similarity* between topics, so a lower value means the topics share
    fewer top words (i.e. are more diverse).

    Args:
        lda_model: Object exposing ``num_topics``,
            ``get_topic_terms(topic, topn=...)`` yielding ``(word_id, weight)``
            pairs, and an ``id2word`` mapping from word id to word.

    Returns:
        The average Jaccard similarity over all unordered topic pairs, or
        ``0.0`` when there are fewer than two topics.
    """
    id2word = lda_model.id2word

    # Resolve each topic's top words once, instead of redoing the id -> word
    # lookup for both topics of every pair inside the nested loop.
    topic_word_lists = [
        [id2word[word_id] for word_id, _ in lda_model.get_topic_terms(topic, topn=10)]
        for topic in range(lda_model.num_topics)
    ]

    # One similarity per unordered pair (i < j).
    pair_scores = [
        calculate_jaccard_similarity(words_a, words_b)
        for idx, words_a in enumerate(topic_word_lists)
        for words_b in topic_word_lists[idx + 1:]
    ]

    if not pair_scores:
        return 0.0
    return sum(pair_scores) / len(pair_scores)

def calculate_topic_coherence(lsa_model, dtm, feature_names, num_topics, top_n=10):
    """Score each topic by the summed pairwise cosine similarity of its top terms.

    For every topic, the ``top_n`` highest-weighted terms are taken from
    ``lsa_model.components_``. Each term is represented by its column of
    ``dtm`` (its counts across all documents), and the topic's score is the
    sum of the cosine similarities over all unordered pairs of those columns.

    The similarity is computed between whole term columns. Passing
    ``(n_docs, 1)`` column vectors straight to a row-wise pairwise cosine
    routine would instead compare the two terms' counts in the first
    document only, which is why the columns are normalized explicitly here.

    Args:
        lsa_model: Fitted model exposing ``components_`` (topics x terms).
        dtm: Document-term matrix (documents x terms); a SciPy sparse matrix
            or a dense array.
        feature_names: Vocabulary aligned with the columns of ``dtm``. Kept
            for interface compatibility; columns are addressed by index, so
            the names themselves are not needed for the computation.
        num_topics: Number of leading topics (rows of ``components_``) to score.
        top_n: Number of top terms per topic to compare. Defaults to 10.

    Returns:
        A list with one float score per topic.
    """
    coherence_scores = []

    for topic_idx in range(num_topics):
        # Indices of the highest-weighted terms, in descending weight order.
        top_term_ids = np.argsort(lsa_model.components_[topic_idx])[::-1][:top_n]

        # Only the selected columns are densified: (n_docs x top_n) at most.
        term_columns = dtm[:, top_term_ids]
        if hasattr(term_columns, 'toarray'):
            term_columns = term_columns.toarray()
        term_columns = np.asarray(term_columns, dtype=float)

        # Scale every column to unit length. All-zero columns keep a divisor
        # of 1 so they stay zero and contribute a similarity of 0.
        norms = np.sqrt((term_columns ** 2).sum(axis=0))
        norms[norms == 0] = 1.0
        unit_columns = term_columns / norms

        # Gram matrix of unit columns == pairwise cosine similarities.
        similarities = unit_columns.T @ unit_columns

        # Strict upper triangle: each unordered pair exactly once.
        coherence_scores.append(float(np.triu(similarities, k=1).sum()))

    return coherence_scores


def calculate_topic_dominance(lsa_model, df, num_topics):
    """Return, per topic, the fraction of documents for which it is dominant.

    A document's dominant topic is the LSA component with the largest value
    in the document's projection (``np.argmax`` over the signed component
    values).

    NOTE(review): the vectorizer is re-fitted on ``df`` here, so the columns
    only line up with the features ``lsa_model`` was trained on when ``df``
    is the same corpus used for training -- confirm against the caller.

    Args:
        lsa_model: Fitted model exposing ``transform``.
        df: Table-like object with a ``'Text_Lemmatized'`` text column.
        num_topics: Number of topics; the result has keys ``0..num_topics-1``.

    Returns:
        Dict mapping topic index to the share of documents (0.0-1.0) whose
        dominant topic it is.
    """
    # Bag-of-words counts, L1-normalized per document (term proportions).
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(df['Text_Lemmatized'].tolist())
    dtm_normalized = normalize(dtm, norm='l1', axis=1)

    # Project all documents in a single call instead of one transform per
    # row; each row of the result is the same as the per-document projection.
    doc_representations = lsa_model.transform(dtm_normalized)
    dominant_topics = np.argmax(doc_representations, axis=1)

    topic_counts = {topic_idx: 0 for topic_idx in range(num_topics)}
    for topic_idx in dominant_topics:
        topic_counts[int(topic_idx)] += 1

    total_documents = len(dominant_topics)
    return {
        topic_idx: count / total_documents
        for topic_idx, count in topic_counts.items()
    }

def main():
    """Compute LSA metrics for every CSV in the input folder and save them.

    Each CSV is loaded, a 2-topic LSA model is trained on it, and its
    per-topic coherence scores and topic-dominance shares are collected.
    All rows are written to ``topic_model_metrics.csv`` in the output folder.
    The coherence list and dominance dict are stored as single cell values.
    """
    input_folder = r'D:\Research\Python\Data\WIP\Spike Data'
    output_folder = r'D:\Research\Python\Data\WIP\Topic modelling'

    # Make sure the destination exists before doing any work, so a missing
    # folder cannot discard the results after every file has been processed.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the rows of the result file come out in a stable order.
    files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))

    num_topics_lsa = 2  # Choose the desired number of topics for LSA
    all_results = []

    for filename in files:
        # Load data from CSV
        df = pd.read_csv(os.path.join(input_folder, filename))

        # LSA
        lsa_model, feature_names, dtm = train_lsa_model(df, num_topics_lsa)
        coherence_lsa = calculate_topic_coherence(lsa_model, dtm, feature_names, num_topics_lsa)
        topic_dominance_scores_lsa = calculate_topic_dominance(lsa_model, df, num_topics_lsa)

        all_results.append({
            'Dataset': filename,
            'Model': 'LSA',
            'Coherence': coherence_lsa,
            'Topic_Dominance_Scores': topic_dominance_scores_lsa,
        })

    # Save all results to a single CSV file
    result_df = pd.DataFrame(all_results)
    result_file = os.path.join(output_folder, 'topic_model_metrics.csv')
    result_df.to_csv(result_file, index=False)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

