In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Fetch the 20 Newsgroups corpus (subset='all'), asking the loader to strip
# headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# `documents` holds the post texts, `labels` the matching newsgroup targets.
documents, labels = newsgroups.data, newsgroups.target

In [3]:
# Bag-of-words term counts. Vocabulary pruning: English stop words are removed,
# as are terms appearing in more than 95% of documents (max_df) or in fewer
# than 2 documents (min_df).
count_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
doc_term_matrix = count_vectorizer.fit_transform(documents)

In [4]:
# How many latent topics the model should learn
n_topics = 20

# Build the LDA estimator (seeded so repeated runs give the same topics),
# then fit it on the document-term counts.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=0,
)
lda.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=20, random_state=0)

In [6]:
from collections import defaultdict
from scipy import stats

# Per-document topic weights from the fitted model; each document is assigned
# to the topic with the largest weight in its row.
topic_assignments = lda.transform(doc_term_matrix)
dominant_topics = topic_assignments.argmax(axis=1)

# Contingency counts: topic_label_mapping[topic][label] -> number of documents
# whose dominant topic is `topic` and whose true label is `label`.
topic_label_mapping = defaultdict(lambda: defaultdict(int))
for topic_id, true_label in zip(dominant_topics, labels):
    label_counts = topic_label_mapping[topic_id]
    label_counts[true_label] = label_counts[true_label] + 1

# For every topic that received at least one document, pick the true label
# that occurs most often among its documents.
mapped_labels = {}
for topic_id, label_counts in topic_label_mapping.items():
    mapped_labels[topic_id] = max(label_counts, key=label_counts.get)

# Translate each document's dominant topic into that majority label.
assigned_labels = list(map(mapped_labels.__getitem__, dominant_topics))

# Fraction of documents whose majority-mapped label equals the true label.
# NOTE: the topic->label mapping is built from the same `labels` it is scored
# against, so this is an in-sample figure (equivalent to cluster purity), not
# a held-out classification accuracy.
is_match = np.array(assigned_labels) == np.array(labels)
accuracy = is_match.mean()
print(f"Adjusted Model Accuracy: {accuracy}")


Adjusted Model Accuracy: 0.34490077470020164
