In [19]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [20]:
# Open the saved TF-IDF archive. allow_pickle=True permits unpickling of
# object-dtype members, so this should only ever point at a trusted file.
data = np.load('../data/news/tfidf_data.npz', allow_pickle=True)


def _csr_from_archive(archive, data_key, indices_key, indptr_key, shape_key):
    """Rebuild a CSR matrix from the four arrays stored under the given keys."""
    csr_parts = (archive[data_key], archive[indices_key], archive[indptr_key])
    return sparse.csr_matrix(csr_parts, shape=archive[shape_key])


# Sparse TF-IDF matrices for the train and test splits
tfidf_train = _csr_from_archive(
    data, 'tfidf_train_data', 'tfidf_train_indices', 'tfidf_train_indptr', 'tfidf_train_shape'
)
tfidf_test = _csr_from_archive(
    data, 'tfidf_test_data', 'tfidf_test_indices', 'tfidf_test_indptr', 'tfidf_test_shape'
)

# Identifier arrays for both splits, plus the category array of the train split
article_ids_train, article_ids_test = data['article_ids_train'], data['article_ids_test']
categories_train = data['categories_train']

In [21]:
# Sanity check: report the dimensions of both reconstructed matrices,
# then list every array name stored in the archive
for caption, matrix in (("Train matrix shape:", tfidf_train), ("Test matrix shape:", tfidf_test)):
    print(caption, matrix.shape)
print([name for name in data.keys()])

Train matrix shape: (1484, 1000)
Test matrix shape: (735, 1000)
['tfidf_train_data', 'tfidf_train_indices', 'tfidf_train_indptr', 'tfidf_train_shape', 'tfidf_test_data', 'tfidf_test_indices', 'tfidf_test_indptr', 'tfidf_test_shape', 'article_ids_train', 'categories_train', 'article_ids_test']


In [22]:
# Initialize NMF model: 5 components, with random_state=42 passed so the
# model is seeded with a fixed value.
nmf = NMF(n_components=5, random_state=42)
# Fit the model on the training TF-IDF matrix and keep the transformed
# training matrix (one row of component weights per training row).
W_train = nmf.fit_transform(tfidf_train)
# Transform the test matrix with the model fitted above. The model is not
# refitted here, so this line must stay after the fit_transform call.
W_test = nmf.transform(tfidf_test)

In [23]:
# Function to predict dominant topic
def predict(w_matrix):
    """Return, for every row of ``w_matrix``, the column index of its largest entry."""
    dominant_topics = np.argmax(w_matrix, axis=1)
    return dominant_topics

# Dominant NMF component for every training row
predicted_topics_train = predict(W_train)

# Turn the training categories into integer codes; the fitted encoder is kept
# so the codes can be mapped back to the original category values
encoder = LabelEncoder()
categories_encoded = encoder.fit_transform(categories_train)

# Function to find the best permutation of topic labels
def label_permute(y_true, y_pred):
    """Find the topic-to-label assignment that maximizes accuracy.

    Every one-to-one assignment of the distinct values in ``y_true`` to the
    topic indices ``0 .. k-1`` (``k`` = number of distinct true labels) is
    tried, and the most accurate one is returned. When several assignments
    tie, the first one in lexicographic order of the sorted labels wins.

    The search is exhaustive, so its cost grows with ``k!``; it is only
    practical for a small number of labels.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels (any sortable values).
    y_pred : array-like of int, shape (n_samples,)
        Predicted topic index per sample; each value must lie in ``0 .. k-1``.

    Returns
    -------
    best_perm : tuple
        ``best_perm[t]`` is the true label assigned to topic index ``t``.
    best_acc : float
        Fraction of samples whose mapped topic equals the true label.

    Raises
    ------
    ValueError
        If the inputs are empty, not one-dimensional, or differ in length.
    IndexError
        If ``y_pred`` contains an index outside ``0 .. k-1``. Negative indices
        are rejected rather than silently wrapping around.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.ndim != 1 or y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must be 1-D sequences of equal length")
    if y_true.size == 0:
        raise ValueError("label_permute needs at least one sample")

    # Sorted distinct labels, and the position of each sample's label among them.
    labels, true_idx = np.unique(y_true, return_inverse=True)
    n_labels = labels.size
    if y_pred.min() < 0 or y_pred.max() >= n_labels:
        raise IndexError(
            f"y_pred must contain topic indices in 0..{n_labels - 1} "
            "(one per distinct label in y_true)"
        )

    # counts[t, j] = number of samples predicted as topic t whose true label is
    # labels[j]. Built once, so scoring a permutation costs O(k) instead of O(n).
    counts = np.zeros((n_labels, n_labels), dtype=np.int64)
    np.add.at(counts, (y_pred, true_idx), 1)

    topics = np.arange(n_labels)
    best_order, best_hits = None, -1
    # Iterate lazily; there is no need to hold all k! permutations in memory.
    for order in itertools.permutations(range(n_labels)):
        hits = int(counts[topics, list(order)].sum())
        if hits > best_hits:  # strict '>' keeps the first of any tied permutations
            best_order, best_hits = order, hits

    best_perm = tuple(labels[list(best_order)])
    return best_perm, best_hits / y_true.size

In [24]:
# Search the topic-to-category assignments and keep the most accurate one
best_permutation, best_accuracy = label_permute(
    y_true=categories_encoded,
    y_pred=predicted_topics_train,
)

for caption, value in (('Best Permutation:', best_permutation), ('Best Accuracy:', best_accuracy)):
    print(caption, value)

Best Permutation: (2, 3, 1, 0, 4)
Best Accuracy: 0.8928571428571429


In [None]:
# Sweep the number of TruncatedSVD components and compare the 5-fold
# cross-validated accuracy of a logistic-regression classifier on each
# reduced representation.
components = [50, 100, 150, 200]
mean_scores = []

for n in components:
    # Seed the SVD (same seed as the NMF cell) so that re-running the cell
    # reproduces the same scores.
    svd = TruncatedSVD(n_components=n, random_state=42)
    X_train_reduced = svd.fit_transform(tfidf_train)

    # Define the classifier
    classifier = LogisticRegression(max_iter=1000)

    # Perform cross-validation
    scores = cross_val_score(classifier, X_train_reduced, categories_encoded, cv=5, scoring='accuracy')
    mean_score = scores.mean()
    mean_scores.append(mean_score)
    print(f"Mean accuracy for {n} components: {mean_score}")

# NOTE(review): the SVD above is fitted on all training rows before
# cross_val_score splits them, so each validation fold has influenced the
# projection. Wrap SVD + classifier in a Pipeline if strictly leak-free
# estimates are needed.
plt.figure(figsize=(10, 5))
plt.plot(components, mean_scores, marker='o')
plt.title('Effect of Number of SVD Components on Cross-Validated Accuracy')
plt.xlabel('Number of Components')
plt.ylabel('Cross-Validated Accuracy')
plt.grid(True)
plt.show()