In [31]:
# Matrix Factorization Model

# This notebook focuses on building and evaluating models for the BBC news classification task. We will explore two approaches:
# 1. Non-Negative Matrix Factorization (NMF) combined with Logistic Regression.
# 2. Logistic Regression using TF-IDF features.

# We will compare the performance of these models on a validation set and prepare a submission for the test set.


In [32]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [33]:
# Load the preprocessed TF-IDF data from the .npz archive and rebuild the
# sparse train/test matrices from their stored CSR components.
from scipy.sparse import csr_matrix

npz_file_path = '../data/news/tfidf_data.npz'

# SECURITY NOTE: allow_pickle=True lets np.load unpickle object arrays, and
# unpickling can execute arbitrary code. Only load archives that were produced
# locally / come from a trusted source.
# NOTE(review): presumably required because some stored arrays are object
# dtype -- confirm, and switch to allow_pickle=False if they are not.
#
# The context manager closes the underlying file handle as soon as every array
# has been read; `data` remains available afterwards as a plain dict that maps
# the same keys to the already-loaded arrays.
with np.load(npz_file_path, allow_pickle=True) as npz_archive:
    data = {key: npz_archive[key] for key in npz_archive.files}

# CSR components of the training matrix
tfidf_train_data = data['tfidf_train_data']
tfidf_train_indices = data['tfidf_train_indices']
tfidf_train_indptr = data['tfidf_train_indptr']
tfidf_train_shape = data['tfidf_train_shape']

# CSR components of the test matrix
tfidf_test_data = data['tfidf_test_data']
tfidf_test_indices = data['tfidf_test_indices']
tfidf_test_indptr = data['tfidf_test_indptr']
tfidf_test_shape = data['tfidf_test_shape']

# Article ids and training labels stored alongside the matrices
article_ids_train = data['article_ids_train']
categories_train = data['categories_train']
article_ids_test = data['article_ids_test']

# Reconstruct the sparse matrices from (data, indices, indptr) + shape
tfidf_train = csr_matrix((tfidf_train_data, tfidf_train_indices, tfidf_train_indptr), shape=tfidf_train_shape)
tfidf_test = csr_matrix((tfidf_test_data, tfidf_test_indices, tfidf_test_indptr), shape=tfidf_test_shape)

# Fail early, with a clear message, on row-count mismatches that would
# otherwise only surface later when splitting the data or building the
# submission frame.
assert tfidf_train.shape[0] == len(categories_train), \
    "tfidf_train row count does not match the number of training labels"
assert tfidf_test.shape[0] == len(article_ids_test), \
    "tfidf_test row count does not match the number of test article ids"



In [34]:
# Hold out 20% of the labelled rows as a validation set; the fixed seed keeps
# the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    tfidf_train,
    categories_train,
    test_size=0.2,
    random_state=42,
)


In [35]:
# Convert the category labels to integer class ids. The encoder is fitted on
# the training labels only and then reused, unchanged, for the validation labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)


In [36]:
# --- Model 1: NMF features + Logistic Regression ---

# Factorize the training TF-IDF matrix into 20 non-negative components.
nmf = NMF(n_components=20, random_state=42)
W_train = nmf.fit_transform(X_train)  # transformed training rows
H = nmf.components_                   # learned components

# Project the validation rows with the already-fitted factorization
# (no refitting on validation data).
W_val = nmf.transform(X_val)

# Fit the classifier on the NMF representation of the training rows.
lr = LogisticRegression(max_iter=1000).fit(W_train, y_train_encoded)

# Predict encoded classes for the validation rows, then map them back to the
# original category labels.
y_val_pred_nmf = lr.predict(W_val)
y_val_pred_nmf_labels = label_encoder.inverse_transform(y_val_pred_nmf)


In [37]:
# --- Model 2: Logistic Regression directly on the TF-IDF features ---

# Fit the classifier on the sparse training matrix, without any factorization.
lr_tfidf = LogisticRegression(max_iter=1000).fit(X_train, y_train_encoded)

# Predict encoded classes for the validation rows, then map them back to the
# original category labels.
y_val_pred_lr = lr_tfidf.predict(X_val)
y_val_pred_lr_labels = label_encoder.inverse_transform(y_val_pred_lr)


In [38]:
# Print validation accuracy and the per-class report for each model, NMF first
# and TF-IDF second, comparing predicted labels against the true labels.
for model_name, predicted_labels in (
    ("NMF + Logistic Regression", y_val_pred_nmf_labels),
    ("TF-IDF + Logistic Regression", y_val_pred_lr_labels),
):
    print(model_name)
    print("Accuracy:", accuracy_score(y_val, predicted_labels))
    print(classification_report(y_val, predicted_labels))


NMF + Logistic Regression
Accuracy: 0.9023569023569024
               precision    recall  f1-score   support

     business       0.92      0.96      0.94        69
entertainment       0.93      0.79      0.85        47
     politics       0.84      0.89      0.86        54
        sport       0.91      0.92      0.91        75
         tech       0.92      0.92      0.92        52

     accuracy                           0.90       297
    macro avg       0.90      0.90      0.90       297
 weighted avg       0.90      0.90      0.90       297

TF-IDF + Logistic Regression
Accuracy: 0.9663299663299664
               precision    recall  f1-score   support

     business       0.94      0.99      0.96        69
entertainment       1.00      0.91      0.96        47
     politics       0.94      0.93      0.93        54
        sport       0.99      1.00      0.99        75
         tech       0.96      0.98      0.97        52

     accuracy                           0.97       297
  

In [39]:
# Destination of the submission CSV
submission_file_path = '../data/news/bbc_news_classification_submission.csv'

# Score the test matrix with the chosen model (TF-IDF + Logistic Regression in
# this example) and map the encoded predictions back to category labels.
y_test_pred = lr_tfidf.predict(tfidf_test)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# One row per test article: its id and the predicted category
submission_df = pd.DataFrame(
    {
        'ArticleId': article_ids_test,
        'Category': y_test_pred_labels,
    }
)

# Write the submission without the DataFrame index column
submission_df.to_csv(submission_file_path, index=False)
print("Submission file saved at:", submission_file_path)


Submission file saved at: ../data/news/bbc_news_classification_submission.csv
