In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [10]:
groups = ['alt.atheism', 'rec.sport.baseball', 'sci.space']

# Load the 20 newsgroups dataset, we set it with only 3 categories to save time, you can change it to all categories by removing the 'categories' parameter
train_data = fetch_20newsgroups(shuffle=True, random_state=42, categories=groups)

# Convert text data into a matrix of token counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_data.data)
y = train_data.target

In [11]:
X.shape, y.shape

((1670, 27427), (1670,))

In [12]:
# Define the parameter grid to search
param_grid = {'alpha': [0.1, 0.5, 1.0]}

# Perform GridSearchCV with 5-fold cross-validation
classifier = MultinomialNB()
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X, y)

# Get the best model
best_classifier = grid_search.best_estimator_

# Evaluate the best model using cross-validation
cv_scores = cross_val_score(best_classifier, X, y, cv=5, scoring='f1_macro')
mean_cv_score = cv_scores.mean()
print(f'Mean Cross-Validation Score: {mean_cv_score}')

Mean Cross-Validation Score: 0.9934383662042003


In [13]:
import pandas as pd
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001659,0.000261,0.000437,4.2e-05,0.1,{'alpha': 0.1},0.994012,0.994012,0.98503,0.997006,0.997006,0.993413,0.0044,1
1,0.001192,2.8e-05,0.000394,1.8e-05,0.5,{'alpha': 0.5},0.994012,0.988024,0.982036,0.997006,0.994012,0.991018,0.005356,2
2,0.001446,0.000146,0.000394,1.8e-05,1.0,{'alpha': 1.0},0.994012,0.988024,0.98503,0.997006,0.991018,0.991018,0.004234,2


In [14]:
# Predict on the test set
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=groups)
X_test = vectorizer.transform(test_data.data)
y_test = test_data.target

predictions = best_classifier.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, predictions, average='macro')
print(f'Test Set F1 Score: {f1}')

Test Set F1 Score: 0.9751950960932954


In [15]:
# A simple toy classifier
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        # extract only the last time step
        return self.fc(output[:, -1, :])

In [16]:
# Define hyperparameters
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab_size = len(vectorizer.get_feature_names_out())
embedding_dim = 100
hidden_dim = 128
output_dim = len(np.unique(y))
batch_size = 8
training_epochs = 3

# Perform 5-fold cross-validation with early stop
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    # Convert data to tensors
    X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.long)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_val_tensor = torch.tensor(X_val.toarray(), dtype=torch.long)
    y_val_tensor = torch.tensor(y_val, dtype=torch.long)
    
    # DataLoader
    train_data = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_data, batch_size=batch_size)
    val_data = TensorDataset(X_val_tensor, y_val_tensor)
    val_loader = DataLoader(val_data, batch_size=1)
    
    model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    
    model.train()
    for epoch in range(training_epochs):
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data.to(device))
            loss = criterion(output.to(device), target.to(device))
            loss.backward()
            
            # Clip gradients
            torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)
            optimizer.step()

    model.eval()
    with torch.no_grad():
        truth = []
        preds = []
        val_loss = []
        for data, target in val_loader:
            output = model(data.to(device))
            _, predicted = torch.max(output, 1)
            preds.append(predicted.detach().cpu().numpy())
            truth.append(target.numpy())
            val_loss.append(criterion(output.to(device), target.to(device)).item())
        f1 = f1_score(np.concatenate(truth), np.concatenate(preds), average='macro')
        print(f'Split{i+1}\nLoss: {np.mean(val_loss)}\nF1 score: {f1}')

Split1
Loss: 1.0932898935443627
F1 score: 0.17512877115526124
Split2
Loss: 1.0915464405528086
F1 score: 0.1867816091954023
Split3
Loss: 1.0942164770143474
F1 score: 0.1783625730994152
Split4
Loss: 1.0984142072900327
F1 score: 0.17074981440237566
Split5
Loss: 1.0937744421873263
F1 score: 0.1762114537444934


In [17]:
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_data, batch_size=1)
model.eval()
with torch.no_grad():
    truth = []
    preds = []
    for data, target in test_loader:
        output = model(data.to(device))
        _, predicted = torch.max(output, 1)
        preds.append(predicted.detach().cpu().numpy())
        truth.append(target.numpy())
    f1 = f1_score(np.concatenate(truth), np.concatenate(preds), average='macro')
    print(f'F1 score: {f1}')

F1 score: 0.17464539007092197
