### HyperParameter Tuning
It is important to tune the hyperparameters of a machine learning model to get the best performance. In this notebook, we will use the 20 newgroups dataset to tune our model's hyperparameters. It might take some time to download the dataset and preprocess it.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups

groups = ['alt.atheism', 'rec.sport.baseball', 'sci.space']

# Load the 20 newsgroups dataset, we set it with only 3 categories to save time, you can change it to all categories by removing the 'categories' parameter
train_data = fetch_20newsgroups(shuffle=True, random_state=42, categories=groups)

# Convert text data into a matrix of token counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_data.data)
y = train_data.target

In [2]:
X.shape, y.shape

((1670, 27427), (1670,))

### 1. K-Fold Cross-Validation and Grid Search

In [3]:
# Define the parameter grid to search
param_grid = {'alpha': [0.1, 0.5, 1.0]}

# Perform GridSearchCV with 5-fold cross-validation
classifier = MultinomialNB()
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X, y)

# Get the best model
best_classifier = grid_search.best_estimator_

# Evaluate the best model using cross-validation
cv_scores = cross_val_score(best_classifier, X, y, cv=5, scoring='f1_macro')
mean_cv_score = cv_scores.mean()
print(f'Mean Cross-Validation Score: {mean_cv_score}')

Mean Cross-Validation Score: 0.9934383662042003


In [4]:
import pandas as pd
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003577,0.000796,0.0011,0.000193,0.1,{'alpha': 0.1},0.994012,0.994012,0.98503,0.997006,0.997006,0.993413,0.0044,1
1,0.00357,0.000204,0.000694,0.000241,0.5,{'alpha': 0.5},0.994012,0.988024,0.982036,0.997006,0.994012,0.991018,0.005356,2
2,0.003577,0.000371,0.000596,0.000197,1.0,{'alpha': 1.0},0.994012,0.988024,0.98503,0.997006,0.991018,0.991018,0.004234,2


In [5]:
# Predict on the test set
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=groups)
X_test = vectorizer.transform(test_data.data)
y_test = test_data.target

predictions = best_classifier.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, predictions, average='macro')
print(f'Test Set F1 Score: {f1}')

Test Set F1 Score: 0.9751950960932954


### 2. Hyperparameters of a neural network
Note that you need to modify the hyperparameters if your device can't train the network efficiently in this notebook (to get a taste of the workflow). The training pipeline is built using a toy classifier and a small dataset, so it may not be the best choice for a real-world application.

In [6]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [7]:
# A simple toy classifier
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        # extract only the last time step
        return self.fc(output[:, -1, :])

In [None]:
# Define hyperparameters
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocab_size = len(vectorizer.get_feature_names_out())
embedding_dim = 100
hidden_dim = 128
output_dim = len(np.unique(y))
batch_size = 8
training_epochs = 3

# Perform 5-fold cross-validation with early stop
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    # Convert data to tensors
    X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.long)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_val_tensor = torch.tensor(X_val.toarray(), dtype=torch.long)
    y_val_tensor = torch.tensor(y_val, dtype=torch.long)
    
    # DataLoader
    train_data = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_data, batch_size=batch_size)
    val_data = TensorDataset(X_val_tensor, y_val_tensor)
    val_loader = DataLoader(val_data, batch_size=1)
    
    model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    
    model.train()
    for epoch in range(training_epochs):
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data.to(device))
            loss = criterion(output.to(device), target.to(device))
            loss.backward()
            
            # Clip gradients
            torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)
            optimizer.step()

    model.eval()
    with torch.no_grad():
        truth = []
        preds = []
        val_loss = []
        for data, target in val_loader:
            output = model(data.to(device))
            _, predicted = torch.max(output, 1)
            preds.append(predicted.detach().cpu().numpy())
            truth.append(target.numpy())
            val_loss.append(criterion(output.to(device), target.to(device)).item())
        f1 = f1_score(np.concatenate(truth), np.concatenate(preds), average='macro')
        print(f'Split{i+1}\nLoss: {np.mean(val_loss)}\nF1 score: {f1}')

Split1
Loss: 1.0938678148263943
F1 score: 0.1762114537444934
Split2
Loss: 1.0923432218814324
F1 score: 0.17185185185185184
Split3
Loss: 1.0945039937596122
F1 score: 0.1729490022172949
Split4
Loss: 1.098829667725249
F1 score: 0.16853094705443697
Split5
Loss: 1.0940548424235361
F1 score: 0.1762114537444934


In [9]:
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_data, batch_size=1)
model.eval()
with torch.no_grad():
    truth = []
    preds = []
    for data, target in test_loader:
        output = model(data.to(device))
        _, predicted = torch.max(output, 1)
        preds.append(predicted.detach().cpu().numpy())
        truth.append(target.numpy())
    f1 = f1_score(np.concatenate(truth), np.concatenate(preds), average='macro')
    print(f'F1 score: {f1}')

F1 score: 0.17464539007092197


### Practice for the Week
You have learnt ways to select the hyper-parameters for a machine learning model. Here's the early stopping function based on the loss function. When the difference between the current and the previous loss is less than `delta` (`mode`=`min`), the early stopping function will restore the best network weights and train until `n` patience epochs, before the training stopped. You can modify the early stopping function to suit your needs or replace it with the PyTorch EarlyStopping class from `ignite.handlers`. Apply it to training pipeline. The LSTMClassifier is a very simple toy classifier, improve it by having a better network architecture. Define the hyperparameters that affects the LSTMClassifier and improve the overall f1 score of the test data (20 categories). 

In [10]:
from ignite.handlers import EarlyStopping

In [11]:
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0, restore_best_weight=True, mode='min'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.restore_best_weight = restore_best_weight
        self.other_metric = None
        self.loop = -1
        self.mode = mode
        self.best_state = None

    def __call__(self, model, val_loss, loop, other_metric=None):
        if self.mode == 'min':
            score = -val_loss
        else:
            score = val_loss
            
        if self.best_score is None:
            self.best_score = score
            self.other_metric = other_metric
            self.loop = loop
            self.best_state = model.state_dict()
            self.update_loss(val_loss)
            return True
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            print('Restore best weight...')
            if self.restore_best_weight:
                model.load_state_dict(self.best_state)

            if self.counter >= self.patience:
                self.early_stop = True
                print('')
                print(f'EarlyStopping at {loop}! Val Loss:{self.val_loss_min}. Other Metrics:{self.other_metric} from epoch {self.loop}')
            return False
        else:
            self.best_score = score
            self.other_metric = other_metric
            self.loop = loop
            self.best_state = model.state_dict()
            self.update_loss(val_loss)
            self.counter = 0
            return True

    def update_loss(self, val_loss):
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model checkpoint...')
        self.val_loss_min = val_loss
        
stopper = EarlyStopping(patience=2, delta=0.001, verbose=True)

In [12]:
# Hint: modify the network architecture. eg., add dropout, activation functions, more layers, number of training epochs, batch size, etc.
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        # extract only the last time step
        return self.fc(output[:, -1, :])
    
# Build the training pipeline as your preference
def train():
    pass