In [1]:
%load_ext autoreload
%autoreload 2

import os

# Keep stepping to the parent directory until the working directory's path no
# longer contains the substring "notebooks".
while "notebooks" in os.getcwd():
    os.chdir(os.pardir)

import torch
import torch.nn as nn
from datasets import load_dataset
from numpy import array_equal
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torchmetrics import Precision

from mim_nlp.classifier.nn import NNClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the installed PyTorch version string.
torch.__version__

'2.0.0+cpu'

In [3]:
# Display whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

False

# Loading an open-source dataset

In [4]:
# Load the dataset registered under the name "imdb" via `datasets.load_dataset`.
dataset = load_dataset("imdb")

In [5]:
# Display the loaded object to inspect its splits, features and row counts.
dataset

DatasetDict({
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [6]:
# Carve a hold-out set from the labelled "train" split: 20 000 examples go to
# x_train / y_train, the remainder to x_val / y_val. The fixed random_state
# makes the split repeatable.
train_texts = dataset["train"]["text"]
train_labels = dataset["train"]["label"]
x_train, x_val, y_train, y_val = train_test_split(train_texts, train_labels, train_size=20000, random_state=0)

# Training the model on the dataset

In [7]:
class MLP(nn.Module):
    """Small feed-forward network producing one raw score per input row.

    Layer order: dropout -> Linear(input_size, 64) -> ReLU -> dropout -> Linear(64, 1).
    No activation is applied after the last linear layer.
    """

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: Number of features in each input vector.
        """
        super().__init__()

        # One dropout module is reused at both dropout positions in forward().
        self.dropout = nn.Dropout(p=0.2)
        self.dense = nn.Linear(in_features=input_size, out_features=64)
        self.relu = nn.ReLU()
        self.out_proj = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Run the network on `x` and return the output of the final linear layer."""
        hidden = self.relu(self.dense(self.dropout(x)))
        return self.out_proj(self.dropout(hidden))

In [8]:
def accuracy(y_pred, y_target):
    """Fraction of entries where the thresholded prediction equals the target.

    Args:
        y_pred: Tensor of scores; an entry counts as positive when it is
            strictly greater than 0.5.
            NOTE(review): presumably probabilities rather than logits -
            confirm what NNClassifier passes to metric callables.
        y_target: Tensor of reference labels, compared element-wise against
            the thresholded predictions.

    Returns:
        Tensor holding the number of matches divided by ``len(y_target)``.
    """
    predicted_positive = y_pred > 0.5
    matches = y_target == predicted_positive
    return torch.sum(matches) / len(y_target)

In [10]:
# TF-IDF over unigrams and bigrams; the size of the fitted vocabulary is stored
# as `input_size`.
# NOTE(review): the vectorizer is fitted on the whole "train" split, which
# includes the rows held out as x_val - confirm this is acceptable.
tokenizer = TfidfVectorizer(sublinear_tf=True, min_df=0.01, max_df=0.5, ngram_range=(1, 2)).fit(
    dataset["train"]["text"]
)
input_size = len(tokenizer.vocabulary_)
print(input_size)

3514


In [11]:
# Keyword arguments unpacked into the NNClassifier constructor.
MODEL_PARAMS = dict(
    batch_size=256,
    epochs=2,
    optimizer_params=dict(lr=1e-4),
    device="cpu",
    many_gpus=False,
)
# Metric callables keyed by display name, grouped into train and eval sets.
# NOTE(review): BINARY_METRICS is not passed to NNClassifier in the cells
# visible here - confirm whether it should be.
BINARY_METRICS = dict(
    train_metrics_dict=dict(accuracy=accuracy),
    eval_metrics_dict=dict(accuracy=accuracy),
)

In [12]:
# Seed PyTorch's global RNG before the network is built so that weight
# initialisation and dropout masks are repeatable across a fresh
# "Restart Kernel -> Run All". The value matches the random_state used for the
# train/validation split.
torch.manual_seed(0)
# Wrap the MLP and the fitted TF-IDF vectorizer in an NNClassifier, then train
# on the training portion of the split.
model = NNClassifier(**MODEL_PARAMS, input_size=input_size, neural_network=MLP(input_size), tokenizer=tokenizer)
model.fit(x_train, y_train)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name                      | Type              | Params
----------------------------------------------------------------
0 | neural_network            | MLP               | 225 K 
1 | loss_fun                  | BCEWithLogitsLoss | 0     
2 | train_metrics_module_dict | ModuleDict        | 0     
3 | eval_metrics_module_dict  | ModuleDict        | 0     
4 | test_metrics_module_dict  | ModuleDict        | 0     
----------------------------------------------------------------
225 K     Trainable params
0         Non-trainable params
225 K     Total params
0.900     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:   3%|▎         | 2/79 [00:00<00:07, 10.69it/s]

  rank_zero_warn(


Epoch 1: 100%|██████████| 79/79 [00:00<00:00, 88.94it/s, train_loss=0.674]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 79/79 [00:00<00:00, 88.71it/s, train_loss=0.674]


# Get predictions on the test set

In [13]:
# Predict on the test split's texts with the freshly trained model.
predictions = model.predict(dataset["test"]["text"])

  rank_zero_warn(


Predicting DataLoader 0: 100%|██████████| 98/98 [00:00<00:00, 218.38it/s]


# Calculate the accuracy score

In [14]:
# Compare the predictions against the test split's labels with scikit-learn's accuracy_score.
accuracy_score(dataset["test"]["label"], predictions)

0.80188

# Saving the model

In [15]:
# Persist the trained classifier under models/nn_classifier.
# NOTE(review): judging by its name, this variant omits the tokenizer's stop
# words from the saved artefact - confirm against the mim_nlp implementation.
model.save_without_stop_words("models/nn_classifier")

# Loading the model

In [16]:
# Load a classifier from the path used when saving, targeting the CPU.
model_loaded = NNClassifier.load("models/nn_classifier", device="cpu")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
# Predict with the *reloaded* model (not the in-memory `model`), so that the
# equality check against `predictions` actually exercises the save/load
# round-trip.
predictions_from_loaded = model_loaded.predict(dataset["test"]["text"])

  rank_zero_warn(


Predicting DataLoader 0: 100%|██████████| 98/98 [00:00<00:00, 217.53it/s]


In [18]:
# Sanity check: the two prediction arrays must be element-wise identical.
assert array_equal(predictions, predictions_from_loaded)

In [19]:
# Evaluate the reloaded model on the test split with two metrics: the custom
# `accuracy` function and torchmetrics' Precision configured for a binary task.
model_loaded.test(
    dataset["test"]["text"],
    dataset["test"]["label"],
    test_metrics_dict={"acc": accuracy, "precision": Precision(task="binary")},
)

  rank_zero_warn(


Testing DataLoader 0:  29%|██▊       | 28/98 [00:00<00:00, 145.41it/s]

  rank_zero_warn(
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 98/98 [00:00<00:00, 128.30it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
           acc              0.8018800020217896
        precision           0.9014788866043091
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


{'precision': 0.9014788866043091, 'acc': 0.8018800020217896}