In [1]:
%load_ext autoreload
%autoreload 2

import os

# Move the working directory upward until its path no longer contains "notebooks".
# NOTE(review): presumably so that relative paths used later resolve from the repository root - confirm.
while "notebooks" in os.getcwd():
    os.chdir(os.pardir)

import torch
import torch.nn as nn
from datasets import load_dataset
from numpy import array_equal
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error

from mim_nlp.regressor import NNRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Record the installed PyTorch version alongside the results.
torch.__version__

'1.8.1+cu101'

In [3]:
# Check that PyTorch can see a CUDA device; the regressor below is configured with device="cuda:0".
torch.cuda.is_available()

True

# Loading an open-source dataset

In [4]:
# Load the "imdb" dataset through the `datasets` library (a local cache is reused when present).
dataset = load_dataset("imdb")

Found cached dataset imdb (/home/portal/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 424.28it/s]


In [5]:
# Display the available splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

# Training the model on the dataset

In [6]:
# Build the TF-IDF featurizer over 1- to 3-grams and fit it on the training texts in one step
# (`fit` returns the fitted vectorizer itself).
tokenizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=0.01,
    max_df=0.5,
    ngram_range=(1, 3),
).fit(dataset["train"]["text"])

In [7]:
# The size of the fitted vocabulary is used below as the width of the network's input layer.
input_size = len(tokenizer.vocabulary_)
print(input_size)

3800


In [8]:
class MLP(nn.Module):
    """Single-hidden-layer perceptron that emits one regression value per sample.

    Architecture: dropout -> linear -> ReLU -> dropout -> linear (1 output).

    Args:
        in_features: Width of the input vectors. When ``None`` (the default),
            the notebook-level ``input_size`` variable is used, so a bare
            ``MLP()`` call behaves exactly as before.
        hidden_size: Width of the hidden layer.
        dropout: Dropout probability applied to the inputs and to the hidden
            activations.
    """

    def __init__(self, in_features=None, hidden_size=64, dropout=0.2):
        super().__init__()

        if in_features is None:
            # Backward-compatible fallback to the global computed earlier in the notebook.
            in_features = input_size

        # Attribute names are kept unchanged so existing state dicts still load.
        self.dropout = nn.Dropout(dropout)
        self.dense = nn.Linear(in_features, hidden_size)
        self.relu = nn.ReLU()
        self.out_proj = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run the network on ``x``, whose last dimension must equal ``in_features``.

        Returns a tensor with the same leading dimensions and a last dimension of 1.
        """
        x = self.dropout(x)
        x = self.dense(x)
        x = self.relu(x)
        # The same dropout module is reused for the hidden activations.
        x = self.dropout(x)
        x = self.out_proj(x)

        return x

In [9]:
model = NNRegressor(
    # Text featurizer and the network that consumes its output.
    tokenizer=tokenizer,
    input_size=input_size,
    neural_network=MLP(),
    # Optimisation hyperparameters.
    epochs=8,
    batch_size=4,
    learning_rate=1e-3,
    # Run on a single CUDA device.
    device="cuda:0",
    many_gpus=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
# Train on the raw training texts, using the `label` column as the regression target.
model.fit(dataset["train"]["text"], dataset["train"]["label"])

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name                      | Type       | Params
---------------------------------------------------------
0 | neural_network            | MLP        | 243 K 
1 | loss_fun                  | MSELoss    | 0     
2 | train_metrics_module_dict | ModuleDict | 0     
3 | eval_metrics_module_dict  | ModuleDict | 0     
---------------------------------------------------------
243 K     Trainable params
0         Non-trainable params
243 K     Total params
0.973     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 7: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6250/6250 [05:17<00:00, 19.65it/s, loss=0.00131, train_loss=0.0047]


# Get predictions on the test set

In [11]:
# Predict a value for every text in the test split.
predictions = model.predict(dataset["test"]["text"])

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  rank_zero_warn(


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6250/6250 [00:10<?, ?it/s]


# Calculate the mean squared error

In [12]:
# Mean squared error between the test labels and the regressor's predictions.
mean_squared_error(dataset["test"]["label"], predictions)

0.5049941901199979

# Saving the model

In [13]:
# Persist the trained regressor under models/nn_regressor (relative to the working directory).
# NOTE(review): presumably this variant drops the vectorizer's stop-word data to shrink the artifact - confirm in mim_nlp docs.
model.save_without_stop_words("models/nn_regressor")

# Loading the model

In [14]:
# Restore a regressor from the directory written by the save step.
model_loaded = NNRegressor.load("models/nn_regressor")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
# Predict with the *reloaded* model; using `model` here would make the equality check below
# compare the original model with itself and never exercise the save/load round trip.
predictions_from_loaded = model_loaded.predict(dataset["test"]["text"])

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  rank_zero_warn(


Predicting DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6250/6250 [00:10<?, ?it/s]


In [16]:
# Both prediction arrays must be element-wise identical.
assert array_equal(predictions, predictions_from_loaded)