In [1]:
import re
from typing import List

import bz2
import numpy as np
import nltk
nltk.download('punkt') # At first you have to download these nltk packages.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (
    AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Mlflow
import mlflow
import mlflow.pytorch

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luisg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luisg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\luisg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


# Load dataset

In [2]:
import os
print(os.listdir("../data"))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


In [3]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts
train_labels, train_texts = get_labels_and_texts('../data/train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('../data/test.ft.txt.bz2')

In [4]:
train_labels, train_texts = train_labels[:10000], train_texts[:10000]
test_labels, test_texts = test_labels[:2000], test_texts[:2000]

# Data preprocessing

In [5]:
stop_words = stopwords.words('english') # defining stop_words
stop_words.remove('not') # removing not from the stop_words list as it contains value in negative movies
lemmatizer = WordNetLemmatizer()

In [6]:
def data_preprocessing(review):

  # data cleaning
  review = re.sub(re.compile('<.*?>'), '', review) #removing html tags
  review =  re.sub('[^A-Za-z0-9]+', ' ', review) #taking only words
  review = re.sub(r"http\S+", "",review) #Removing URLs 
  
  # lowercase
  review = review.lower()
  
  # tokenization
  tokens = nltk.word_tokenize(review) # converts review to tokens
  
  # stop_words removal
  review = [word for word in tokens if word not in stop_words] #removing stop words
  
  # lemmatization
  # review = [lemmatizer.lemmatize(word) for word in review]
  
  # join words in preprocessed review
  review = ' '.join(review)
  
  emoji_pattern = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags=re.UNICODE)
  review = emoji_pattern.sub(r'', review) #Removing emojis
  
  return review


def get_max_sentence_length(sentences_list: List, tokenizer):
    max_len = 0

    # For every sentence...
    for sent in sentences_list:

        # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
        input_ids = tokenizer.encode(sent, add_special_tokens=True)

        # Update the maximum sentence length.
        max_len = max(max_len, len(input_ids))

    print('Max sentence length: ', max_len)
    
    return max_len


def bert_preprocessing(sentences_list: List, tokenizer, max_length, labels):
    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in sentences_list:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_length,           # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                    )

        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Print sentence 0, now as a list of IDs.
    # print('Original: ', sentences_list[0])
    # print('Token IDs:', input_ids[0])

    return input_ids, attention_masks, labels


def get_data_loaders(input_ids, attention_masks, labels, input_ids_test, attention_masks_test, labels_test):
    val_ratio = 0.2
    # Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
    batch_size = 32

    # Indices of the train and validation splits stratified by labels
    train_idx, val_idx = train_test_split(
        np.arange(len(labels)),
        test_size = val_ratio,
        shuffle = True,
        stratify = labels)

    # Train and validation sets
    train_set = TensorDataset(input_ids[train_idx], 
                            attention_masks[train_idx], 
                            labels[train_idx])

    val_set = TensorDataset(input_ids[val_idx], 
                            attention_masks[val_idx], 
                            labels[val_idx])

    test_set = TensorDataset(input_ids_test, 
                            attention_masks_test, 
                            labels_test)

    # Prepare DataLoader
    train_dataloader = DataLoader(
                train_set,
                sampler = RandomSampler(train_set),
                batch_size = batch_size
            )

    validation_dataloader = DataLoader(
                val_set,
                sampler = SequentialSampler(val_set),
                batch_size = batch_size
            )

    test_dataloader = DataLoader(
                test_set,
                sampler = SequentialSampler(test_set),
                batch_size = batch_size
            )

    print('{:>5,} training samples'.format(len(train_idx)))
    print('{:>5,} validation samples'.format(len(val_idx)))
    print('{:>5,} test samples'.format(len(input_ids_test)))

    return train_dataloader, validation_dataloader, test_dataloader

In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
prep_train_texts = [data_preprocessing(text) for text in train_texts]
max_sentence_length = get_max_sentence_length(prep_train_texts, tokenizer)
input_ids, attention_masks, labels = bert_preprocessing(prep_train_texts, tokenizer, max_sentence_length, train_labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Max sentence length:  267




In [9]:
prep_test_texts = [data_preprocessing(text) for text in test_texts]
input_ids_test, attention_masks_test, labels_test = bert_preprocessing(prep_test_texts, tokenizer, max_sentence_length, test_labels)

In [10]:
train_dataloader, validation_dataloader, test_dataloader = get_data_loaders(input_ids, attention_masks, labels, input_ids_test, attention_masks_test, labels_test)

8,000 training samples
2,000 validation samples
2,000 test samples


In [11]:
# print(' Original: ', prep_test_texts[0])

# # Print the sentence split into tokens.
# print('Tokenized: ', tokenizer.tokenize(prep_test_texts[0]))

# # Print the sentence mapped to token ids.
# print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(prep_test_texts[0])))

# Load BERT model

In [12]:
# model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    
)

model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

# Model training

In [14]:
import time
import datetime
import numpy as np


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [15]:
train_on_gpu = torch.cuda.is_available()

In [18]:
def train_test_model(train_dataloader, validation_dataloader, test_dataloader, model, params):
    
    with mlflow.start_run(run_name=params['run_name']) as run:
        
        mlflow.log_params(params)
        if train_on_gpu:
            model.cuda()

        optimizer = AdamW(model.parameters(),
                        lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                        eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                        )

        # Number of training epochs. The BERT authors recommend between 2 and 4.
        epochs = params['n_epochs']

        # Total number of training steps is [number of batches] x [number of epochs].
        total_steps = len(train_dataloader) * epochs

        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps = 0, # Default value in run_glue.py
                                                    num_training_steps = total_steps)

        training_stats = []

        # Measure the total training time for the whole run.
        total_t0 = time.time()

        valid_loss_min = np.Inf # track change in validation loss

        # For each epoch...
        for epoch_i in range(0, epochs):

            # ========================================
            #               Training
            # ========================================

            # Perform one full pass over the training set.

            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')

            # Measure how long the training epoch takes.
            t0 = time.time()

            # Reset the total loss for this epoch.
            total_train_loss = 0
            total_eval_loss = 0

            model.train()

            # For each batch of training data...
            for batch in train_dataloader:
                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
                b_input_ids = batch[0].type(torch.LongTensor).to(device)
                b_input_mask = batch[1].type(torch.LongTensor).to(device)
                b_labels = batch[2].type(torch.LongTensor).to(device)

                model.zero_grad()

                output = model(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels
                )

                total_train_loss += output.loss.item()
                output.loss.backward()

                # Clip the norm of the gradients to 1.0.
                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                scheduler.step()

            # Measure how long this epoch took.
            training_time = format_time(time.time() - t0)

            print("")
            print("  Training epoch took: {:}".format(training_time))

            # ========================================
            #               Validation
            # ========================================
            # After the completion of each training epoch, measure our performance on
            # our validation set.

            print("")
            print("Running Validation...")

            t0 = time.time()

            model.eval()

            # Evaluate data for one epoch
            for batch in validation_dataloader:
                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
                b_input_ids = batch[0].type(torch.LongTensor).to(device)
                b_input_mask = batch[1].type(torch.LongTensor).to(device)
                b_labels = batch[2].type(torch.LongTensor).to(device)

                # Tell pytorch not to bother with constructing the compute graph during
                # the forward pass, since this is only needed for backprop (training).
                with torch.no_grad():
                    output = model(
                        b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels
                    )

                # Accumulate the validation loss.
                total_eval_loss += output.loss.item()

            # Calculate the average loss over all of the batches.
            avg_train_loss = total_train_loss / len(train_dataloader)
            avg_val_loss = total_eval_loss / len(validation_dataloader)

            # Measure how long the validation run took.
            validation_time = format_time(time.time() - t0)

            # print training/validation statistics 
            print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \t\Time: {:}'.format(
                epoch_i, avg_train_loss, avg_val_loss, validation_time))

            mlflow.log_metric("train_loss", avg_train_loss, step=epoch_i + 1)
            mlflow.log_metric("valid_loss", avg_val_loss, step=epoch_i + 1)
    
            # save model if validation loss has decreased
            if avg_val_loss <= valid_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_loss_min,
                avg_val_loss))
                output_dir = '../models/bert'
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                valid_loss_min = avg_val_loss


        # test
        # track test loss

        # ========================================
        #               Testing
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Testing...")

        total_test_loss = 0.0

        class_correct = list(0. for i in range(len(params['class_names'])))
        class_total = list(0. for i in range(len(params['class_names'])))
        
        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Evaluate data for one epoch
        for idx, batch in enumerate(test_dataloader):
            
            b_input_ids = batch[0].type(torch.LongTensor).to(device)
            b_input_mask = batch[1].type(torch.LongTensor).to(device)
            b_labels = batch[2].type(torch.LongTensor).to(device)

            with torch.no_grad():
                output = model(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels
                )

            # Accumulate the validation loss.
            total_test_loss += output.loss.item()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            result = torch.argmax(output.logits, 1)
            correct_tensor = result.eq(b_labels.data.view_as(result))
            correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
            if idx == 0:
                total_ground_truth = b_labels.data
                total_pred = result
            else:
                total_ground_truth = torch.cat((total_ground_truth, b_labels.data), 0)
                total_pred = torch.cat((total_pred, result), 0)

            # calculate test accuracy for each object class
            for i in range(len(b_labels)): ## TODO
                label = b_labels.data[i]
                class_correct[label] += correct[i].item()
                class_total[label] += 1

        # Calculate the average loss over all of the batches.
        avg_test_loss = total_test_loss / len(test_dataloader)
        print('Test Loss: {:.6f}\n'.format(avg_test_loss))
        mlflow.log_metric("avg_test_loss", avg_test_loss)

        
        for i in range(len(params['class_names'])):
            if class_total[i] > 0:
                class_accuracy = 100 * class_correct[i] / class_total[i]
                print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
                    params['class_names'][i], class_accuracy,
                    np.sum(class_correct[i]), np.sum(class_total[i])))
                mlflow.log_metric(f"test_accuracy_{params['class_names'][i]}", class_accuracy)
            else:
                print('Test Accuracy of %5s: N/A (no training examples)' % (params['class_names'][i]))

        # test_accuracy = 100. * np.sum(class_correct) / np.sum(class_total)
        # print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
        #     test_accuracy,
        #     np.sum(class_correct), np.sum(class_total)))
        # mlflow.log_metric("test_accuracy_overall", test_accuracy)

        accuracy_score_number = accuracy_score(total_ground_truth.cpu(), total_pred.cpu())
        recall_score_number = recall_score(total_ground_truth.cpu(), total_pred.cpu())
        precision_score_number = precision_score(total_ground_truth.cpu(), total_pred.cpu())
        mlflow.log_metric("test_accuracy", accuracy_score_number)
        mlflow.log_metric("test_recall", recall_score_number)
        mlflow.log_metric("test_precision", precision_score_number)

        print('\nTest Accuracy (Overall): %2d%%' % (100 * accuracy_score_number))
        print('\nTest Recall (Overall): %2d%%' % (100 * recall_score_number))
        print('\nTest precision (Overall): %2d%%' % (100 * precision_score_number))

        # Record all statistics from this epoch.
        # training_stats.append(
        #     {
        #         'epoch': epoch_i + 1,
        #         'Training Loss': avg_train_loss,
        #         'Valid. Loss': avg_val_loss,
        #         'Valid. Accur.': accuracy_score_number,
        #         'Training Time': training_time,
        #         'Validation Time': validation_time
        #     }
        # )

        print("")
        print("Training complete!")

        print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

In [21]:
def main_flow(run_name) -> None:
    # Set mlflow
    TRACKING_SERVER_HOST = "localhost"
    mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
    mlflow.set_experiment("Sentiment_Analisys")

    # Feature engineering step
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    prep_train_texts = [data_preprocessing(text) for text in train_texts]
    max_sentence_length = get_max_sentence_length(prep_train_texts, tokenizer)
    input_ids, attention_masks, labels = bert_preprocessing(prep_train_texts, tokenizer, max_sentence_length, train_labels)
    
    prep_test_texts = [data_preprocessing(text) for text in test_texts]
    input_ids_test, attention_masks_test, labels_test = bert_preprocessing(prep_test_texts, tokenizer, max_sentence_length, test_labels)

    train_dataloader, validation_dataloader, test_dataloader = get_data_loaders(input_ids, attention_masks, labels, input_ids_test, attention_masks_test, labels_test)

    # Select model to train
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
        num_labels = 2, # The number of output labels--2 for binary classification.
                        # You can increase this for multi-class tasks.
        output_attentions = False, # Whether the model returns attentions weights.
        output_hidden_states = False, # Whether the model returns all hidden-states.
        
    )
    model.cuda()
    
    # Train model
    params = {
            "class_names": ['negative', 'positive'],
            "n_epochs": 2,
            "criterion": "CrossEntropyLoss",
            "optimizer": "Adam",
            "run_name": run_name
        }
    train_test_model(train_dataloader, validation_dataloader, test_dataloader, model, params)

In [22]:
main_flow(run_name='Prueba2_BERT')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Max sentence length:  267




8,000 training samples
2,000 validation samples
2,000 test samples


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

  Training epoch took: 0:03:08

Running Validation...
Epoch: 0 	Training Loss: 0.353920 	Validation Loss: 0.254892 	\Time: 0:00:21
Validation loss decreased (inf --> 0.254892).  Saving model ...

Running Testing...
Test Loss: 0.264410

Test Accuracy of negative: 88% (845/954)
Test Accuracy of positive: 89% (940/1046)

Test Accuracy (Overall): 89%

Test Recall (Overall): 89%

Test precision (Overall): 89%

Training...


In [None]:
mlflow.end_run()

In [63]:
output_dir = '../models/bert'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ../models/bert


('../models/bert\\tokenizer_config.json',
 '../models/bert\\special_tokens_map.json',
 '../models/bert\\vocab.txt',
 '../models/bert\\added_tokens.json',
 '../models/bert\\tokenizer.json')

In [46]:
new_model = AutoModelForSequenceClassification.from_pretrained(
    '../models/bert', # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    
)

new_tokenizer = AutoTokenizer.from_pretrained('../models/bert', do_lower_case=True)

new_model.cpu()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [47]:
test_text = 'It was good but couldve been better. Great'
test_text = 'It is the worst product I have ever bought'
test_text = data_preprocessing(test_text)
tokens = new_tokenizer.encode(test_text, return_tensors='pt')
result = new_model(tokens.cpu())
result = int(torch.argmax(result.logits))

In [48]:
result

0

In [63]:
test_text = 'It was good but couldve been better. Great'
test_text = 'It is the worst product I have ever bought'
test_text = data_preprocessing(test_text)
tokens = new_tokenizer.encode(test_text, return_tensors='pt')
result = new_model(tokens.cpu())
result

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.9501, -2.6430]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [71]:
torch.argmax(result.logits, 1)

tensor([0])

In [69]:
torch.max(result.logits)

tensor(2.9501, grad_fn=<MaxBackward1>)