In [1]:
import transformers
from torch.utils.data import DataLoader,Dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification,BertModel,BertForSequenceClassification,BertConfig
import torch
from torch import cuda
import numpy as np
from sklearn import metrics
from tqdm import tqdm 
import torch
import pandas as pd 


In [2]:
from sklearn.metrics import accuracy_score,f1_score

In [3]:

task = "multiclass"
model_name = "dccuchile/bert-base-spanish-wwm-cased"#"finiteautomata/beto-sentiment-analysis"#"dccuchile/bert-base-spanish-wwm-cased"
n_example = 1000
model_path = "hg_model" + "_" + task

device = 'cuda' if cuda.is_available() else 'cpu'
n_examples = [100,500,1000]

train_acc_list = []
train_loss_list = []
epoch_list = []
n_examples_list = []



## Intro

In this script we will fine-tune a text-clasiffier model (Multilabel/Multiclass), here we are given a pice of text/sentence/document needs to be classifed in one or more categories(multilabel) o one catgory (multiclass)

## Data

The base dataset is compose by four columns

* idTask : Identity Code
* task content 1 : Title of the article
* idTag : Identity Code
* tag : one of the diferent label/category


* Tags:

     * sociedad
     * deportes 
     * politica 
     * economia
     * clickbait
     * cultura
     * medio_ambiente
     * ciencia_tecnologia
     * educacion
     * opinion



We will use just two rows "task content 1"  and "tag", the "tag" column has to be change to a one-hot vector.

Lets say that the label/class of a element is "deporte"  the model needs numeric data so it can interprete the information provided, so instead of a string we use this form type of vector: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

## Load Data

The main objective of this function are:

* Import the file in a dataframe and give it the headers as per the documentation.
* Taking the values of all the categories and coverting it into a list.
* The list is appened as a new column and other columns are removed. 

In [4]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder

def load_data(file_name,nrows,task):

    data_raw = pd.read_csv(file_name,sep = ",")

    data = data_raw.iloc[:,[1,3]]

    data.columns = ['text','tag']

    data.dropna(inplace = True)

    if task == "multiclass":

        le = LabelEncoder()

        data['label'] = le.fit_transform(data['tag'])

        if nrows > data.shape[0]:

            nrows = data.shape[0]

        data = data.sample(frac = nrows/data.shape[0])

        return data.loc[:,['text','label']],dict(zip(le.classes_,le.transform(le.classes_)))


    elif task == "multilabel":

        data['label'] = [list((row[1].values))for  row in pd.get_dummies(data['tag']).iterrows()]

        #aux = data.drop_duplicates(['tag','label'],keep = 'first')

        if nrows > data.shape[0]:

            nrows = data.shape[0]

        data = data.sample(frac = nrows/data.shape[0])

        return data.loc[:,['text','label']]#,aux

In [5]:
out_data = load_data('data.csv',n_example,task)



if isinstance(out_data,tuple):
    
    data,classes = out_data

    for key in classes.keys():

        classes[key] = int(classes[key])

    config = BertConfig.from_pretrained(
    
    model_name,
    num_labels = len(classes),
    label2id = classes,
    id2label = dict(zip(classes.values(),classes.keys()))
    )

    

elif isinstance(out_data,pd.DataFrame):

    data = out_data
    classes = data.iloc[0,1]

    config = BertConfig.from_pretrained(
    
    model_name)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [6]:
classes

{'ciencia_tecnologia': 0,
 'clickbait': 1,
 'cultura': 2,
 'deportes': 3,
 'economia': 4,
 'educacion': 5,
 'medio_ambiente': 6,
 'opinion': 7,
 'politica': 8,
 'sociedad': 9}

## Hyperparameters

We select some hyperparameters for train the model

In [7]:
n_classes = len(classes)

MAX_LEN = 128

TRAIN_BATCH_SIZE = 8

VALID_BATCH_SIZE = 4

EPOCHS = 5

LEARNING_RATE = 1e-05

device = 'cuda' if cuda.is_available() else 'cpu'

## Tokenizer and Model Selection

We select the tokenizer and the model structure using the function from_pretained() and a model to train, here we will define the tokenizer because is necessary for creating the Pytorch Datset, we will define the model further the script

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)


## Split Data

Split the data in train and validation dataset, the arguments are the  process dataframe  and the size of the test set

In [9]:
def split_data(pandas_df,train_size):

    if ((train_size > 0) & (train_size <=1)):

        pass

    elif train_size > 1:

        train_size = train_size/pandas_df.shape[0]

    train_set = pandas_df.sample(frac = train_size,random_state = 42)

    test_set = pandas_df.drop(train_set.index).reset_index(drop = True)

    train_set = train_set.reset_index(drop = True)

    return train_set,test_set

In [10]:
train_dataset,test_dataset = split_data(data,0.8)

## Dataset/DataLoader

We need to create a dataset that fits our needs, it's known that the deep learning models can't process raw text, so we need to pre-process the text before to send it to the neural network, also we will define a Dataloader to feed the data in bathches for training and processing 

Pytorch Dataset and Dataloader allow us to defining and controlling the data pre-processing and its passage to neural network.

## Dataset

* We will define a python class called CustomDataset, is defined to accept a list/Series/arrey of texts and labels, a tokenizer. 

* We will use a Bert tokenizer to encode out text data

* The tokenizer uses the encode_plus method to perform tokenization and generate the necessary outputs, namely: ids, attention_mask, token_type_ids

In [11]:
class DatasetMCLass(Dataset):

    def __init__(self, titles, targets, tokenizer, max_len):

      self.titles = titles
      self.targets = targets
      self.tokenizer = tokenizer
      self.max_len = max_len

    def __len__(self):

      return len(self.titles)

    def __getitem__(self, index):

      title = str(self.titles[index])

      target = self.targets[index]

      encoding = self.tokenizer.encode_plus(
        title,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=True,
        pad_to_max_length=True,
        return_attention_mask=True)

      input_ids = encoding['input_ids']

      attention_mask = encoding['attention_mask']

      token_type_ids = encoding['token_type_ids']

      return {
        'review_text': title,
        'input_ids': torch.tensor(input_ids, dtype = torch.long),
        'attention_mask': torch.tensor(attention_mask, dtype = torch.long),
        'targets': torch.tensor(target, dtype=torch.long),
        'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
    }

class DatasetMLabel(Dataset):

    def __init__(self, titles, targets, tokenizer, max_len):

      self.titles = titles
      self.targets = targets
      self.tokenizer = tokenizer
      self.max_len = max_len

    def __len__(self):

      return len(self.titles)

    def __getitem__(self, item):

      title = str(self.titles[item])

      target = self.targets[item]

      encoding = self.tokenizer.encode_plus(
        title,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=True,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
        
      )
      return {
        'review_text': title,
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'targets': torch.tensor(target, dtype=torch.float),
        'token_type_ids': encoding['token_type_ids'].flatten()
    }

dict_DL = {'multiclass':DatasetMCLass,
          'multilabel':DatasetMLabel}

In [12]:
data_loader = dict_DL[task]

In [13]:
training_set = data_loader(train_dataset['text'],train_dataset['label'], tokenizer, MAX_LEN)

testing_set = data_loader(test_dataset['text'],test_dataset['label'], tokenizer, MAX_LEN)


## DataLoader

* Dataloader is used to for creating training and validation dataloader that load data to the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded to the memory at once, hence the amount of dataloaded to the memory and then passed to the neural network needs to be controlled.

* This control is achieved using the parameters such as batch_size and max_len.

* Training and Validation dataloaders are used in the training and validation part of the flow respectively

In [14]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Neural Network Model

* This neural network will use a BERTClass

* It will be composed by a bert model, followed by a Droput Layer (to avoid overfitting) and a linear layer.

* The output_1 is passed to the droput layer and the to the linear layer.

* The number of output dimensions is the same as the classes/categories.

* Final layer outputs is what will be used to calcuate 
the loss and to determine the accuracy of models prediction

* We will initiate an instance of the network called model. This instance will be used for training and then to save the final trained model for future inference.

* The Class take the parameter model_name

In [15]:
class BERTClass(torch.nn.Module):
    def __init__(self,model_name,config,out_classes):

        super(BERTClass, self).__init__()

        self.config = config
        self.out_classes = out_classes
        self.model_name = model_name

        self.l1 = BertModel.from_pretrained(self.model_name ,config = self.config)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768,self.out_classes )

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)#,token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

def model_selection(task,model_name,config,out_classes):
    

    dict_model = {'multilabel': BERTClass(model_name, config,out_classes),
                'multiclass':BERTClass(model_name, config,out_classes)}#from_pretrained(model_name, config =  config)}

    return dict_model[task]

        


## Loss Function

* As defined above, the loss function used will be a combination of Binary Cross Entropy which is implemented as BCELogits Loss in PyTorch in case we eÂ¡want to do a multilabel classification, if we want to do TEXT CLASIFFICATION we should use CrossEntropyLoss

In [16]:
def loss_fn_mlabel(output,targets):


        return torch.nn.BCEWithLogitsLoss()(output,targets)


dict_loss_fn = {'multiclass':torch.nn.CrossEntropyLoss(),
                'multilabel':torch.nn.BCEWithLogitsLoss()}

In [19]:
model = BertForSequenceClassification.from_pretrained( "dccuchile/bert-base-spanish-wwm-cased",config = config)#,id2label = id2label,label2id = classes)#model_selection(task,model_name,config,len(classes))

model.to(device)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchi

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [20]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)


## Model Fine-Tune

Our train function trains the modle on the training set a number of times (EPOCH), each epoch is how many time complete data will be passed through the network

* The dataloader passes data to the model based on the batch size.

* Subsequent output from the model and the actual category are compared to calculate the loss.

* Loss value is used to optimize the weights of the neurons in the network.

* After every 10 steps the loss value is printed in the console.

In [21]:
def calcuate_accu_label(big_idx, targets):

    n_correct = (big_idx==np.argmax(targets)).sum().item()
    
    return n_correct

def calcuate_accu_class(big_idx, targets):

    n_correct = (big_idx==targets).sum().item()
    
    return n_correct

class Train_Eval():

    def __init__(self,epoch,model,training_loader,device,optimizer,loss_fn,file_name):

        self.epoch = epoch
        self.model = model
        self.training_loader = training_loader
        self.device = device
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.file_name = file_name
        
    def train_mclass(epoch,model,training_loader,device,optimizer,loss_fn):
        
        model.train()
        
        tr_loss = 0
        n_correct = 0
        nb_tr_steps = 0
        nb_tr_examples = 0
        loss_function = loss_fn
  
        for _,batch in enumerate(training_loader, 0):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(ids, mask)['logits']#.squeeze()

            preds = torch.tensor(torch.argmax(outputs, axis=-1))

            loss = loss_function(outputs, targets)

            tr_loss += loss.item()

            #big_val, big_idx = torch.max(outputs.data, dim=1)

            n_correct += calcuate_accu_class(preds, targets)

            nb_tr_steps += 1

            nb_tr_examples+=targets.size(0)

            if _% 10 ==0:

                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples 
                print(f'Epoch: {epoch}, Step: {_} ,Loss:  {loss.item()},Acc: {accu_step}')

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        #print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
        epoch_loss = tr_loss/nb_tr_steps
        epoch_accu = (n_correct*100)/nb_tr_examples

        return epoch_accu,epoch_loss


    def validation_mclass(epoch,model,testing_loader,device,optimizer,loss_fn):

        model.eval()
        
        fin_targets=[]
        
        fin_outputs=[]
        
        with torch.no_grad():
        
            for _, batch in enumerate(testing_loader, 0):
        
                ids = batch['input_ids'].to(device, dtype = torch.long)
        
                mask = batch['attention_mask'].to(device, dtype = torch.long)
        
                token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        
                targets = batch['targets'].to(device, dtype = torch.long)
        
                outputs = model(ids, mask)['logits']#.squeeze()
        
                fin_targets.extend(targets.cpu().detach().numpy().tolist())
        
                fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        
        return fin_outputs, fin_targets


    def train_mlabel(epoch,model,training_loader,device,optimizer,loss_fn):
        
        model.train()
        tr_loss = 0
        n_correct = 0
        nb_tr_steps = 0
        nb_tr_examples = 0
        
        for _,batch in enumerate(training_loader, 0):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask)['logits'].squeeze()
            loss = loss_fn(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu_label(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            optimizer.zero_grad()

            

            if _%10 == 0:

                print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        epoch_loss = tr_loss/nb_tr_steps
        epoch_accu = (n_correct*100)/nb_tr_examples

        return epoch_accu,epoch_loss

    def validation_mlabel(epoch,model,testing_loader,device,optimizer,loss_fn):

        model.eval()
        
        fin_targets=[]
        
        fin_outputs=[]
        
        with torch.no_grad():
        
            for _, batch in enumerate(testing_loader, 0):
        
                ids = batch['input_ids'].to(device, dtype = torch.long)
        
                mask = batch['attention_mask'].to(device, dtype = torch.long)
        
                token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
        
                targets = batch['targets'].to(device, dtype = torch.float)
        
                outputs = model(ids, mask)#, token_type_ids)
        
                fin_targets.extend(targets.cpu().detach().numpy().tolist())
        
                fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        
        return fin_outputs, fin_targets

dic_train_val = {'multiclass':[Train_Eval.train_mclass,Train_Eval.validation_mclass],
                'multilabel':[Train_Eval.train_mlabel,Train_Eval.validation_mlabel]}

In [22]:

train,validation = dic_train_val[task]

loss_fn = dict_loss_fn[task]

In [23]:
for epoch in range((EPOCHS)):

        acc,loss = train(epoch = epoch, model = model,training_loader = training_loader, device = device,optimizer = optimizer,loss_fn = loss_fn)

        train_acc_list.append(acc)
        train_loss_list.append(loss)
        epoch_list.append(epoch)
        n_examples_list.append(n_example)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  preds = torch.tensor(torch.argmax(outputs, axis=-1))


Epoch: 0, Step: 0 ,Loss:  2.3572640419006348,Acc: 12.5
Epoch: 0, Step: 10 ,Loss:  2.2257213592529297,Acc: 19.318181818181817
Epoch: 0, Step: 20 ,Loss:  2.186922550201416,Acc: 20.238095238095237
Epoch: 0, Step: 30 ,Loss:  2.0743865966796875,Acc: 22.177419354838708
Epoch: 0, Step: 40 ,Loss:  2.2242164611816406,Acc: 21.951219512195124
Epoch: 0, Step: 50 ,Loss:  2.2274622917175293,Acc: 24.264705882352942
Epoch: 0, Step: 60 ,Loss:  1.8506919145584106,Acc: 27.25409836065574
Epoch: 0, Step: 70 ,Loss:  1.7519422769546509,Acc: 31.161971830985916
Epoch: 0, Step: 80 ,Loss:  1.1332201957702637,Acc: 33.641975308641975
Epoch: 0, Step: 90 ,Loss:  1.3150215148925781,Acc: 34.535367545076284
Epoch: 1, Step: 0 ,Loss:  1.303417444229126,Acc: 75.0
Epoch: 1, Step: 10 ,Loss:  1.6474583148956299,Acc: 51.13636363636363
Epoch: 1, Step: 20 ,Loss:  1.4207819700241089,Acc: 52.38095238095238
Epoch: 1, Step: 30 ,Loss:  1.5574790239334106,Acc: 51.61290322580645
Epoch: 1, Step: 40 ,Loss:  1.3229025602340698,Acc: 55.79

In [24]:
model_path

'hg_model_multiclass'

In [25]:
if not os.path.exists(model_path):

    os.makedirs(model_path )

#model.save_pretrained( model_path )

#tokenizer.save_pretrained(model_path )

In [26]:
tokenizer.save_pretrained(model_path )

('hg_model_multiclass/tokenizer_config.json',
 'hg_model_multiclass/special_tokens_map.json',
 'hg_model_multiclass/vocab.txt',
 'hg_model_multiclass/added_tokens.json',
 'hg_model_multiclass/tokenizer.json')

In [27]:
model.save_pretrained(model_path)

We save the model in the HuggingFace format

## Validation 

During the validation stage we pass the unseen data(Testing Dataset) to the model. This step determines how good the model performs on the unseen data.

This unseen data is the 20% of train.csv which was seperated during the Dataset creation stage. During the validation stage the weights of the model are not updated. Only the final output is compared to the actual value. This comparison is then used to calcuate the accuracy of the model.

As defined above to get a measure of our models performance we are using the following metrics.

* Accuracy Score
* F1 Micro
* F1 Macro


In [28]:
outputs, targets = validation(epoch,model,testing_loader,device,optimizer,loss_fn)

#outputs = np.array(outputs) >= 0.5

outputs = np.argmax(outputs, axis=-1)

if task == "multilabel":

    targets = np.array([np.argmax(target) for target in targets])

accuracy = accuracy_score(targets, outputs)

f1_score_micro = f1_score(targets, outputs, average='micro')

f1_score_macro = f1_score(targets, outputs, average='macro')

print(targets,outputs)

print(f"Accuracy Score = {accuracy}")

print(f"F1 Score (Micro) = {f1_score_micro}")

print(f"F1 Score (Macro) = {f1_score_macro}")

print(f"Finalizo el entranmiento del epoch {epoch} usando {n_example} de entrnamineto")


[9, 1, 9, 4, 9, 3, 9, 9, 3, 3, 1, 9, 9, 6, 1, 1, 9, 6, 9, 1, 9, 0, 9, 9, 9, 1, 9, 8, 9, 5, 6, 9, 9, 8, 9, 9, 3, 3, 8, 3, 1, 0, 0, 3, 1, 9, 1, 8, 4, 1, 3, 6, 8, 1, 3, 9, 3, 8, 9, 3, 8, 5, 4, 9, 9, 9, 5, 1, 3, 8, 8, 4, 8, 4, 9, 9, 8, 1, 1, 1, 9, 8, 8, 4, 9, 3, 1, 9, 9, 9, 4, 1, 8, 9, 4, 1, 2, 3, 9, 8, 5, 8, 8, 4, 9, 9, 1, 3, 3, 1, 3, 3, 9, 8, 1, 4, 8, 8, 9, 3, 1, 2, 1, 9, 3, 3, 2, 8, 3, 4, 9, 3, 5, 4, 9, 2, 9, 7, 3, 1, 3, 1, 1, 6, 3, 8, 2, 3, 3, 9, 5, 9, 3, 2, 8, 5, 1, 2, 8, 4, 1, 4, 1, 4, 4, 1, 3, 4, 8, 3, 2, 1, 9, 2, 8, 1, 4, 9, 3, 9] [9 6 9 4 6 3 4 9 3 3 3 8 9 4 1 9 8 4 9 1 9 4 1 9 8 1 8 4 3 8 6 9 9 8 9 9 3
 3 8 3 4 4 0 9 1 3 4 8 4 9 3 6 8 1 3 9 3 9 8 3 8 5 4 9 9 1 5 1 3 8 8 4 8 4
 9 8 8 1 1 1 8 8 8 3 1 3 1 9 3 9 4 2 8 4 4 1 3 2 9 8 5 8 8 4 9 8 1 8 3 1 3
 3 4 4 1 4 4 8 9 3 1 1 1 4 3 8 2 8 3 4 4 3 4 4 9 4 4 9 3 9 8 1 1 8 1 8 8 3
 3 9 8 9 3 1 4 5 1 2 8 4 1 4 1 4 4 3 9 4 8 3 9 1 9 2 8 1 4 9 3 1]
Accuracy Score = 0.6722222222222223
F1 Score (Micro) = 0.6722222222222223
F1 Score (Macro) = 

In [29]:
pd.DataFrame(zip(n_examples_list,epoch_list,train_loss_list,train_acc_list),columns=['n_example','epoch','loss','acc'])



Unnamed: 0,n_example,epoch,loss,acc
0,1000,0,1.949754,34.535368
1,1000,1,1.323561,58.391123
2,1000,2,0.831583,77.115118
3,1000,3,0.496029,88.349515
4,1000,4,0.272954,94.868239


In [30]:
model_path 

'hg_model_multiclass'

In [43]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForSequenceClassification.from_pretrained(model_path)


texto = "El resultado fue excelente, el equipo jugo de gran manera y derroto a su rival en el segundo tiempo"

encoded_review = tokenizer.encode_plus(
  texto,
  max_length=MAX_LEN,
  add_special_tokens=True,
  #return_token_type_ids=False,
  pad_to_max_length=True,
  return_attention_mask=True,
  return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)
output = model(input_ids, attention_mask)
_, prediction = torch.max(output['logits'], dim=1)
print(f'Review text: {texto}')

print(f'Sentiment  : {model.config.id2label[prediction.detach().cpu().numpy()[0]]}')


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Review text: El resultado fue excelente, el equipo jugo de gran manera y derroto a su rival en el segundo tiempo
Sentiment  : deportes
