In [36]:
import datetime
import os
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.optim
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import  BertTokenizer
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

In [2]:
# Execution target shared by the model and every batch: 'cpu' or 'cuda'
run_on = 'cpu'
device = torch.device(run_on)

In [3]:
# Importing the data.
# The CSV location defaults to the original absolute path, but it can be
# overridden with the BETO_DATA_CSV environment variable so the notebook can
# run on another machine without editing this cell.
DATA_CSV = os.environ.get(
    'BETO_DATA_CSV',
    '/Users/mario/OneDrive/ITAM/4to semestre/Inv Aplicada II/Mario - Tesis/code/BETO/data/sample_results_text_clean.csv')
df = pd.read_csv(DATA_CSV)

In [4]:
# Load the dataset into a pandas dataframe.
reviews = df['text']
sentiment = df['label'] # 1: Pro // 0: Anti 

In [5]:
# Split dataset: 30% held out for validation, stratified on the label,
# with a fixed random_state so the split is repeatable
X_train, X_val, y_train, y_val = train_test_split(
    reviews,
    sentiment,
    test_size=0.3,
    stratify=sentiment,
    random_state=42,
)

In [6]:
# Report dataset lengths
print(f'Training set length : {len(X_train)}')
print(f'Validation set length : {len(X_val)}')

Training set length : 14
Validation set length : 6


In [7]:
# Tokenization: load the tokenizer from the local checkpoint directory,
# with lower-casing enabled
tokenizer = BertTokenizer.from_pretrained(
    "../pytorch/",
    do_lower_case=True,
)

In [8]:
def preprocessing(dataset):
    """Tokenize every document in ``dataset`` into rectangular tensors.

    The documents are encoded in a single batched call to the notebook-level
    ``tokenizer``. Encoding them one at a time (as before) makes
    ``padding=True`` a no-op, because that option pads to the longest
    sequence *in the call*; documents with different token counts then
    produce ragged lists that ``torch.tensor`` cannot convert. Encoding the
    batch together pads every row to the longest document, after truncation
    to 512 tokens.

    Args:
        dataset: iterable of text documents (e.g. a pandas Series of strings).

    Returns:
        Tuple ``(input_ids, attention_mask)`` of tensors with one row per
        document. For an empty ``dataset`` both tensors are empty.
    """
    texts = list(dataset)
    if not texts:
        # Nothing to encode; mirror what building tensors from empty lists gives.
        return (torch.tensor([]), torch.tensor([]))
    encoded = tokenizer(texts,
                        add_special_tokens=True, max_length=512,
                        truncation=True, padding=True)
    return (torch.tensor(encoded['input_ids']),
            torch.tensor(encoded['attention_mask']))

In [11]:
# Peek at one training document.
# NOTE(review): presumably X_train still carries the integer row index of the
# original dataframe, in which case this is a lookup by index label 1 (not by
# position) and fails if that row landed in the validation split — confirm.
X_train[1]

'periodico la jornadamiercoles de enero de p la bancada de morena en la camara de diputados reformara el articulo bis de la ley general de responsabilidades administrativas para catalogar como grave que las dependencias federales cometan subejercicios carol altamirano representante por ese partido puntualizo que si bien la ley federal de presupuesto y responsabilidad hacendaria indica que el subejercicio sera sancionado conforme a la nueva ley general de responsabilidades al revisar las faltas contenidas en esa ley no hay ninguna que describa con precision el subejercicio la iniciativa ya se encuentra en estudio para su dictamen en la comision de transparencia y anticorrupcion en su contenido se subraya que si un funcionario cae en el subejercicio se expondra a una sancion administrativa y al ser una conducta grave podra ameritar la suspension destitucion del empleo sancion economica e inhabilitacion temporal en el servicio publico el diputado morenista senalo que los subejercicios afe

In [12]:
# Apply preprocessing to both splits; each call yields an
# (input ids, attention masks) pair of tensors
(X_train_inputs, X_train_masks), (X_val_inputs, X_val_masks) = (
    preprocessing(X_train),
    preprocessing(X_val),
)



In [16]:
# Inspect the token ids stored at position 1 of the validation input tensor
X_val_inputs[1]

tensor([    4,  1247,  9478,  1081,  8727,  1086,  1294, 30984,  3285,  2436,
         1041,  1035,  1069,  2022,  1054,  6612,  1137, 11418,  2450,  4418,
        11418,  1600,  5023,  1039,  1939,  7085,  1009,  3050,  1009,  1853,
         1039,  2914,  1009, 10877,  4741,  2645, 14579, 30956,  1096,  1195,
         1853,  1067,  8814,  1054,  4418, 11418,  1600,  1151,  1028,  1250,
         1096,  1652,  1247,  1012, 11053, 11418,  1057,  3148,  1946,  1009,
         1041,  1067,  4661, 26461,  1320,  2788,  1048,  2988,  1074,  7085,
         3229,  1905, 12534,  1035,  1032, 18288,  1332,  2213,  1018,  1009,
         1329,  1190, 30991,  1035, 12210,  1081,  2248, 25958, 30957,  1283,
         2310,  1022,  1009,  1032,  3457,  1009,  1032,  8808, 30956,  1040,
         1032, 18081,  1054,  1057, 17887, 30957, 21843, 15133,  1339,  1040,
        24034,  1039, 30827,  1035,     5])

In [17]:
# Report max n° tokens in a sentence: the largest per-row sum of the
# training attention masks
max_len = max(torch.sum(mask_row) for mask_row in X_train_masks)
print(f'Max n°tokens in a sentence: {max_len}')

Max n°tokens in a sentence: 115


In [22]:
# Data loaders: mini-batch size used by the loaders, plus the label columns
# of both splits converted to tensors
batch_size = 64
y_train_labels, y_val_labels = (
    torch.tensor(labels.values) for labels in (y_train, y_val)
)

In [24]:
def dataloader(x_inputs, x_masks, y_labels, shuffle=False):
    """Bundle inputs, masks and labels into a batched ``DataLoader``.

    The batch size is read from the notebook-level ``batch_size`` and four
    worker processes are used to load batches.

    Args:
        x_inputs: tensor of token ids, one row per document.
        x_masks: tensor of attention masks aligned with ``x_inputs``.
        y_labels: tensor of labels aligned with ``x_inputs``.
        shuffle: when False (the default) batches are served in dataset
            order, exactly as before; pass True to reshuffle the data on
            every pass, which is usually what a training loader wants.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)``
        batches.
    """
    data = TensorDataset(x_inputs, x_masks, y_labels)
    # With no explicit sampler, DataLoader picks a sequential sampler for
    # shuffle=False and a random one for shuffle=True.
    return DataLoader(data, shuffle=shuffle,
                      batch_size=batch_size,
                      num_workers=4)

In [28]:
# Build one loader per split from its inputs, masks and labels
train_dataloader = dataloader(
    x_inputs=X_train_inputs,
    x_masks=X_train_masks,
    y_labels=y_train_labels,
)
val_dataloader = dataloader(
    x_inputs=X_val_inputs,
    x_masks=X_val_masks,
    y_labels=y_val_labels,
)

In [30]:
from transformers import BertModel
import torch

# Seed every random number generator used in this notebook
def set_seed(value):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        value: integer seed handed to each generator.
    """
    seeders = (random.seed, np.random.seed,
               torch.manual_seed, torch.cuda.manual_seed_all)
    for seed_rng in seeders:
        seed_rng(value)
set_seed(42)

In [31]:
# Create model: a two-label sequence classifier loaded from the local
# checkpoint, returning neither attentions nor hidden states
model = BertForSequenceClassification.from_pretrained(
    "../pytorch/",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../pytorch/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Attach a Softmax module (over dimension 3) to the model as `model.softmax`.
# NOTE(review): nothing in the visible notebook reads `model.softmax`, and the
# training/validation code works on the raw outputs of `model(...)` — confirm
# whether this attribute is still needed and whether dim=3 is intended.
model.softmax = torch.nn.Softmax(3)

In [33]:
# Move the model to the target device before the optimizer is built, as the
# torch.optim documentation recommends. Using `device` (the same object every
# batch is sent to) keeps the model and the data on the same device for any
# value of `run_on`, not only the exact string 'cuda'.
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(),
                  lr = 4e-5,
                  eps = 1e-6
                  )

# Define number of epochs
epochs = 10

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
            num_warmup_steps = 0,
            num_training_steps = total_steps)



In [34]:
#function to format time
def format_time(elapsed):
    """Render a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: duration in seconds; rounded to the nearest whole second.

    Returns:
        The string form of the matching ``datetime.timedelta``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

#function to compute accuracy
def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring column matches the label.

    Args:
        preds: 2-D array of scores, one row per sample; the prediction is
            the arg-max along axis 1.
        labels: NumPy array of true labels, flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_hits = np.sum(predicted == expected)
    return n_hits / len(expected)

In [37]:
#function to train the model
def training(n_epochs, training_dataloader,
             validation_dataloader):
    """Fine-tune the notebook-level ``model`` and validate after every epoch.

    Each epoch makes one optimisation pass over ``training_dataloader``
    (forward pass with labels, backward pass, gradient clipping to norm 1.0,
    optimizer step, scheduler step) and then one gradient-free pass over
    ``validation_dataloader``. Losses, accuracy and timings are printed as
    the run progresses. After the last epoch, the row-normalised confusion
    matrix of that epoch's validation predictions is printed.

    Uses the notebook-level ``model``, ``optimizer``, ``scheduler`` and
    ``device``.

    Args:
        n_epochs: number of passes over the training data.
        training_dataloader: iterable of ``(input_ids, attention_mask,
            labels)`` tensor batches used for optimisation.
        validation_dataloader: iterable with the same batch layout, used
            only for evaluation.

    Returns:
        None.
    """
    # ========================================
    #               Training
    # ========================================
    print('======= Training =======')
    # Predictions and labels of the most recent validation pass. Defined
    # before the loop so the confusion-matrix step below is well defined even
    # when n_epochs is 0.
    all_preds = []
    all_labels = []
    for epoch_i in range(0, n_epochs):
        # Perform one full pass over the training set
        print("")
        # Report against the n_epochs argument, not the notebook-level
        # `epochs`, so the banner is right for any caller.
        print('======= Epoch {:} / {:} ======='.format(
             epoch_i + 1, n_epochs))
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the running loss and batch count for this epoch.
        total_loss = 0
        n_train_batches = 0
        # Put the model into training mode.
        model.train()
        # For each batch of training data
        for step, batch in enumerate(training_dataloader):
            # Unpack this training batch from dataloader
            #   [0]: input ids, [1]: attention masks,
            #   [2]: labels
            b_input_ids, b_input_mask, b_labels = tuple(
                                t.to(device) for t in batch)

            # Clear any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            # Pull the loss value out of the output tuple
            loss = outputs[0]
            batch_loss = loss.item()
            total_loss += batch_loss
            n_train_batches += 1

            # Perform a backward pass
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                            1.0)

            # Update parameters using the computed gradient, then advance
            # the learning-rate schedule
            optimizer.step()
            scheduler.step()

            print('batch loss: {0} | avg loss: {1}'.format(
                  batch_loss, total_loss/(step+1)))
        # Average loss over the batches actually consumed from the loader
        # passed in (not the notebook-level train_dataloader).
        avg_train_loss = total_loss / max(n_train_batches, 1)

        print("")
        print("  Average training loss: {0:.2f}".
             format(avg_train_loss))
        print("  Training epoch took: {:}".format(
              format_time(time.time() - t0)))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch,
        # measure accuracy on the validation set.

        print("")
        print("======= Validation =======")

        t0 = time.time()

        # Put the model in evaluation mode
        model.eval()

        # Predicted and true labels collected over this validation pass
        all_preds = []
        all_labels = []
        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # Unpack this validation batch from our dataloader.
            #   [0]: input ids, [1]: attention masks,
            #   [2]: labels
            b_input_ids, b_input_mask, b_labels = tuple(
                                t.to(device) for t in batch)

            # Model will not compute gradients
            with torch.no_grad():
                # Forward pass without labels: the first output holds
                # the logits
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

            # The "logits" are the output values
            # prior to applying an activation function
            logits = outputs[0]

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            b_labels = b_labels.to('cpu').numpy()

            # Save batch predictions and labels; they feed both the
            # accuracy and the confusion matrix
            predict_labels = np.argmax(
                             logits, axis=1).flatten()
            all_preds.extend(predict_labels.tolist())
            all_labels.extend(b_labels.flatten().tolist())

        # Accuracy over every validation sample. Averaging per-batch
        # accuracies would over-weight a smaller final batch.
        n_val = len(all_labels)
        n_correct = sum(p == y for p, y in zip(all_preds, all_labels))
        accuracy = n_correct / n_val if n_val else float('nan')

        # Report the final accuracy for this validation run.
        print("  Accuracy: {0:.2f}".
              format(accuracy))
        print("  Validation took: {:}".format(
             format_time(time.time() - t0)))

    # Print the confusion matrix of the last validation pass (skipped when
    # there were no validation predictions at all)
    if all_labels:
        conf = confusion_matrix(
               all_labels, all_preds, normalize='true')
        print(conf)
    print("")
    print("Training complete")

# Run the training/validation loop for the configured number of epochs
training(
    n_epochs=epochs,
    training_dataloader=train_dataloader,
    validation_dataloader=val_dataloader,
)


batch loss: 0.09645400941371918 | avg loss: 0.09645400941371918

  Average training loss: 0.10
  Training epoch took: 0:00:06

  Accuracy: 0.67
  Validation took: 0:00:01

batch loss: 0.0942695215344429 | avg loss: 0.0942695215344429

  Average training loss: 0.09
  Training epoch took: 0:00:06

  Accuracy: 0.67
  Validation took: 0:00:01

batch loss: 0.09492313116788864 | avg loss: 0.09492313116788864

  Average training loss: 0.09
  Training epoch took: 0:00:06

  Accuracy: 0.67
  Validation took: 0:00:01

batch loss: 0.08322443813085556 | avg loss: 0.08322443813085556

  Average training loss: 0.08
  Training epoch took: 0:00:06

  Accuracy: 0.67
  Validation took: 0:00:01

batch loss: 0.10232424736022949 | avg loss: 0.10232424736022949

  Average training loss: 0.10
  Training epoch took: 0:00:06

  Accuracy: 0.67
  Validation took: 0:00:01

batch loss: 0.09246470779180527 | avg loss: 0.09246470779180527

  Average training loss: 0.09
  Training epoch took: 0:00:06

  Accuracy: 0.