In [49]:
import torch
import torch.optim
import numpy as np
import pandas as pd
import time
import datetime
import random

from transformers import  BertTokenizer
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, SequentialSampler
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [50]:
# Select cpu or cuda
run_on = 'cpu'
device = torch.device(run_on)

In [51]:
# Importing the data:
df = pd.read_csv('/Users/mario/OneDrive/ITAM/4to semestre/Inv Aplicada II/Mario - Tesis/code/BETO/data/5k_results_text_clean.csv')

In [52]:
df.head()

Unnamed: 0,text,label,original_text,label_str
0,en una nueva descalificacion los organismos au...,1,En una nueva descalificación a los organismos ...,PRO
1,cien anos del surgimiento del fascismo en ital...,0,A cien años del surgimiento del fascismo en It...,ANTI
2,el congreso de estados unidos inicio este mier...,1,El Congreso de Estados Unidos inició este miér...,PRO
3,tesla producira un vehiculo electrico de bajo ...,0,Tesla producirá un vehículo eléctrico de bajo ...,ANTI
4,en sesion de este lunes los ministros de la su...,1,"En sesión de este lunes, los ministros de la S...",PRO


In [53]:
# Load the dataset into a pandas dataframe.
reviews = df['text']
#sentiment = df['label'] 
sentiment = df['label_str'] # 1: Pro // 0: Anti
sentiment = pd.DataFrame(list(map(lambda x: 0 if x=="ANTI" else 1, sentiment)))

In [54]:
# Split dataset
X_train, X_val, y_train, y_val = train_test_split(reviews, sentiment, stratify=sentiment, test_size=0.2, random_state=42)

In [55]:
# Report datasets lenghts
print('Training set length : {}'.format(len(X_train)))
print('Validation set length : {}'.format(len(X_val)))

Training set length : 4000
Validation set length : 1000


In [56]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained("../pytorch/", do_lower_case=True)

In [57]:
def preprocessing(dataset):
    input_ids = []
    attention_mask = []
    for doc in dataset:
        encoded_doc = tokenizer.encode_plus(doc,
                   add_special_tokens=True, max_length=512,
                   truncation='longest_first',
                   pad_to_max_length=True)
        input_ids.append(encoded_doc['input_ids'])
        attention_mask.append(encoded_doc['attention_mask'])
    return (torch.tensor(input_ids),
           torch.tensor(attention_mask))

In [58]:
X_train

2846    mientras morena pt salieron en apoyo la estrat...
618     para decirlo con isaac asimov la violencia es ...
1126    la comision de prerrogativas partidos politico...
3228    debe ser duro para los seguidores de cuauhtemo...
3909    diputados federales del partido accion naciona...
                              ...                        
4471    las elecciones en puebla muestran solo un por ...
572     pese contar con una estructura millonaria el i...
169     en morena no habra releccion directa de los le...
274     gerardo fernandez norona no se guardo nada est...
2240    maria del rosario piedra ibarra titular de la ...
Name: text, Length: 4000, dtype: object

In [59]:
# Apply preprocessing to dataset
X_train_inputs, X_train_masks = preprocessing(X_train)
X_val_inputs, X_val_masks = preprocessing(X_val)



In [60]:
# Report max n° tokens in a sentence
max_len = max([torch.sum(sen) for sen in X_train_masks])
print('Max n°tokens in a sentence: {0}'.format(max_len))

Max n°tokens in a sentence: 512


In [61]:
# Data loaders
batch_size = 32
y_train_labels = torch.tensor(y_train.values)
y_val_labels = torch.tensor(y_val.values)

In [62]:
def dataloader(x_inputs, x_masks, y_labels):
    data = TensorDataset(x_inputs, x_masks, y_labels)
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler,
                 batch_size=batch_size,
                 num_workers=0)
    return dataloader

In [63]:
train_dataloader = dataloader(X_train_inputs, X_train_masks,
                   y_train_labels)
val_dataloader = dataloader(X_val_inputs, X_val_masks, 
                 y_val_labels)

In [64]:
from transformers import BertModel
import torch

# set random seed
def set_seed(value):
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    torch.cuda.manual_seed_all(value)
set_seed(42)

In [65]:
# Create model
model = BertForSequenceClassification.from_pretrained("../pytorch/", num_labels=2, output_attentions=False, output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../pytorch/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
# model.softmax = torch.nn.Softmax(3)

In [67]:
# Optimizer

optimizer = torch.optim.Adam(model.parameters(),
                  lr = 4e-5,
                  eps = 1e-6)

if run_on == 'cuda':
    model.cuda()

# Define number of epochs
epochs = 10

total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
            num_warmup_steps = 0, 
            num_training_steps = total_steps)

In [68]:
#fuction to format time
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

#function to compute accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [69]:
#function to train the model
def training(n_epochs, training_dataloader,
             validation_dataloader):
    # ========================================
    #               Training 
    # ========================================
    print('======= Training =======')
    for epoch_i in range(0,n_epochs):
        # Perform one full pass over the training set
        print("")
        print('======= Epoch {:} / {:} ======='.format(
             epoch_i + 1, epochs))
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        # Put the model into training mode.
        model.train()
        # For each batch of training data
        for step, batch in enumerate(training_dataloader):
            batch_loss = 0
            # Unpack this training batch from dataloader
            #   [0]: input ids, [1]: attention masks, 
            #   [2]: labels
            b_input_ids,b_input_mask, b_labels = tuple(
                                t.to(device) for t in batch)

            # Clear any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass 
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            # pull loss value out of the output tuple
            loss = outputs[0]
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass 
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                            1.0)

            # Update parameters
            # ¿take a step using the computed gradient
            optimizer.step()
            scheduler.step()

            print('batch loss: {0} | avg loss: {1}'.format(
                  batch_loss, total_loss/(step+1)))
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)


        print("")
        print("  Average training loss: {0:.2f}".
             format(avg_train_loss))
        print("  Training epoch took: {:}".format(
              format_time(time.time() - t0)))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, 
        # measure accuracy on the validation set.

        print("")
        print("======= Validation =======")

        t0 = time.time()

        # Put the model in evaluation mode
        model.eval()

        # Tracking variables
        eval_loss, eval_accuracy = 0, 0
        all_logits = []
        all_labels = []
        # Evaluate data for one epoch
        for step, batch in enumerate(validation_dataloader):
            # Add batch to device
            # Unpack this training batch from our dataloader.
            #   [0]: input ids, [1]: attention masks,
            #   [2]: labels
            b_input_ids, b_input_mask, b_labels = tuple(
                                t.to(device) for t in batch)


            # Model will not to compute gradients
            with torch.no_grad():
                # Forward pass 
                # This will return the logits 
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)

            # The "logits" are the output values 
            # prior to applying an activation function 
            logits = outputs[0]

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            b_labels = b_labels.to('cpu').numpy()

            # Save batch logits and labels 
            # We will use thoses in the confusion matrix
            predict_labels = np.argmax(
                             logits, axis=1).flatten()
            all_logits.extend(predict_labels.tolist())
            all_labels.extend(b_labels.tolist())

            # Calculate the accuracy for this batch
            tmp_eval_accuracy = flat_accuracy(
                                logits, b_labels)
            # Accumulate the total accuracy.
            eval_accuracy += tmp_eval_accuracy

        # Report the final accuracy for this validation run.
        print("  Accuracy: {0:.2f}".
              format(eval_accuracy / (step+1)))
        print("  Validation took: {:}".format(
             format_time(time.time() - t0)))

    #print the confusion matrix"
    conf = confusion_matrix(
           all_labels, all_logits, normalize='true')
    print(conf)
    print("")
    print("Training complete")

#call function to train the model
training(epochs, train_dataloader, val_dataloader)


batch loss: 0.7262793183326721 | avg loss: 0.7262793183326721
batch loss: 0.9383925199508667 | avg loss: 0.8323359191417694
batch loss: 0.6943339705467224 | avg loss: 0.786335269610087
batch loss: 0.7644037008285522 | avg loss: 0.7808523774147034
batch loss: 0.7495293617248535 | avg loss: 0.7745877742767334
batch loss: 0.6772282123565674 | avg loss: 0.7583611806233724
batch loss: 0.7404534220695496 | avg loss: 0.7558029294013977
batch loss: 0.6556239128112793 | avg loss: 0.7432805523276329
batch loss: 0.6897634267807007 | avg loss: 0.7373342050446404
batch loss: 0.6559417247772217 | avg loss: 0.7291949570178986
batch loss: 0.6936920881271362 | avg loss: 0.7259674234823748
batch loss: 0.6847467422485352 | avg loss: 0.7225323667128881
batch loss: 0.6871854662895203 | avg loss: 0.719813374372629
batch loss: 0.7444644570350647 | avg loss: 0.7215741659913745
batch loss: 0.6309378147125244 | avg loss: 0.7155317425727844
batch loss: 0.7005057334899902 | avg loss: 0.7145926170051098
batch los

KeyboardInterrupt: 