In [1]:
!pip install transformers
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW,RobertaModel,RobertaTokenizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import seaborn as sns
from sklearn.model_selection import train_test_split
import copy
import warnings
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score as f1
import torch.optim as optim


warnings.filterwarnings("ignore")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

tokenizer = RobertaTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-base-bne")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
sentiments=pd.read_csv("/content/drive/MyDrive/transformer/s.csv")

In [None]:
sentiments=sentiments[["others","joy","sadness","anger","surprise","disgust","fear"]]

In [None]:
len(sentiments)

2671

In [3]:
data = pd.read_csv("/content/drive/MyDrive/transformer/train.csv")
inputs = data["tweet"]
labels = data["humor"]



train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    inputs, 
    labels, 
    test_size=0.2, 
    stratify=labels
)


In [4]:
class createDataset(Dataset):

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
  
    def __len__(self):
        return len(self.texts)
  
    def __getitem__(self, item):
        text = str(self.texts[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input=encoding['input_ids'].flatten()
        

        return {
            'text': text,
            'input_ids': input,
            
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    
    ds = createDataset(
        texts=texts.to_numpy(),
        targets=labels.to_numpy(),
        
        tokenizer=tokenizer,
        max_len=max_len
  )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
      )

In [5]:
class Model(nn.Module):
    def __init__(self, latent_dims,max_len,nhid):
        super(Model, self).__init__()

        
        self.roberta = RobertaModel.from_pretrained("PlanTL-GOB-ES/roberta-base-bne")
        self.tanh=nn.Tanh()
        self.linear = nn.Linear(in_features=768, out_features=192)
        self.dropout=nn.Dropout(0.2)
        self.r = nn.ReLU()
        self.l = nn.Linear(in_features=192,out_features=2)
        self.s = nn.Softmax(dim=1)
        

        self.latent_dims=latent_dims
        self.nhid=nhid


    def forward(self, input_id,attention):

      secuence_output = self.roberta(
            input_ids=input_id,
            attention_mask=attention
        )
      
      o = secuence_output.pooler_output

      o=self.tanh(o)

      o=self.linear(o)
      o=self.dropout(o)
      o=self.r(o)
      o=self.l(o)
      o=self.dropout(o)
      o = self.s(o)

  
      
      return o



In [7]:
batch_size = 64
learning_rate = 0.00001
criterion = nn.CrossEntropyLoss().to(device)
criterion.requires_grad=True
epochs = 8
latentdims=2
nhid=128
max_len=60


In [8]:
model= Model(latentdims,max_len,nhid)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Downloading (…)lve/main/config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream tas

In [None]:
train_data_loader = create_data_loader(train_inputs, train_labels,tokenizer, max_len, batch_size)

test_data_loader = create_data_loader(test_inputs, test_labels, tokenizer, max_len, batch_size)

In [None]:
def train_an_epoch(
    model, 
    train_data_loader,
    dev_data_loader,
    criterion, 
    optimizer
):

    

    # These are the metrics that will indicate us how well it's doing the model...
    running_loss = 0
    training_acc=[]
    f1_training=[]
    steps = 0;
    
    for batch in train_data_loader:
        
        b=len(batch["input_ids"])
        # Clean gradients...
        optimizer.zero_grad()
    
        # Get the information from the tokenization... (using GPU)
        input_ids = batch["input_ids"].to(device)
        targets = batch["targets"].to(device)
        attention = batch["attention_mask"].to(device)

        # get the model's predictions...
        outputs = model(
            input_ids,
            attention
            
        )

        # Apply the loss function and the perform backward propagation...
       
        loss = criterion(outputs, targets)
       
        loss.backward() 
        optimizer.step()
        
        # update the metrics...

        pred = []
        real=[]
        for output in outputs:
          zero=output[0].item()
          one=output[1].item()
          if zero > one:
            pred.append(0)
          else:
            pred.append(1)

        for t in targets:

          real.append(t.item())

        bacc=acc(real,pred)
        bf1= f1(real,pred)
        running_loss+=loss.item()
        training_acc.append(bacc)
        f1_training.append(bf1)

        steps+=1
            
    # get the mean of the metrics...
    
    loss = running_loss/steps;
    t_acc=sum(training_acc)/len(training_acc)
    t_f1=sum(f1_training)/len(f1_training)
    
    #acc = running_accs/steps;
    
    # evaluate the model with the validation data set 
    # ("turn off" gradients...)
    with torch.no_grad():
        
        # These are the metrics that will indicate us how well it's doing the model...
        test_acc=[];
        steps_val=0;
        f1_test=[]
        
        for batch in dev_data_loader:

            b= len(batch["input_ids"])
            
            # Get the information from the tokenization... (using GPU)
            input_ids = batch["input_ids"].to(device)
            targets = batch["targets"].to(device)
            attention = batch["attention_mask"].to(device)

            # get the model's predictions...
            outputs = model(
                input_ids,
                attention
                
                
            )
            
            pred = []
            real=[]
            for output in outputs:
              zero=output[0].item()
              one=output[1].item()
              if zero > one:
                pred.append(0)
              else:
                pred.append(1)

            for t in targets:

              real.append(t.item())

            bacc=acc(real,pred)
            bf1= f1(real,pred)
            test_acc.append(bacc)
            f1_test.append(bf1)


        v_acc=sum(test_acc)/len(test_acc)
        v_f1= sum(f1_test)/len(f1_test)
    

    return loss,t_acc,v_acc,t_f1,v_f1

def train_the_model(epochs):
    
    for e in range(epochs):
      #, acc, val_acc
        
        loss,t_acc,v_acc,t_f1,v_f1 = train_an_epoch(
            model, 
            train_data_loader,
            test_data_loader,
            criterion, 
            optimizer
        )
        
        print('--------EPOCH SUMMARY---------')
        print('Epoch ', e+1, ' training loss: ', loss)
        print('Epoch ', e+1, ' training acc: ', t_acc*100, '%')
        print('Epoch ', e+1, ' val acc: ', v_acc*100, '%')
        print('Epoch ', e+1, ' training f1: ', t_f1*100, '%')
        print('Epoch ', e+1, ' val f1: ', v_f1*100, '%')

In [None]:
train_the_model(epochs)

--------EPOCH SUMMARY---------
Epoch  1  training loss:  0.6444389241583207
Epoch  1  training acc:  66.22242647058823 %
Epoch  1  val acc:  66.58363526570048 %
Epoch  1  training f1:  6.984058602925977 %
Epoch  1  val f1:  8.954511537844873 %
--------EPOCH SUMMARY---------
Epoch  2  training loss:  0.5455470803905936
Epoch  2  training acc:  78.38541666666667 %
Epoch  2  val acc:  85.6280193236715 %
Epoch  2  training f1:  58.196421720011585 %
Epoch  2  val f1:  77.3763010450317 %
--------EPOCH SUMMARY---------
Epoch  3  training loss:  0.48508773218182955
Epoch  3  training acc:  84.84987745098039 %
Epoch  3  val acc:  82.23128019323673 %
Epoch  3  training f1:  75.69275630967992 %
Epoch  3  val f1:  73.74578421362048 %
--------EPOCH SUMMARY---------
Epoch  4  training loss:  0.4697600024587968
Epoch  4  training acc:  84.81924019607844 %
Epoch  4  val acc:  82.23128019323673 %
Epoch  4  training f1:  76.13981078617292 %
Epoch  4  val f1:  70.14449316697046 %
--------EPOCH SUMMARY---

In [9]:
def train_an_epoch_full(
    model, 
    train_data_loader,
    criterion, 
    optimizer
):

    

    # These are the metrics that will indicate us how well it's doing the model...
    running_loss = 0
    training_acc=[]
    f1_training=[]
    steps = 0;
    
    for batch in train_data_loader:
        
        b=len(batch["input_ids"])
        # Clean gradients...
        optimizer.zero_grad()
    
        # Get the information from the tokenization... (using GPU)
        input_ids = batch["input_ids"].to(device)
        targets = batch["targets"].to(device)
        attention = batch["attention_mask"].to(device)

        # get the model's predictions...
        outputs = model(
            input_ids,
            attention
            
        )

        # Apply the loss function and the perform backward propagation...
       
        loss = criterion(outputs, targets)
       
        loss.backward() 
        optimizer.step()
        
        # update the metrics...

        pred = []
        real=[]
        for output in outputs:
          zero=output[0].item()
          one=output[1].item()
          if zero > one:
            pred.append(0)
          else:
            pred.append(1)

        for t in targets:

          real.append(t.item())

        bacc=acc(real,pred)
        bf1= f1(real,pred)
        running_loss+=loss.item()
        training_acc.append(bacc)
        f1_training.append(bf1)

        steps+=1
            
    # get the mean of the metrics...
    
    loss = running_loss/steps;
    t_acc=sum(training_acc)/len(training_acc)
    t_f1=sum(f1_training)/len(f1_training)
    
    #acc = running_accs/steps;
    
    
    

    return loss,t_acc,t_f1

def train_full_model(epochs):
    
    for e in range(epochs):
      #, acc, val_acc
        
        loss,t_acc,t_f1 = train_an_epoch_full(
            model, 
            full_data_loader,
            criterion, 
            optimizer
        )
        
        print('--------EPOCH SUMMARY---------')
        print('Epoch ', e+1, ' training loss: ', loss)
        print('Epoch ', e+1, ' training acc: ', t_acc*100, '%')
        
        print('Epoch ', e+1, ' training f1: ', t_f1*100, '%')
        

In [10]:
full_inputs = pd.concat([train_inputs,test_inputs])
full_targets = pd.concat([train_labels,test_labels])

In [11]:
full_data_loader=create_data_loader(full_inputs, full_targets,tokenizer, max_len, batch_size)

In [12]:
train_full_model(epochs)

--------EPOCH SUMMARY---------
Epoch  1  training loss:  0.633465336901801
Epoch  1  training acc:  69.22729863221885 %
Epoch  1  training f1:  20.51780620302234 %
--------EPOCH SUMMARY---------
Epoch  2  training loss:  0.5172779460748037
Epoch  2  training acc:  81.97109295845999 %
Epoch  2  training f1:  70.41838676865474 %
--------EPOCH SUMMARY---------
Epoch  3  training loss:  0.4730277309815089
Epoch  3  training acc:  85.2821048632219 %
Epoch  3  training f1:  76.67110113931004 %
--------EPOCH SUMMARY---------
Epoch  4  training loss:  0.45257545581885744
Epoch  4  training acc:  86.86835106382979 %
Epoch  4  training f1:  79.96990352836549 %
--------EPOCH SUMMARY---------
Epoch  5  training loss:  0.43935254287152065
Epoch  5  training acc:  87.4936676798379 %
Epoch  5  training f1:  80.8517523461338 %
--------EPOCH SUMMARY---------
Epoch  6  training loss:  0.43401900501478285
Epoch  6  training acc:  87.39235055724419 %
Epoch  6  training f1:  80.7980375294341 %
--------EPOC

In [13]:
test= pd.read_csv("/content/drive/MyDrive/transformer/test.csv")

In [14]:
test

Unnamed: 0.1,Unnamed: 0,index,tweet,humor
0,0,52830,-Mamá en la escuela me dicen gorda -Pobresilla...,0
1,1,78883,"No te sientas diferente, da igual si eres negr...",1
2,2,78926,Si esta asi.. SUPER SI.. y que se pongan celos...,0
3,3,61844,—Bebé ¿Me veo gorda con este vestido?\n—¡No mi...,0
4,4,78830,Las mujeres solo desean 2 cosas en la vida: co...,0
...,...,...,...,...
773,773,9496,Decir que una mujer está soltera es de machist...,0
774,774,14026,¿cómo un aliado se atreve a chamuyar a una ant...,1
775,775,12393,"MENTION No hicieron nada por las mujeres, son ...",1
776,776,18723,Cuando llegará ese día en que las chicas organ...,0


In [15]:
test_inputs= test["tweet"]

In [16]:
class createTestDataset(Dataset):

    def __init__(self, texts,  tokenizer, max_len):
        self.texts = texts
        
        self.tokenizer = tokenizer
        self.max_len = max_len
  
    def __len__(self):
        return len(self.texts)
  
    def __getitem__(self, item):
        
        text = self.texts[item]
        #sentiments=self.texts[item][1:]
        

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input=encoding['input_ids'].flatten()
        

        return {
            'text': text,
            'input_ids': input,
            
            'attention_mask': encoding['attention_mask'].flatten()
          
        }

def create_test_data_loader(texts,  tokenizer, max_len, batch_size):
    
    ds = createTestDataset(
        texts=texts.to_numpy(),
        
        
        tokenizer=tokenizer,
        max_len=max_len
  )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
      )

In [17]:
test_loader= create_test_data_loader(test_inputs,tokenizer,max_len,batch_size)

In [18]:
def predict(model, 
    test_data_loader):
  with torch.no_grad():
        
        # These are the metrics that will indicate us how well it's doing the model...
        predictions=[]
        
        for batch in test_data_loader:

            b= len(batch["input_ids"])
            
            # Get the information from the tokenization... (using GPU)
            input_ids = batch["input_ids"].to(device)
            
            attention = batch["attention_mask"].to(device)

            # get the model's predictions...
            outputs = model(
                input_ids,
                attention
                
                
            )
            
            
            for output in outputs:
              zero=output[0].item()
              one=output[1].item()
              if zero > one:
                predictions.append(0)
              else:
                predictions.append(1)


  return predictions


In [19]:
predictions= predict(model,test_loader)

In [21]:
datos={
    'tweet': test_inputs,
    'humor': predictions
}

In [23]:
jpk_2 = pd.DataFrame(datos)

In [24]:
jpk_2

Unnamed: 0,tweet,humor
0,-Mamá en la escuela me dicen gorda -Pobresilla...,1
1,"No te sientas diferente, da igual si eres negr...",1
2,Si esta asi.. SUPER SI.. y que se pongan celos...,0
3,—Bebé ¿Me veo gorda con este vestido?\n—¡No mi...,1
4,Las mujeres solo desean 2 cosas en la vida: co...,1
...,...,...
773,Decir que una mujer está soltera es de machist...,0
774,¿cómo un aliado se atreve a chamuyar a una ant...,0
775,"MENTION No hicieron nada por las mujeres, son ...",0
776,Cuando llegará ese día en que las chicas organ...,1


In [26]:
jpk_2.to_csv("/content/drive/MyDrive/transformer/JPK_2.csv")