In [1]:
from transformers import get_linear_schedule_with_warmup,AutoTokenizer,AutoModelForSequenceClassification, AutoModel, AlbertForSequenceClassification
from textwrap import wrap
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch import nn
import numpy as np
import torch
import pickle
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Inicializacion


In [2]:
RANDOM_SEED = 42
MAX_LEN = 250
BATCH_SIZE = 8
DATASET_PATH = '../Data/SemEval_Train_Polarity.csv'
NCLASES = 2

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x1aeed346d30>

### Cargar DataSet

In [3]:
df = pd.read_csv(DATASET_PATH, sep=';')
df.groupby('sentiment').nunique()

Unnamed: 0_level_0,texto,aspecto,review
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,494,231,637
1,1147,322,1727


### Tokenizacion

In [4]:
MODEL_TRAIN = '../Model/GPT-2/'
tokenizer = AutoTokenizer.from_pretrained(MODEL_TRAIN)

# Agrega el token de padding (si no existe)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("PAD")

PAD


### Crear DataSet

In [5]:
class CoahDataset(Dataset):

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self,index):
        review = str(self.reviews[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = "max_length",
            return_token_type_ids = False,
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt',
        )
        
        return{
            'review': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype= torch.long)
        }
        

In [6]:
def data_loader(df,tokenizer, max_len , batch_size):
    dataset = CoahDataset(
        reviews = df['review'].to_numpy(),
        labels = df['sentiment'].to_numpy(),
        tokenizer = tokenizer,
        max_len = max_len
    )

    return DataLoader(dataset, batch_size = batch_size, num_workers= 0)  #num_workers = 0. Probar con 1 y 2

### Entrenamiento

In [7]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)

train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

### Modelo

In [None]:
#NO
class ModelSentiment(nn.Module):

    def __init__(self, n_classes):
        super(ModelSentiment, self).__init__()
        self.model = BertModel.from_pretrained('../Model/beto/')
        self.drop = nn.Dropout(0.3)
        self.linear = nn.Linear(768, n_classes)   #hidden.size=768
        print(self.model.config.cross_attention_hidden_size)

    def forward(self, input_ids, attention_mask):
        cls_output = self.model(input_ids, attention_mask)
        drop_output = self.drop(cls_output.pooler_output)
        output = self.linear(drop_output)
        return output

model = ModelSentiment(NCLASES)

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_TRAIN)
model.dropout = nn.Dropout(0.3)
model.classifier = nn.Linear(model.config.hidden_size, 2) 
model.resize_token_embeddings(len(tokenizer)) # Esencial para actualizar el modelo con el nuevo token
model.config.pad_token_id = tokenizer.pad_token_id


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ../Model/GPT-2/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
  (dropout): Dropout(p=0.3, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=Tr

### Entrenamiento

In [None]:
from transformers import RobertaModel, RobertaTokenizer
import torch

# Nombres de los modelos Bertin Base y Bertin Large
MODEL_NAMES = {
    "base": "../Model/GPT-2/",
}

def load_model_and_get_stats(model_name):
    # Cargar el tokenizador y el modelo
    model = RobertaModel.from_pretrained(model_name)

    # Obtener la cantidad de parámetros
    num_params = sum(p.numel() for p in model.parameters())

    # Imprimir estadísticas
    print(f"Modelo: {model_name}")
    print(f"Número de parámetros: {num_params:,}")
    print(f"Número de capas: {len(model.encoder.layer)}")
    print(f"Tamaño de las embeddings: {model.config.hidden_size}")
    print(f"Número de cabezas de atención: {model.config.num_attention_heads}")
    print("-" * 50)

# Cargar y mostrar estadísticas para Bertin Base y Bertin Large
for model_type, model_name in MODEL_NAMES.items():
    load_model_and_get_stats(model_name)

In [11]:
#Entrenamiento
EPOCHES = 3
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader)  *  EPOCHES
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps= 0,
    num_training_steps= total_steps
)
loss_fn = nn.CrossEntropyLoss()


In [12]:
def train_model(model, data_loader,loss_fn,optimizer,scheduler,n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        loss = loss_fn(logits, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double()/n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn,n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)
            loss = loss_fn(logits, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    return correct_predictions.double()/n_examples, np.mean(losses)

In [25]:
def train_model(model, data_loader,loss_fn,optimizer,scheduler,n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        logits = outputs.logits

        cls_logits = logits[:, 0, :] 
        _, preds = torch.max(cls_logits, dim=1)  
        loss = loss_fn(cls_logits, labels)  
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double()/n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn,n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            logits = outputs.logits
            cls_logits = logits[:, 0, :] 
            _, preds = torch.max(cls_logits, dim=1)
            loss = loss_fn(cls_logits, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    return correct_predictions.double()/n_examples, np.mean(losses)

In [20]:
#NO

def train_model(model, data_loader,loss_fn,optimizer,scheduler,n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double()/n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn,n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    return correct_predictions.double()/n_examples, np.mean(losses)


In [13]:
for epoch in range(EPOCHES):
    print('Epoch {} de {}'.format(epoch+1, EPOCHES))
    print('---------------------------')
    train_acc, train_loss = train_model(
        model, train_data_loader, loss_fn, optimizer, scheduler, len(df_train)
    )
    print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss,train_acc) )
    test_acc, test_loss = eval_model(
        model, test_data_loader, loss_fn, len(df_test)
    )
    print('Validacion: Loss: {}, accuracy: {}'.format(test_loss,test_acc) )
    # with open("../Model_Train/Albert_"+str(epoch)+".pkl", "wb") as file:
    #     pickle.dump(model, file) 
    # model.save_pretrained("Albert_Large_Fine_turned")
    # tokenizer.save_pretrained("Albert_Large_Fine_turned") 
    torch.save(model, 'Albert_Large.pth')
    # torch.save(model.state_dict(), "Electra" + str(epoch+1) + '.pkl')
    # torch.save(model.config, "Electra" + str(epoch+1) + '.pkl')


Epoch 1 de 3
---------------------------


100%|██████████| 293/293 [38:56<00:00,  7.97s/it]


Entrenamiento: Loss: 0.9820798484691174, accuracy: 0.6673792218896965


100%|██████████| 33/33 [01:30<00:00,  2.74s/it]


Validacion: Loss: 0.4454198023586562, accuracy: 0.7923076923076923
Epoch 2 de 3
---------------------------


100%|██████████| 293/293 [38:27<00:00,  7.88s/it]


Entrenamiento: Loss: 0.4796180200759868, accuracy: 0.7776827704147071


100%|██████████| 33/33 [01:44<00:00,  3.16s/it]


Validacion: Loss: 0.4529114487496289, accuracy: 0.8307692307692308
Epoch 3 de 3
---------------------------


100%|██████████| 293/293 [42:12<00:00,  8.64s/it]


Entrenamiento: Loss: 0.382201064985131, accuracy: 0.8268490808037623


100%|██████████| 33/33 [01:27<00:00,  2.66s/it]


Validacion: Loss: 0.42762477842695784, accuracy: 0.8346153846153846


In [None]:
model = AutoModelForSequenceClassification.from_pretrained("../Logic/Bert_Fine_turned/")
tokenizer = AutoTokenizer.from_pretrained('../Model/bert-base-multilingual-uncased/')

### Prediccion

In [40]:
def classifySentiment(review_text):
    encoding_review = tokenizer.encode_plus(
        review_text,
        add_special_tokens = True,
        max_length = 250,
        padding = "max_length",
        return_token_type_ids = False,
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    input_ids = encoding_review['input_ids']
    attention_mask = encoding_review['attention_mask']
    output = model(input_ids,attention_mask)
    logits = output.logits
    _, prediction = torch.max(logits,dim=1)
    print(logits)
    print(prediction)
    print("\n".join(wrap(review_text)))
    if prediction:
        print("Sentimiento predicho: * * * * *")
    else:
        print("Sentimiento predicho: *")

In [37]:
classifySentiment("amante carne tiene una carta bastante amplia para")

tensor([[[-6.9659e-01,  6.3480e-02],
         [-5.4630e-01, -4.2633e-02],
         [-5.4799e-01,  2.5685e-01],
         [-5.4479e-01, -4.3641e-01],
         [-3.0911e-01,  4.0672e-02],
         [-3.3779e-01, -1.2691e-01],
         [-9.7334e-03,  1.2395e-01],
         [ 2.0125e-01, -3.0086e-01],
         [-4.8740e-01, -1.7342e-02],
         [-3.8903e-01, -1.7800e-01],
         [-4.5631e-01, -7.7709e-02],
         [-6.9675e-01, -3.4524e-01],
         [-3.1655e-01, -3.1324e-01],
         [-3.6708e-01, -2.2414e-01],
         [-3.7481e-01,  1.4027e-01],
         [ 1.7216e-01,  5.2020e-02],
         [-7.0713e-01,  6.7032e-02],
         [-2.0103e-02, -2.8053e-01],
         [-3.9830e-02, -2.8072e-01],
         [-4.3031e-02, -2.9586e-01],
         [-8.2515e-03, -2.5540e-01],
         [-3.2367e-02, -3.1629e-01],
         [-1.2868e-03, -2.1082e-01],
         [-8.2193e-02, -2.9361e-01],
         [-8.6647e-02, -2.8169e-01],
         [-7.9745e-02, -2.7910e-01],
         [-7.5130e-02, -2.9147e-01],
 

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

### Guardar Model

In [16]:
with open("../Model_Train/Bert_Model_Hotel_COAH", "wb") as file:
    pickle.dump(model, file)

### Guardar Tokenizer

In [136]:
tokenizer.save_pretrained('../Logic/Fine_turned/')

('../Logic/Fine_turned/tokenizer_config.json',
 '../Logic/Fine_turned/special_tokens_map.json',
 '../Logic/Fine_turned/vocab.txt',
 '../Logic/Fine_turned/added_tokens.json',
 '../Logic/Fine_turned/tokenizer.json')

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# 1. Definir el tokenizador y el modelo
tokenizer = BertTokenizer.from_pretrained('../Model/bert-base-spanish-uncased/') 
model = BertForSequenceClassification.from_pretrained('../Model/bert-base-spanish-uncased/') # 2 clases: positivo/negativo

# 5. Definir la función de entrenamiento
def train_model(model, train_data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss / len(train_data_loader)

# 6. Definir la función de evaluación
def eval_model(model, test_data_loader, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in test_data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
    return total_loss / len(test_data_loader), correct_predictions / len(test_data_loader)

# 7. Ajustar el modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)  # Ajuste fino del modelo BERT
total_steps = len(train_data_loader) * 3 # 3 epocas
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

EPOCHES = 3

for epoch in range(EPOCHES):
    print('Epoch {} de {}'.format(epoch+1, EPOCHES))
    print('---------------------------')
    train_loss = train_model(model, train_data_loader, optimizer, scheduler, device)
    print('Entrenamiento: Loss: {}'.format(train_loss))
    test_loss, test_acc = eval_model(model, test_data_loader, device)
    print('Evaluación: Loss: {}, Accuracy: {}'.format(test_loss, test_acc))

# 8. Guardar el modelo
model_save_path = 'fine_tuned_bert_spanish'
tokenizer.save_pretrained(model_save_path)
model.save_pretrained(model_save_path)
