In [1]:
from transformers import get_linear_schedule_with_warmup,AutoTokenizer,AutoModelForSequenceClassification
from textwrap import wrap
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch import nn
import numpy as np
import torch
import pickle
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Inicializacion


In [2]:
RANDOM_SEED = 42
MAX_LEN = 250
BATCH_SIZE = 8
DATASET_PATH = '../Data/COAH.csv'
NCLASES = 2

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x25c94a16cb0>

### Cargar DataSet

In [31]:
df = pd.read_csv(DATASET_PATH, sep=';')
df.groupby('sentiment').nunique()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
0,500
1,1007


In [35]:
dfp = df[df['sentiment']==0][:5]
dfn = df[df['sentiment']==1][:5]
df = pd.concat([dfp,dfn])

### Tokenizacion

In [36]:
MODEL_TRAIN = '../Model/bert-base-spanish-uncased/'
tokenizer = AutoTokenizer.from_pretrained(MODEL_TRAIN)

### Crear DataSet

In [37]:
class CoahDataset(Dataset):

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self,index):
        review = str(self.reviews[index])
        label = self.labels[index]

        encoding = tokenizer.encode_plus(
            review,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = "max_length",
            return_token_type_ids = False,
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )

        return{
            'review': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype= torch.long)
        }
        

In [38]:
def data_loader(df,tokenizer, max_len , batch_size):
    dataset = CoahDataset(
        reviews = df['review'].to_numpy(),
        labels = df['sentiment'].to_numpy(),
        tokenizer = tokenizer,
        max_len = max_len
    )

    return DataLoader(dataset, batch_size = batch_size, num_workers= 0)  #num_workers = 0. Probar con 1 y 2

### Entrenamiento

In [39]:
df_train, df_test = train_test_split(df, test_size=0.5, random_state=RANDOM_SEED)

train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

### Modelo

In [124]:
class ModelSentiment(nn.Module):

    def __init__(self, n_classes):
        super(ModelSentiment, self).__init__()
        self.model = BertModel.from_pretrained('../Model/beto/')
        self.drop = nn.Dropout(0.3)
        self.linear = nn.Linear(768, n_classes)   #hidden.size=768
        print(self.model.config.cross_attention_hidden_size)

    def forward(self, input_ids, attention_mask):
        cls_output = self.model(input_ids, attention_mask)
        drop_output = self.drop(cls_output.pooler_output)
        output = self.linear(drop_output)
        return output

model = ModelSentiment(NCLASES)

None


In [40]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_TRAIN)
model.dropout = nn.Dropout(0.3)
model.classifier = nn.Linear(model.config.hidden_size, 2) 


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../Model/bert-base-spanish-uncased/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Entrenamiento

In [42]:
#Entrenamiento
EPOCHES = 1
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader)  *  EPOCHES
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps= 0,
    num_training_steps= total_steps
)
loss_fn = nn.CrossEntropyLoss()


In [49]:
def train_model(model, data_loader,loss_fn,optimizer,scheduler,n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        loss = loss_fn(logits, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double()/n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn,n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)
            loss = loss_fn(logits, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    return correct_predictions.double()/n_examples, np.mean(losses)

In [47]:
def train_model(model, data_loader,loss_fn,optimizer,scheduler,n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double()/n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn,n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    return correct_predictions.double()/n_examples, np.mean(losses)


In [50]:
for epoch in range(EPOCHES):
    print('Epoch {} de {}'.format(epoch+1, EPOCHES))
    print('---------------------------')
    train_acc, train_loss = train_model(
        model, train_data_loader, loss_fn, optimizer, scheduler, len(df_train)
    )
    print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss,train_acc) )
    test_acc, test_loss = eval_model(
        model, test_data_loader, loss_fn, len(df_test)
    )
    print('Validacion: Loss: {}, accuracy: {}'.format(test_loss,test_acc) )
    # with open("../Model_Train/Albert_"+str(epoch)+".pkl", "wb") as file:
    #     pickle.dump(model, file)    

Epoch 1 de 1
---------------------------


100%|██████████| 1/1 [00:11<00:00, 11.26s/it]


Entrenamiento: Loss: 0.7211700677871704, accuracy: 0.2


100%|██████████| 1/1 [00:01<00:00,  1.73s/it]

Validacion: Loss: 0.6630756258964539, accuracy: 0.6





In [51]:
model.save_pretrained("Fine_turned")

In [61]:
model = AutoModelForSequenceClassification.from_pretrained("../Logic/Fine_turned/")

### Prediccion

In [78]:
def classifySentiment(review_text):
    encoding_review = tokenizer.encode_plus(
        review_text,
        add_special_tokens = True,
        max_length = 250,
        padding = "max_length",
        return_token_type_ids = False,
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    input_ids = encoding_review['input_ids']
    attention_mask = encoding_review['attention_mask']
    output = model(input_ids,attention_mask)
    logits = output.logits
    _, prediction = torch.max(logits,dim=1)
    print(logits)
    print(prediction)
    print("\n".join(wrap(review_text)))
    if prediction:
        print("Sentimiento predicho: * * * * *")
    else:
        print("Sentimiento predicho: *")

In [85]:
classifySentiment("excelente")

tensor([[-0.2419, -0.3447]], grad_fn=<AddmmBackward0>)
tensor([0])
excelente
Sentimiento predicho: *


### Guardar Model

In [16]:
with open("../Model_Train/Bert_Model_Hotel_COAH", "wb") as file:
    pickle.dump(model, file)

### Guardar Tokenizer

In [136]:
tokenizer.save_pretrained('../Logic/Fine_turned/')

('../Logic/Fine_turned/tokenizer_config.json',
 '../Logic/Fine_turned/special_tokens_map.json',
 '../Logic/Fine_turned/vocab.txt',
 '../Logic/Fine_turned/added_tokens.json',
 '../Logic/Fine_turned/tokenizer.json')

In [20]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# 1. Definir el tokenizador y el modelo
tokenizer = BertTokenizer.from_pretrained('../Model/bert-base-spanish-uncased/') 
model = BertForSequenceClassification.from_pretrained('../Model/bert-base-spanish-uncased/') # 2 clases: positivo/negativo

# 5. Definir la función de entrenamiento
def train_model(model, train_data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss / len(train_data_loader)

# 6. Definir la función de evaluación
def eval_model(model, test_data_loader, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in test_data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
    return total_loss / len(test_data_loader), correct_predictions / len(test_data_loader)

# 7. Ajustar el modelo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)  # Ajuste fino del modelo BERT
total_steps = len(train_data_loader) * 3 # 3 epocas
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

EPOCHES = 3

for epoch in range(EPOCHES):
    print('Epoch {} de {}'.format(epoch+1, EPOCHES))
    print('---------------------------')
    train_loss = train_model(model, train_data_loader, optimizer, scheduler, device)
    print('Entrenamiento: Loss: {}'.format(train_loss))
    test_loss, test_acc = eval_model(model, test_data_loader, device)
    print('Evaluación: Loss: {}, Accuracy: {}'.format(test_loss, test_acc))

# 8. Guardar el modelo
model_save_path = 'fine_tuned_bert_spanish'
tokenizer.save_pretrained(model_save_path)
model.save_pretrained(model_save_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../Model/bert-base-spanish-uncased/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 de 3
---------------------------
Entrenamiento: Loss: 0.7067222595214844
Evaluación: Loss: 0.6493831872940063, Accuracy: 1.0
Epoch 2 de 3
---------------------------
Entrenamiento: Loss: 0.5518473386764526
Evaluación: Loss: 0.6259074211120605, Accuracy: 1.0
Epoch 3 de 3
---------------------------
Entrenamiento: Loss: 0.4903680980205536
Evaluación: Loss: 0.6116399765014648, Accuracy: 1.0
