In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 27.4MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 38.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split

import os

In [3]:
# Location of the app-review CSV.
# NOTE(review): the path lives under /content/drive, but no cell mounting Google
# Drive is visible in this notebook — confirm it is mounted before running this cell.
FILE_PATH = "/content/drive/MyDrive/GPUs/Bert/DataSet/App_Review/App_reviews.csv"
# Load the raw reviews into a DataFrame.
df = pd.read_csv(FILE_PATH)

In [4]:
def sentiment(score):
    """Map a review score to a binary label.

    Args:
        score: Numeric review score.

    Returns:
        0 when ``score`` is less than or equal to 4, otherwise 1.
    """
    return 0 if score <= 4 else 1
  
# Derive the binary target column by applying `sentiment` to every review score.
df['sentiment'] = df.score.apply(sentiment)



In [23]:
# Constant definitions
PRETRAINED_BERT_MODEL = "bert-base-uncased"  # checkpoint name passed to from_pretrained
RANDOM_SEED = 42  # seed used for the numpy / torch RNGs
MAX_LEN = 100  # token length every review is truncated / padded to
BATCH_SIZE = 32  # examples per DataLoader batch
DROPOUT = 0.5  # dropout probability applied before the classification layer
NCLASES = 2  # number of output classes
NHIDDENS = 768  # NOTE(review): not referenced in the visible cells; the model reads hidden_size from the BERT config
EPOCHS = 5  # number of training epochs

In [6]:
# Split data: 80% train; the remaining 20% hold-out is halved into validation and test.
# RANDOM_SEED (instead of a repeated literal) keeps every source of randomness tied
# to the single seed defined with the other constants.
df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_validation, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

print(f"Training dataset : {df_train.shape}")
print(f"Test dataset : {df_test.shape}")
print(f"Validation dataset : {df_validation.shape}")

Training dataset : (14400, 13)
Test dataset : (1800, 13)
Validation dataset : (1800, 13)


In [9]:
# Initialization: pick the compute device, load the tokenizer and seed the RNGs.
device = ('cuda:0' if torch.cuda.is_available() else 'cpu')  # first GPU when available, otherwise CPU
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_BERT_MODEL)

# Seed numpy and torch with the shared seed.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f52c22d48d0>

In [11]:
class IMDBdataset(Dataset):
    """Torch ``Dataset`` that tokenizes one review per item.

    Args:
        reviews: Indexable sequence of review texts (each item is passed through ``str``).
        labels: Indexable sequence of integer class labels aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Token length every review is truncated / padded to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, encoded tensors and label for position ``item``."""
        review = str(self.reviews[item])
        label = self.labels[item]

        # Use the tokenizer handed to this instance; relying on a notebook-level
        # global would silently ignore the constructor argument.
        encoding = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'review': review,
            # flatten() collapses the tensors returned by the tokenizer to 1-D
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [14]:
def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
    """Wrap a review DataFrame in a ``DataLoader`` of tokenized batches.

    Args:
        df: DataFrame with a ``content`` (text) and a ``sentiment`` (label) column.
        tokenizer: Tokenizer forwarded to ``IMDBdataset``.
        max_len: Token length each review is truncated / padded to.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data every epoch. Defaults to
            ``False``, which is what ``DataLoader`` does when the flag is omitted.
        num_workers: Number of loader worker processes.

    Returns:
        A ``DataLoader`` over an ``IMDBdataset`` built from ``df``.
    """
    dataset = IMDBdataset(
        reviews=df.content.to_numpy(),
        labels=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        # Honour the arguments instead of the notebook-level MAX_LEN / BATCH_SIZE
        max_len=max_len,
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )


In [18]:
# Build one DataLoader per split, sharing the tokenizer, sequence length and batch size.
train_dataloader, test_dataloader, validation_dataloader = (
    data_loader(split, tokenizer, MAX_LEN, BATCH_SIZE)
    for split in (df_train, df_test, df_validation)
)

In [22]:
class BERTArticleClassificator(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification layer.

    Args:
        numClases: Number of output classes (size of the final linear layer).
    """

    def __init__(self, numClases):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRETRAINED_BERT_MODEL)
        self.drop = nn.Dropout(p=DROPOUT)
        # Size the head from the constructor argument rather than the global NCLASES
        self.linear = nn.Linear(self.bert.config.hidden_size, numClases)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalized class scores for a batch of encoded reviews."""
        # return_dict=False makes BERT return a tuple; only its second element
        # (named cls_output here) feeds the classification layer.
        _, cls_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )

        return self.linear(self.drop(cls_output))

# Instantiate the classifier and move it to the selected device in one step.
model = BERTArticleClassificator(NCLASES).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# AdamW over every model parameter (BERT encoder and the classification layer).
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The training function steps the scheduler once per batch, so the schedule
# spans (batches per epoch) * EPOCHS steps.
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # no warmup phase
    num_training_steps=total_steps
)

# Cross-entropy loss applied to the model's output scores.
loss_fn = nn.CrossEntropyLoss().to(device)

In [26]:
#TRAINING
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keywords.
        data_loader: Iterable of batches; each is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        loss_fn: Callable returning the loss for ``(scores, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used to turn the correct count into an accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 tensor, mean_loss
        is the numpy mean of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    correct_predictions = 0

    for batch in data_loader:
        ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        scores = model(input_ids=ids, attention_mask=mask)
        # Index of the highest score per row is the predicted class
        predicted = torch.max(scores, dim=1)[1]
        batch_loss = loss_fn(scores, targets)

        correct_predictions += (predicted == targets).sum()
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        # Clip the gradient norm before the parameter update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = correct_predictions.double() / n_examples
    return accuracy, np.mean(batch_losses)


In [27]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keywords.
        data_loader: Iterable of batches; each is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        loss_fn: Callable returning the loss for ``(scores, labels)``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used to turn the correct count into an accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a float64 tensor, mean_loss
        is the numpy mean of the per-batch losses.
    """
    model = model.eval()
    batch_losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            ids, mask, targets = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
            )
            scores = model(input_ids=ids, attention_mask=mask)
            # Index of the highest score per row is the predicted class
            predicted = torch.max(scores, dim=1)[1]
            batch_loss = loss_fn(scores, targets)

            correct_predictions += (predicted == targets).sum()
            batch_losses.append(batch_loss.item())

    accuracy = correct_predictions.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [28]:
# Train for EPOCHS epochs, reporting training and validation metrics after each one.
for epoch in range(EPOCHS):
    print('Epoch {} de {}'.format(epoch+1, EPOCHS))
    print('-'*10)
    train_acc, train_loss = train_model(
          model, train_dataloader, loss_fn, optimizer, device, scheduler, len(df_train)
    )

    # Accuracy must be normalized by the size of the split actually evaluated
    # (the validation set), not by the size of the test set.
    # NOTE: test_acc / test_loss hold *validation* metrics; the names are kept
    # so any code that reads them keeps working.
    test_acc, test_loss = eval_model(
          model, validation_dataloader, loss_fn, device, len(df_validation)
    )

    print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
    print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
    print('')

Epoch 1 de 5
----------
Entrenamiento: Loss: 0.49328311771154404, accuracy: 0.7840972222222222
Validación: Loss: 0.3784444000114474, accuracy: 0.8516666666666667

Epoch 2 de 5
----------
Entrenamiento: Loss: 0.3239939915637175, accuracy: 0.8715972222222222
Validación: Loss: 0.3521879568957446, accuracy: 0.8683333333333333

Epoch 3 de 5
----------
Entrenamiento: Loss: 0.26900113101634715, accuracy: 0.8915277777777778
Validación: Loss: 0.35466724934807997, accuracy: 0.8788888888888889

Epoch 4 de 5
----------
Entrenamiento: Loss: 0.24628989376955562, accuracy: 0.8990277777777778
Validación: Loss: 0.3699773940862271, accuracy: 0.8766666666666667

Epoch 5 de 5
----------
Entrenamiento: Loss: 0.2349013740817706, accuracy: 0.9019444444444444
Validación: Loss: 0.3716545925851454, accuracy: 0.8777777777777778

