In [2]:
!pip install transformers



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split

import os

In [4]:
PRETRAINED_BERT_MODEL = "bert-base-cased" 
RANDOM_SEED = 42
MAX_LEN = 160
BATCH_SIZE = 16
DROPOUT = 0.3
NCLASES = 3
NHIDDENS = 768
EPOCHS = 10

In [5]:
df = pd.read_csv('/content/drive/MyDrive/GPUs/Bert/DataSet/GooglePlay_Review/reviews.csv')

def sentiment(score):
  
  if score <= 2:
    return 0
  elif score == 3: return 1
  else: return 2

df['sentiment'] = df.score.apply(sentiment)

df_train, df_test = train_test_split( 
  df,
  test_size=0.1,
  random_state=RANDOM_SEED
)
df_validation, df_test = train_test_split(
  df_test,
  test_size=0.5,
  random_state=RANDOM_SEED
)

In [6]:
#Init
device = ('cuda:0' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_BERT_MODEL)

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f2ae927a350>

In [7]:
class IMDBdataset(Dataset):
    
    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]

        encoding = tokenizer.encode_plus(
            review,
            max_length = self.max_len,
            truncation = True,
            add_special_tokens = True,
            return_token_type_ids = False,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )

        return {
            'review': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        } 

In [8]:
def data_loader(df, tokenizer, max_len, batch_size):
    dataset = IMDBdataset(
        reviews = df.content.to_numpy(),
        labels = df.sentiment.to_numpy(),
        tokenizer = tokenizer,
        max_len = MAX_LEN
    )

    return DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=2)

train_dataloader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_dataloader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
validation_dataloader = data_loader(df_validation, tokenizer, MAX_LEN, BATCH_SIZE)

In [9]:
class BERTArticleClassificator(nn.Module):

    def __init__(self, numClases):
        super(BERTArticleClassificator, self).__init__()
        self.bert = BertModel.from_pretrained(PRETRAINED_BERT_MODEL)
        self.drop = nn.Dropout(p=DROPOUT)
        self.linear = nn.Linear(self.bert.config.hidden_size, NCLASES)
    
    def forward(self, input_ids, attention_mask):

        outputs, cls_output = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            return_dict = False
        )
        
        drop_out = self.drop(cls_output)
        output = self.linear(drop_out)

        return output

model = BERTArticleClassificator(NCLASES)
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [11]:
#TRAINING
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  model = model.train()
  losses = []
  correct_predictions = 0
  
  for batch in data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    labels = batch['label'].to(device)
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    _, preds = torch.max(output, dim=1)
    loss = loss_fn(output, labels)
    correct_predictions += torch.sum(preds == labels)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  
  return correct_predictions.double()/n_examples, np.mean(losses)

In [12]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      output = model(input_ids=input_ids, attention_mask=attention_mask)
      _, preds = torch.max(output, dim=1)
      loss = loss_fn(output, labels)
      correct_predictions += torch.sum(preds == labels)
      losses.append(loss.item())
    
  return correct_predictions.double()/n_examples, np.mean(losses)

In [13]:
results = {"TRAIN_ACC": [],
           "VALIDATION_ACC": [],
           "TRAINING_LOSS": [],
           "VALIDATION_LOSS": []
           }
           
for epoch in range(EPOCHS):
    print('Epoch {} de {}'.format(epoch+1, EPOCHS))
    print('-'*10)
    train_acc, train_loss = train_model(
          model, train_dataloader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    
    test_acc, test_loss = eval_model(
          model, validation_dataloader, loss_fn, device, len(df_test)
    )
    
    print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
    print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
    print('')

    results["TRAIN_ACC"] += [train_acc]
    results["VALIDATION_ACC"] += [test_acc]
    results["TRAINING_LOSS"] += [train_loss]
    results["VALIDATION_LOSS"] += [test_loss]

Epoch 1 de 10
----------
Entrenamiento: Loss: 0.7366054549174169, accuracy: 0.6683367440547597
Validación: Loss: 0.6127868866920472, accuracy: 0.7563451776649746

Epoch 2 de 10
----------
Entrenamiento: Loss: 0.4322514268180721, accuracy: 0.8348034718791899
Validación: Loss: 0.49650893852114675, accuracy: 0.8375634517766497

Epoch 3 de 10
----------
Entrenamiento: Loss: 0.23880040206164374, accuracy: 0.9232940512313881
Validación: Loss: 0.6377822248637677, accuracy: 0.8515228426395939

Epoch 4 de 10
----------
Entrenamiento: Loss: 0.1685613084537961, accuracy: 0.9529320443158563
Validación: Loss: 0.6721343879401683, accuracy: 0.8667512690355329

Epoch 5 de 10
----------
Entrenamiento: Loss: 0.12178720151267118, accuracy: 0.966198574553666
Validación: Loss: 0.6787563682161272, accuracy: 0.8705583756345177

Epoch 6 de 10
----------
Entrenamiento: Loss: 0.09248290180281733, accuracy: 0.9765013054830287
Validación: Loss: 0.7001340850954876, accuracy: 0.8756345177664974

Epoch 7 de 10
-----

In [14]:
res = pd.DataFrame(results)
pd.to_csv("/content/drive/My Drive/save.csv")

AttributeError: ignored

In [None]:
pd.s