In [1]:
!pip install transformers
!pip install comet_ml

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 14.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 48.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 49.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=78d7f35a0f

In [2]:
import comet_ml
from comet_ml import Experiment
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torch import optim
from transformers import BertForSequenceClassification
import pandas as pd
import numpy as np

In [3]:
from transformers import AutoTokenizer
class YelpDataset(Dataset):
    """Dataset over a JSON-lines file of reviews.

    Every record must provide a ``stars`` and a ``text`` field. Items are
    returned as ``{"text": <text>, "label": <stars - 1>}``. The tokenizer is
    loaded once in the constructor and exposed as ``self.tokenizer`` so that
    a collate function can tokenize whole batches with it.
    """

    def __init__(self, file_path, pretrained_model):
        """Read the review file and load the tokenizer.

        Args:
            file_path: path of a JSON-lines file (one JSON object per line).
            pretrained_model: name or path handed to
                ``AutoTokenizer.from_pretrained``.
        """
        self.file_path = file_path
        self.data = pd.read_json(file_path, lines=True)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, model_max_length=512)
        self.len = len(self.data)

    def __len__(self):
        """Return the number of reviews read from the file."""
        return self.len

    def __getitem__(self, idx):
        """Return the review at position ``idx`` as a ``text``/``label`` dict."""
        row = self.data.iloc[idx]
        # The rating may be parsed as a float; cast it so the label is a
        # plain Python int usable as a class index.
        # NOTE(review): assumes ratings are whole numbers starting at 1 - confirm.
        label = int(row['stars']) - 1
        text = row['text']
        return {"text": text, "label": label}
def data_processing(data, tokenizer):
    """Collate dataset items into one tokenized batch.

    Args:
        data: iterable of dicts, each with a ``'text'`` and a ``'label'`` entry.
        tokenizer: callable invoked as
            ``tokenizer(texts, return_tensors='pt', padding=True, truncation=True)``
            whose result is indexable by ``'input_ids'`` and ``'attention_mask'``.

    Returns:
        Tuple ``(input_ids, attn_masks, labels)``; ``labels`` is an int64
        tensor with one entry per item.
    """
    items = list(data)  # materialise once so any iterable is read a single time
    text_batch = [item['text'] for item in items]
    label_batch = [item['label'] for item in items]

    encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
    input_ids, attn_masks = encoding['input_ids'], encoding['attention_mask']

    # Force an integer dtype: labels are used as class indices, and letting
    # torch infer the dtype would yield a float tensor for float ratings.
    labels = torch.tensor(label_batch, dtype=torch.long)
    return input_ids, attn_masks, labels

In [4]:
def val_err(model, validation_data, criterion, device=None):
    """Return the mean per-batch loss of ``model`` over ``validation_data``.

    Args:
        model: module called as ``model((input_ids, attention_mask))`` whose
            result exposes ``.logits``.
        validation_data: iterable yielding ``(input_ids, attention_mask, labels)``.
        criterion: callable ``criterion(logits, labels)`` returning the batch loss.
        device: device the batches are moved to. When omitted, the device of
            the model's first parameter is used; if the model has no
            parameters the batches are left where they are.

    Returns:
        Sum of the batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: if ``validation_data`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    was_training = model.training
    model.eval()  # evaluation mode for layers such as dropout
    total_loss = 0
    count = 0
    try:
        # No gradients are needed here; without no_grad the running sum
        # would keep the autograd graph of every batch alive.
        with torch.no_grad():
            for input_ids, attention_mask, labels in validation_data:
                if device is not None:
                    input_ids = input_ids.to(device)
                    attention_mask = attention_mask.to(device)
                    labels = labels.to(device)
                outputs = model((input_ids, attention_mask))
                total_loss += criterion(outputs.logits, labels)
                count += 1
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)
    return total_loss / count

In [6]:
def train(model, training_data, validation_data, optimizer, criterion, num_epoch, device, experiment, accumulation_steps):
    """Train ``model`` with gradient accumulation, then print the validation loss.

    Args:
        model: module called as ``model((input_ids, attention_mask))`` whose
            result exposes ``.logits``.
        training_data: iterable yielding ``(input_ids, attention_mask, labels)``.
        validation_data: iterable passed to ``val_err`` after training.
        optimizer: object providing ``step()`` and ``zero_grad()``.
        criterion: callable ``criterion(logits, labels)`` returning the batch loss.
        num_epoch: number of passes over ``training_data``.
        device: device every training batch is moved to.
        experiment: object providing a ``train()`` context manager and
            ``log_metric(name, value, step=...)``.
        accumulation_steps: number of batches whose gradients are summed
            before each optimizer step.
    """
    model.train()
    # Start from clean gradients so nothing accumulated before this call
    # leaks into the first update.
    optimizer.zero_grad()
    i = 0
    with experiment.train():
        for e in range(num_epoch):
            print('Epoch:', e)
            for batch_idx, data in enumerate(training_data):
                input_ids, attention_mask, labels = data
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                outputs = model((input_ids, attention_mask))
                loss = criterion(outputs.logits, labels)
                # Report the unscaled batch loss; only the gradient is
                # divided by accumulation_steps.
                batch_loss = loss.item()
                (loss / accumulation_steps).backward()

                if (i + 1) % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    print('Loss:', batch_loss)
                experiment.log_metric('loss', batch_loss, step=i)
                i += 1

        # Apply gradients from trailing batches that did not fill a complete
        # accumulation window; otherwise they would be silently discarded.
        if i % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        print('validating...')
        print('Val Error: ', val_err(model, validation_data, criterion))

In [7]:
class YelpModel(nn.Module):
    """Wrapper around ``BertForSequenceClassification`` taking one tuple input."""

    def __init__(self, model_params):
        """Load the pretrained classifier.

        Args:
            model_params: optional dict of overrides. Recognised keys:
                ``'pretrained_model'`` (default ``'bert-base-uncased'``) and
                ``'num_labels'`` (default ``5``). ``None`` or an empty dict
                selects the defaults.
        """
        super(YelpModel, self).__init__()
        model_params = model_params or {}
        self.model = BertForSequenceClassification.from_pretrained(
            model_params.get('pretrained_model', 'bert-base-uncased'),
            num_labels=model_params.get('num_labels', 5),
        )

    def forward(self, data):
        """Run the wrapped model on ``data = (input_ids, attention_mask)``.

        Returns whatever the wrapped model returns when called with
        ``return_dict=True``.
        """
        input_ids, attention_mask = data
        return self.model(input_ids, attention_mask=attention_mask, return_dict=True)

In [8]:
# Fraction of the dataset assigned to the training split; the rest is
# held out for validation.
TRAIN_VAL_SPLIT = 0.8

# Hyperparameters looked up by key in the setup and training cells.
hparams = dict(
    batch_size=5,
    learning_rate=1e-6,
    num_epoch=1,
    accumulation_steps=100,
)

# Passed to the YelpModel constructor; currently empty.
model_params = dict()

In [9]:
import random

# The Comet API key is read from the COMET_API_KEY environment variable
# instead of being stored in the notebook.
# NOTE(review): a key was previously committed on this line; treat it as
# leaked and rotate it in the Comet account settings.
experiment = Experiment(api_key=os.environ.get("COMET_API_KEY"), project_name="cs182-nlp")
experiment.set_name(str(random.randint(0,1000)))

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/peter2000623/cs182-nlp/be2400c63fd34427848697f0783a19c5



In [10]:
# Use the first CUDA device when CUDA is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [11]:
# Load the reviews together with the tokenizer used to build batches.
dataset = YelpDataset("yelp_review_training_dataset.jsonl", "sentence-transformers/bert-base-nli-stsb-mean-tokens")
len_dataset = len(dataset)

# Compute the split sizes once; the validation split takes the remainder so
# the two sizes always add up to the dataset length.
n_train = int(TRAIN_VAL_SPLIT * len_dataset)
n_val = len_dataset - n_train

# Seed the split so a fresh kernel reproduces the same train/validation
# partition.
SPLIT_SEED = 0
train_dataset, val_dataset = random_split(
    dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are reshuffled on every pass; validation order is fixed.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=hparams['batch_size'],
    shuffle=True,
    collate_fn=lambda x: data_processing(x, dataset.tokenizer),
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=hparams['batch_size'],
    collate_fn=lambda x: data_processing(x, dataset.tokenizer),
)
print("data loaded...")

model = YelpModel(model_params).to(device)
optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
loss = nn.CrossEntropyLoss().to(device)

ValueError: ignored

In [12]:
# Launch the training run with the objects and hyperparameters prepared in
# the cells above; arguments are spelled out by name for readability.
print("training...")
train(
    model=model,
    training_data=train_dataloader,
    validation_data=val_dataloader,
    optimizer=optimizer,
    criterion=loss,
    num_epoch=hparams['num_epoch'],
    device=device,
    experiment=experiment,
    accumulation_steps=hparams['accumulation_steps'],
)

training...


NameError: ignored