In [1]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from torch import optim
from transformers import BertForSequenceClassification
import pandas as pd
import numpy as np

In [2]:
from transformers import AutoTokenizer
class YelpDataset(Dataset):
    def __init__(self, file_path, pretrained_model):
        self.file_path = file_path 
        self.data = pd.read_json(file_path,lines=True)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.len = len(self.data)
    
    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        label = row['stars']-1
        text = row['text']
        return {"text":text, "label":label}
def data_processing(data, tokenizer):
    text_batch = []
    label_batch = []
    for d in data:
        text_batch.append(d['text'])
        label_batch.append(d['label'])
    encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
    input_ids, attn_masks = encoding['input_ids'], encoding['attention_mask']
    labels = torch.tensor(label_batch)
    return input_ids, attn_masks, labels

In [3]:
def train(model, training_data, optimizer, loss, num_epoch):
    for e in range(num_epoch):
        print('Epoch:', e)
        for batch_idx, data in enumerate(training_data):
            print('Batch:', batch_idx)
            input_ids, attention_mask, labels = data

            optimizer.zero_grad()
            outputs = model(data)
            loss = F.cross_entropy(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            print('Loss:',loss)

In [4]:
class YelpModel(nn.Module):
    def __init__(self, model_params):
        super(YelpModel, self).__init__()
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        self.model.train()

    def forward(self, data):
        input_ids, attention_mask, labels = data
        return self.model(input_ids, attention_mask = attention_mask, return_dict=True)

In [5]:
TRAIN_VAL_SPLIT = 0.8
hparams = {
    "batch_size":20,
    "learning_rate":1e-4,
    "num_epoch":1
}
model_params = {}


dataset = YelpDataset("yelp_review_training_dataset.jsonl", "sentence-transformers/bert-base-nli-stsb-mean-tokens")
len_dataset = len(dataset)
train_dataset, val_dataset = random_split(dataset, [int(TRAIN_VAL_SPLIT*len_dataset), len_dataset-(int(TRAIN_VAL_SPLIT*len_dataset))])
train_dataloader = DataLoader(train_dataset, batch_size=hparams['batch_size'], collate_fn=lambda x: data_processing(x, dataset.tokenizer))
val_dataloader = DataLoader(val_dataset, batch_size=hparams['batch_size'], collate_fn=lambda x: data_processing(x, dataset.tokenizer))
print("data loaded...")

model = YelpModel(model_params)
optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
loss = nn.CrossEntropyLoss()



HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=461.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))


data loaded...


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
print("training...")
train(model, train_dataloader, optimizer, loss, hparams['num_epoch'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


training...
Epoch: 0
Batch: 0


IndexError: Target 4 is out of bounds.