In [1]:
# Install the Hugging Face `transformers` package, which supplies the
# BertTokenizer / BertModel classes imported further down.
# NOTE(review): the version is unpinned, so a fresh run may pull a release whose
# API differs from the one these cells were written against — consider pinning.
!pip install -qq transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## 1. Load Yelp Polarity Review Dataset and Preprocess

In [2]:
from google.colab import drive
# Mount Google Drive so the files referenced below under
# /content/drive/MyDrive/... are reachable from this runtime.
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import pandas as pd

# Both files are read with header=None (no header row is expected) and the two
# columns are named explicitly: the label first, then the review text.
yelp_csv_layout = {"header": None, "names": ["sentiment", "text"]}

train_data_big = pd.read_csv(
    "/content/drive/MyDrive/Colab_Notebooks/Yelp/train.csv", **yelp_csv_layout
)
test_data_big = pd.read_csv(
    "/content/drive/MyDrive/Colab_Notebooks/Yelp/test.csv", **yelp_csv_layout
)

In [4]:
def get_smaller_df(df, size):
    """Return a smaller frame with up to ``size`` rows per sentiment class.

    Args:
        df: DataFrame with a ``sentiment`` column whose values are 1
            (negative) or 2 (positive).
        size: Maximum number of rows to keep for each of the two labels.

    Returns:
        A new DataFrame holding the first ``size`` rows labelled 1 followed by
        the first ``size`` rows labelled 2. Rows keep their original index
        values; rows with any other label are dropped.
    """
    per_class = [df.loc[df["sentiment"] == label].head(size) for label in (1, 2)]
    return pd.concat(per_class)

# Keep the first 7000 reviews per class for training and the first 3000 per
# class for testing (at most 14000 and 6000 rows respectively).
train_data = get_smaller_df(df=train_data_big, size=7000)
test_data = get_smaller_df(df=test_data_big, size=3000)

### Use BERT Pretrained Model to Preprocess

In [5]:
import transformers
from transformers import BertTokenizer

In [6]:
# Checkpoint name shared by the tokenizer and by every BertModel loaded below.
PRE_TRAINED_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

#### Test Tokenizer

In [7]:
# Encode a sentence pair, then map the ids back to tokens so the tokenizer's
# output can be inspected by eye.
sentence1 = "When was I last outside?"
sentence2 = "I am stuck at home for 2 weeks."
encoded_dict = tokenizer(sentence1, sentence2)
token_ids = encoded_dict["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(token_ids)
# NOTE: the two sentences are printed back-to-back with no separator.
print(f" Sentence: {sentence1}{sentence2}")
print(f"   Tokens: {tokens}")
print(f"Token IDs: {token_ids}")

 Sentence: When was I last outside?I am stuck at home for 2 weeks.
   Tokens: ['[CLS]', 'When', 'was', 'I', 'last', 'outside', '?', '[SEP]', 'I', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks', '.', '[SEP]']
Token IDs: [101, 1332, 1108, 146, 1314, 1796, 136, 102, 146, 1821, 5342, 1120, 1313, 1111, 123, 2277, 119, 102]


In [8]:
# Encode the same pair again, this time padded/truncated to a fixed length of
# 32 and returned as PyTorch tensors. `encoding` is reused further down as the
# input for the bare BertModel smoke test.
encoding = tokenizer.encode_plus(sentence1, sentence2, max_length=32, padding="max_length",
                                 truncation=True, return_tensors="pt")
encoding.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
# Show the first (only) encoded example: token ids, segment ids, padding mask.
for field in ("input_ids", "token_type_ids", "attention_mask"):
    print(encoding[field][0])

tensor([ 101, 1332, 1108,  146, 1314, 1796,  136,  102,  146, 1821, 5342, 1120,
        1313, 1111,  123, 2277,  119,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


#### Choose Sequence Length

In [10]:
# token_lens = []
# for text in train_data.text:
#     tokens = tokenizer.encode(text, max_length=1024, truncation=True)
#     token_lens.append(len(tokens))

In [11]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set_theme(style="whitegrid")

# sns.displot(token_lens)
# plt.xlim([0, 512])
# plt.xlabel("Token Count")

In [12]:
# Every review is padded or truncated to MAX_LEN tokens by the dataset below.
# NOTE(review): 512 is presumably BERT's maximum input length — confirm.
MAX_LEN = 512
# Mini-batch size used for both the train and the test DataLoader.
BATCH_SIZE = 16

## 2. Create Dataloader

In [13]:
import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import weight_norm
import torch.optim.lr_scheduler as lr_scheduler

import numpy as np
import warnings
from tqdm import tqdm

# Silence every Python warning from here on.
warnings.filterwarnings("ignore") 
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [14]:
def make_target(label):
    """Convert a raw Yelp polarity label into a class-index tensor.

    Args:
        label: Raw dataset label. ``1`` is the negative class; any other value
            is treated as the positive class.

    Returns:
        A ``torch.long`` tensor of shape ``(1,)`` containing ``0`` when
        ``label == 1`` and ``1`` otherwise. The tensor lives on the CPU.
    """
    # Built on the CPU on purpose: this runs inside Dataset.__getitem__, and
    # samples produced there should stay device-agnostic (CUDA tensors break
    # DataLoader worker processes and pinned-memory collation). The training
    # and evaluation loops already move the batched targets with .to(device).
    return torch.tensor([0 if label == 1 else 1], dtype=torch.long)

In [15]:
class YelpReviewDataset(Dataset):
    """Dataset that tokenizes one review at a time on access.

    Args:
        reviews: Indexable collection of review texts.
        targets: Indexable collection of raw labels, aligned with ``reviews``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every review is padded or truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        super().__init__()
        self.reviews, self.targets = reviews, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the raw text, label tensor and encoded tensors for one review."""
        text = self.reviews[item]
        label_tensor = make_target(self.targets[item])
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        sample = {"review": text, "target": label_tensor}
        # flatten() collapses the tokenizer output to 1-D before it is returned.
        for key in ("attention_mask", "input_ids"):
            sample[key] = encoded[key].flatten()
        return sample



In [16]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True):
    """Wrap a review DataFrame in a batched ``DataLoader``.

    Args:
        df: DataFrame with ``text`` and ``sentiment`` columns.
        tokenizer: Tokenizer handed to ``YelpReviewDataset``.
        max_len: Length every review is padded or truncated to.
        batch_size: Number of reviews per batch.
        shuffle: Whether to reshuffle the data each epoch. Defaults to ``True``
            (the previous hard-coded behaviour); pass ``False`` for
            evaluation loaders that should keep a stable order.

    Returns:
        A ``DataLoader`` over a ``YelpReviewDataset``, loading in the main
        process (``num_workers=0``).
    """
    # to_numpy() drops the pandas index, so the dataset is indexed by position
    # even though the subsampled frames keep their original row labels.
    dataset = YelpReviewDataset(
        reviews=df["text"].to_numpy(),
        targets=df["sentiment"].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0)

In [17]:
# Both loaders share the tokenizer, sequence length and batch size.
# Note: create_data_loader shuffles by default, so the test loader is shuffled too.
train_data_loader = create_data_loader(train_data, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_data, tokenizer, MAX_LEN, BATCH_SIZE)

In [18]:
# Pull a single batch and check the shapes of the collated tensors.
sample_data = next(iter(train_data_loader))
for key in ("input_ids", "attention_mask", "target"):
    print(sample_data[key].shape)

torch.Size([16, 512])
torch.Size([16, 512])
torch.Size([16, 1])


## 3. Define and Train A BERT Classification Model

In [19]:
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup

#### BERT Model Test

In [20]:
# Load the bare encoder once to inspect its outputs; the classifier defined
# below loads its own copy of the same checkpoint.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Run the 32-token example pair encoded earlier through the bare encoder.
outputs = bert_model(**encoding)

In [22]:
# Index 0 is treated as the per-token hidden states and index 1 as the pooled
# sentence-level vector; the classifier head below consumes index 1.
last_hidden_state = outputs[0]
pooled_output = outputs[1]

print("last hidden state shape: ", last_hidden_state.shape)
print("    pooled output shape: ", pooled_output.shape)
print("      model hidden size: ", bert_model.config.hidden_size)

last hidden state shape:  torch.Size([1, 32, 768])
    pooled output shape:  torch.Size([1, 768])
      model hidden size:  768


#### Define Model

In [23]:
class BERTClassification(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output classes (size of the final layer).
    """

    def __init__(self, num_classes):
        super().__init__()
        # The attribute names (bert / drop / fc) are part of any saved
        # checkpoint of this model, so they must stay stable.
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, **inputs):
        """Return unnormalised class scores for the given encoder inputs.

        ``inputs`` is forwarded verbatim to the BERT encoder (the callers in
        this notebook pass ``input_ids`` and ``attention_mask``).
        """
        # Element 1 of the encoder output is the pooled sentence representation.
        pooled = self.bert(**inputs)[1]
        return self.fc(self.drop(pooled))

In [24]:
# test

# model = BERTClassification(num_classes = 2)
# model = model.to(device)
# inputs = {}
# inputs["input_ids"] = sample_data["input_ids"].to(device)
# inputs["attention_mask"] = sample_data["attention_mask"].to(device)

In [25]:
# outputs = model(**inputs)
# F.softmax(outputs, dim = 1)

#### Train and Eval

In [26]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler, device, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Yields batches with ``input_ids``, ``attention_mask`` and
            ``target`` entries.
        loss_fn: Callable taking ``(outputs, targets)`` and returning the loss.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        n_examples: Divisor used to turn the correct-prediction count into an
            accuracy.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is the number of correct
        predictions divided by ``n_examples`` and mean_loss is the average of
        the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0
    batches = tqdm(enumerate(data_loader), total=len(data_loader))
    for _, batch in batches:
        model_inputs = {
            name: batch[name].to(device) for name in ("input_ids", "attention_mask")
        }
        # Targets arrive with a trailing singleton dimension; drop it for the loss.
        labels = batch["target"].squeeze(dim=1).to(device)

        logits = model(**model_inputs)
        predictions = torch.max(logits, dim=1)[1]
        loss = loss_fn(logits, labels)

        n_correct += (predictions == labels).sum()
        loss_value = loss.item()
        batch_losses.append(loss_value)

        # Backprop, clip the gradient norm to 1.0, then advance the optimizer
        # and the schedule before clearing gradients for the next batch.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        batches.set_description(f"loss:{loss_value:.4f}")

    return n_correct.double() / n_examples, np.mean(batch_losses)

In [27]:
# Number of passes over the training loader.
NUM_EPOCHES = 5

# Binary classifier (negative / positive) placed on the selected device.
model = BERTClassification(num_classes = 2)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr = 2e-5, correct_bias = False)
# The scheduler is stepped once per batch in train_epoch, so its horizon is
# (batches per epoch) x (epochs); no warmup steps are used.
total_steps = len(train_data_loader) * NUM_EPOCHES
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps = total_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

# CSV the (currently commented-out) training cell writes per-epoch losses to;
# the final plotting cell reads it back.
loss_file_name = "/content/drive/MyDrive/Colab_Notebooks/Yelp/models/" +  "plots/" + "bert_base_cased_model_loss.csv"

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
# f = open(loss_file_name,'w')
# f.write('iter, loss')
# f.write('\n')

# for epoch in range(NUM_EPOCHES):
#     print(f'Epoch {epoch+1}/{NUM_EPOCHES}')
#     print('-' * 10)

#     train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, scheduler, device, len(train_data))
#     print('\n')
#     print(f'Train loss: {train_loss}  Accuracy: {train_acc}')

#     f.write(str(epoch+1) + "," + str(train_loss))
#     f.write('\n')

# torch.save(model, "/content/drive/MyDrive/Colab_Notebooks/Yelp/models/" + "bert_base_cased_model.pth")
# f.close()

In [29]:
from sklearn.metrics import classification_report

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` and print a classification report.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one score per class.
        data_loader: Yields batches with ``input_ids``, ``attention_mask`` and
            ``target`` entries.
        device: Device the batch tensors are moved to.

    Returns:
        None. The scikit-learn classification report is printed.
    """
    model = model.eval()
    progress_bar = tqdm(data_loader, total=len(data_loader))
    all_targets = []
    all_predictions = []
    # Evaluation never calls backward(), so disable autograd: without this every
    # forward pass keeps the full graph and activations alive, which costs a lot
    # of memory and time for 512-token BERT batches.
    with torch.no_grad():
        for data in progress_bar:
            inputs = {
                "input_ids": data["input_ids"].to(device),
                "attention_mask": data["attention_mask"].to(device),
            }
            # Targets arrive with a trailing singleton dimension; drop it.
            targets = data["target"].squeeze(dim=1).to(device)
            # The model returns raw scores (logits); the highest one is the
            # predicted class.
            logits = model(**inputs)
            _, preds = torch.max(logits, dim=1)
            all_targets.extend(targets.cpu().tolist())
            all_predictions.extend(preds.cpu().tolist())
    print(classification_report(all_targets, all_predictions))

In [None]:
# Load the fine-tuned model saved by the (commented-out) training cell and
# score it on the test loader.
# NOTE(review): that cell saves the whole module with torch.save(model, ...),
# so this torch.load unpickles arbitrary objects — only load files you trust.
# It presumably also needs BERTClassification to be defined in this session
# before loading — confirm.
e_model = torch.load("/content/drive/MyDrive/Colab_Notebooks/Yelp/models/" + "bert_base_cased_model.pth", map_location=torch.device(device))

eval_model(e_model, test_data_loader, device)

  0%|          | 1/375 [00:44<4:34:49, 44.09s/it]

In [None]:
# Read the per-epoch loss log; header=0 discards the file's own header row and
# the columns are renamed to "iter" and "loss".
loss_df = pd.read_csv(loss_file_name, header = 0, names = ["iter", "loss"])
# pd.to_numeric returns a new Series rather than converting in place, so the
# result has to be assigned back for the conversion to take effect.
loss_df["loss"] = pd.to_numeric(loss_df["loss"])
plt_bert_model = loss_df['loss'].plot()