In [1]:
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
def read_yelp_sentiment(mode): 
    assert mode in ['train', 'dev', 'test']
    sentences = []
    labels = []
    base_dir = '/jupyter/prompt-generation/soft-Q-learning-for-text-generation/data/yelp-gpt2/raw/'
    for label_val in [0, 1]: 
        filename = f'sentiment.{mode}.{label_val}'
        filepath = os.path.join(base_dir, filename)
        with open(filepath, 'r') as fr: 
            new_sentences = [l.strip() for l in fr.readlines()]
            new_labels = [label_val for _ in new_sentences]
            sentences += new_sentences
            labels += new_labels
    return sentences, labels            

In [3]:
sentences_train, labels_train = read_yelp_sentiment('train')
sentences_dev, labels_dev = read_yelp_sentiment('dev')
sentences_test, labels_test = read_yelp_sentiment('test')

In [4]:
len(sentences_train)

444101

In [5]:
len(sentences_dev)

63483

In [6]:
len(sentences_test)

126670

In [9]:
sentences_train_test, labels_train_test = sentences_train + sentences_test, labels_train + labels_test

In [10]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)

In [11]:
# train_encodings = tokenizer(sentences_train, truncation=True, padding=True)
train_test_encodings = tokenizer(sentences_train_test, truncation=True, padding=True)
dev_encodings = tokenizer(sentences_dev, truncation=True, padding=True)
# test_encodings = tokenizer(sentences_test, truncation=True, padding=True)

In [12]:
class YelpDataset(Dataset): 
    def __init__(self, encodings, labels): 
        # assert len(sentences) == len(labels)
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self): 
        return len(self.labels)
    
    def __getitem__(self, idx): 
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [13]:
# train_dataset = YelpDataset(train_encodings, labels_train)
train_test_dataset = YelpDataset(train_test_encodings, labels_train_test)
dev_dataset = YelpDataset(dev_encodings, labels_dev)
# test_dataset = YelpDataset(test_encodings, labels_test)

# Train and experiment with some models

In [14]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
# metric_name = "accuracy"
batch_size = 64

args = TrainingArguments(
    "./results-bert-base-train-test",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_steps=500,
    weight_decay=0.01,
    gradient_accumulation_steps=1,
#     logging_dir='/jupyter/runs/20210728-1144-yelp-classifier',
    logging_dir='/jupyter/runs/20211201-1652-yelp-classifier',
    logging_steps=10,
)

In [16]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [18]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=args,                  # training arguments, defined above
    compute_metrics=compute_metrics,
#     train_dataset=train_dataset,         # training dataset
#     train_dataset=test_dataset, 
    train_dataset=train_test_dataset, 
    eval_dataset=dev_dataset             # evaluation dataset
)

In [19]:
trainer.train()

***** Running training *****
  Num examples = 570771
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 22300
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmingkaid[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade




Epoch,Training Loss,Validation Loss,Accuracy
1,0.0814,0.050136,0.982515
2,0.0558,0.048613,0.984563
3,0.0086,0.053015,0.984453
4,0.0159,0.063666,0.98409
5,0.0064,0.072155,0.984279


***** Running Evaluation *****
  Num examples = 63483
  Batch size = 128
Saving model checkpoint to ./results-bert-base-train-test/checkpoint-4460
Configuration saved in ./results-bert-base-train-test/checkpoint-4460/config.json
Model weights saved in ./results-bert-base-train-test/checkpoint-4460/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 63483
  Batch size = 128
Saving model checkpoint to ./results-bert-base-train-test/checkpoint-8920
Configuration saved in ./results-bert-base-train-test/checkpoint-8920/config.json
Model weights saved in ./results-bert-base-train-test/checkpoint-8920/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 63483
  Batch size = 128
Saving model checkpoint to ./results-bert-base-train-test/checkpoint-13380
Configuration saved in ./results-bert-base-train-test/checkpoint-13380/config.json
Model weights saved in ./results-bert-base-train-test/checkpoint-13380/pytorch_model.bin
***** Running Evaluation *****
  Num examples 

TrainOutput(global_step=22300, training_loss=0.03429967642636477, metrics={'train_runtime': 8585.6371, 'train_samples_per_second': 332.399, 'train_steps_per_second': 2.597, 'total_flos': 6.15956906983698e+16, 'train_loss': 0.03429967642636477, 'epoch': 5.0})

In [13]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base', use_fast=True)

In [14]:
train_encodings = tokenizer(sentences_train, truncation=True, padding=True)
dev_encodings = tokenizer(sentences_dev, truncation=True, padding=True)
test_encodings = tokenizer(sentences_test, truncation=True, padding=True)

In [15]:
class YelpDataset(Dataset): 
    def __init__(self, encodings, labels): 
        # assert len(sentences) == len(labels)
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self): 
        return len(self.labels)
    
    def __getitem__(self, idx): 
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [16]:
train_dataset = YelpDataset(train_encodings, labels_train)
dev_dataset = YelpDataset(dev_encodings, labels_dev)
test_dataset = YelpDataset(test_encodings, labels_test)

In [17]:
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [18]:
# metric_name = "accuracy"
batch_size = 64

args = TrainingArguments(
    "./20210728-1155-results",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_steps=500,
    weight_decay=0.01,
    gradient_accumulation_steps=1,
    logging_dir='/jupyter/runs/20210728-1155-yelp-classifier',
    logging_steps=10,
)

In [20]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=args,                  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset             # evaluation dataset
)

In [21]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.077,0.059425,0.980357,53.2944,1191.176
2,0.0453,0.062406,0.981318,54.3325,1168.417
3,0.0186,0.060153,0.983318,52.6974,1204.671
4,0.0359,0.062301,0.983492,52.7198,1204.158
5,0.0225,0.071162,0.983334,52.7492,1203.488




TrainOutput(global_step=17350, training_loss=0.04341075189969897, metrics={'train_runtime': 6121.7333, 'train_samples_per_second': 2.834, 'total_flos': 4.31776276184526e+16, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': -154415104, 'init_mem_gpu_alloc_delta': 499849216, 'init_mem_cpu_peaked_delta': 154415104, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 4583424, 'train_mem_gpu_alloc_delta': 1501474304, 'train_mem_cpu_peaked_delta': 309334016, 'train_mem_gpu_peaked_delta': 1177982464})

# Try the best trained model

In [8]:
model = AutoModelForSequenceClassification.from_pretrained("./results-bert-base/checkpoint-10410/")

In [10]:
batch_size = 64
args = TrainingArguments(
    "./results",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_steps=500,
    weight_decay=0.01,
    gradient_accumulation_steps=1,
    logging_dir='/jupyter/runs/20210728-1144-yelp-classifier',
    logging_steps=10,
)

In [13]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=args,                  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset             # evaluation dataset
)

In [15]:
trainer.evaluate()



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmingkaid[0m (use `wandb login --relogin` to force relogin)


{'eval_loss': 0.055661026388406754,
 'eval_accuracy': 0.9839012018965707,
 'eval_runtime': 55.0991,
 'eval_samples_per_second': 1152.16,
 'init_mem_cpu_alloc_delta': 1674260480,
 'init_mem_gpu_alloc_delta': 439072256,
 'init_mem_cpu_peaked_delta': 93827072,
 'init_mem_gpu_peaked_delta': 0,
 'eval_mem_cpu_alloc_delta': 434532352,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 385024,
 'eval_mem_gpu_peaked_delta': 81342464}

In [16]:
trainer.evaluate(test_dataset)



{'eval_loss': 0.06177423521876335,
 'eval_accuracy': 0.9817320596826399,
 'eval_runtime': 103.5566,
 'eval_samples_per_second': 1223.196,
 'eval_mem_cpu_alloc_delta': 1658880,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 1024000,
 'eval_mem_gpu_peaked_delta': 71845888}

The test accuracy is 98.17%, which is pretty good. Let's try to make it a pipeline

In [4]:
from transformers import pipeline
from tqdm import tqdm
import numpy as np

In [5]:
classifier = pipeline('sentiment-analysis',
                      model='./results-bert-base/checkpoint-10410/',
                      tokenizer='bert-base-uncased',
                      device=1)

In [6]:
output = classifier(["Sorry but i do n't get the rave reviews for this place .",
            "the $ _num_ minimum charge to use a credit card is also annoying .",
            "they 're quite generous with the shrimp !",
                    "in my mind there are only two things I can have that my mind cannot control",
                    "I am not in a position to describe the issues that have occurred in my past. Now it is time"])
output

[{'label': 'LABEL_0', 'score': 0.9999176263809204},
 {'label': 'LABEL_0', 'score': 0.9999401569366455},
 {'label': 'LABEL_1', 'score': 0.9971464276313782},
 {'label': 'LABEL_0', 'score': 0.9991219639778137},
 {'label': 'LABEL_0', 'score': 0.9998542666435242}]

In [9]:
100 * False

0

In [7]:
type(output)

list

In [8]:
classifier(["the $ _num_ minimum charge to use a credit card is also annoying ."])

[{'label': 'LABEL_0', 'score': 0.9999401569366455}]

In [10]:
classifier(["they 're quite generous with the shrimp !"])

[{'label': 'LABEL_1', 'score': 0.9971464276313782}]

In [10]:
classifier(['Crisse & Rosemary'])

[{'label': 'LABEL_1', 'score': 0.7880922555923462}]

In [8]:
batch_size = 64
n_correct_0 = 0
n_total_0 = 0
n_predict_0 = 0

n_correct_1 = 0
n_total_1 = 0
n_predict_1 = 0
for i in tqdm(range(len(sentences_dev) // batch_size)): 
    sentences = sentences_dev[i*batch_size:(i+1)*batch_size]
    labels = np.array(labels_dev[i*batch_size:(i+1)*batch_size])
    outputs = classifier(sentences, truncation=True)
    predictions = np.array([int(o['label'][-1]) for o in outputs])
    
    is_correct = predictions == labels
    is_0 = labels == 0
    predict_0 = predictions == 0
    n_correct_0 += (is_correct & is_0).sum()
    n_predict_0 += predict_0.sum()
    n_total_0 += is_0.sum()
    n_correct_1 += (is_correct & ~is_0).sum()
    n_predict_1 += (~predict_0).sum()
    n_total_1 += (~is_0).sum()
    
print('Recall 0:', n_correct_0 / n_total_0)
print('Precision 0:', n_correct_0 / n_predict_0)
print('Recall 1:', n_correct_1 / n_total_1)
print('Precision 1:', n_correct_1 / n_predict_1)

100%|██████████| 991/991 [00:56<00:00, 17.53it/s]

Recall 0: 0.9759079041063375
Precision 0: 0.9835732227582632
Recall 1: 0.9891993918104126
Precision 1: 0.9841170487442297





In [10]:
dataloader = DataLoader(dev_dataset, batch_size=64, num_workers=8)

In [11]:
batch = next(iter(dataloader))

In [None]:
outputs = 

In [13]:
batch

{'input_ids': tensor([[  101,  7929,  2196,  ...,     0,     0,     0],
         [  101, 10957,  2154,  ...,     0,     0,     0],
         [  101,  1996,  3677,  ...,     0,     0,     0],
         ...,
         [  101,  2036,  1996,  ...,     0,     0,     0],
         [  101,  1045,  2572,  ...,     0,     0,     0],
         [  101,  1045,  2052,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
sentences_dev[0]

'ok never going back to this place again .'