<a href="https://colab.research.google.com/github/mukkatharun/advance-deep-learning-assignments/blob/main/Assignment4_Meta%20Learning%20and%20Continous%20learning/metalearning_on_top_of_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

In [2]:
# Let's inspect the data
import json
from random import shuffle
from urllib.request import urlopen
from collections import Counter


In [3]:
# Download the review dataset and parse it as JSON. The context manager closes
# the HTTP response even when reading or decoding fails.
DATASET_URL = 'https://raw.githubusercontent.com/mailong25/meta-learning-bert/master/dataset.json'
with urlopen(DATASET_URL) as response:
    reviews = json.loads(response.read())

# Preview the first five records.
reviews[:5]

[{'text': "GOOD LOOKING KICKS IF YOUR KICKIN IT OLD SCHOOL LIKE ME. AND COMFORTABLE. AND RELATIVELY CHEAP. I'LL ALWAYS KEEP A PAIR OF STAN SMITH'S AROUND FOR WEEKENDS",
  'label': 'positive',
  'domain': 'apparel'},
 {'text': 'These sunglasses are all right. They were a little crooked, but still cool..',
  'label': 'positive',
  'domain': 'apparel'},
 {'text': "I don't see the difference between these bodysuits and the more expensive ones. Fits my boy just right",
  'label': 'positive',
  'domain': 'apparel'},
 {'text': 'Very nice basic clothing. I think the size is fine. I really like being able to find these shades of green, though I have decided the lighter shade is really a feminine color. This is the only brand that I can find these muted greens',
  'label': 'positive',
  'domain': 'apparel'},
 {'text': 'I love these socks. They fit great (my 15 month old daughter has thick ankles) and she can zoom around on the kitchen floor and not take a nose dive into things. :',
  'label': 'p

In [4]:
# Collect the domain of every review, then tally how many reviews each domain has.
mention_domain = list(map(lambda review: review['domain'], reviews))
Counter(mention_domain)

Counter({'apparel': 1717,
         'baby': 1107,
         'beauty': 993,
         'books': 921,
         'camera_&_photo': 1086,
         'cell_phones_&_service': 698,
         'dvd': 893,
         'electronics': 1277,
         'grocery': 1100,
         'health_&_personal_care': 1429,
         'jewelry_&_watches': 1086,
         'kitchen_&_housewares': 1390,
         'magazines': 1133,
         'music': 1007,
         'outdoor_living': 980,
         'software': 1029,
         'sports_&_outdoors': 1336,
         'toys_&_games': 1363,
         'video': 1010,
         'automotive': 100,
         'computer_&_video_games': 100,
         'office_products': 100})

Let's create the meta-learning tasks


In [5]:
import os
import torch
from torch.utils.data import Dataset
import numpy as np
import collections
import random
import json, pickle
from torch.utils.data import TensorDataset

# Two-way lookup between sentiment label names and their integer class ids.
LABEL_MAP  = {'positive':0, 'negative':1, 0:'positive', 1:'negative'}

class MetaTask(Dataset):
    """Collection of meta-learning tasks, each sampled from a single review domain.

    A task is a (support, query) pair of example lists. Indexing the dataset
    encodes both lists and returns them as two ``TensorDataset`` objects whose
    items are ``(input_ids, attention_mask, segment_ids, label_id)``.
    """

    def __init__(self, examples, num_task, k_support, k_query, tokenizer):
        """
        :param examples: list of dicts with 'text', 'label' and 'domain' keys
        :param num_task: number of tasks to sample
        :param k_support: number of support samples per task
        :param k_query: number of query samples per task
        :param tokenizer: object whose ``encode(text)`` returns a list of token ids
        """
        # Shuffle a private copy so the caller's list keeps its original order.
        self.examples = list(examples)
        random.shuffle(self.examples)

        self.num_task = num_task
        self.k_support = k_support
        self.k_query = k_query
        self.tokenizer = tokenizer
        self.max_seq_length = 128
        self.create_batch(self.num_task)

    def create_batch(self, num_task):
        """Sample ``num_task`` tasks and store them in ``self.supports`` / ``self.queries``.

        :param num_task: number of (support, query) pairs to draw
        :raises ValueError: if the chosen domain has fewer than
            ``k_support + k_query`` examples
        """
        self.supports = []  # support set of every task
        self.queries = []   # query set of every task
        task_size = self.k_support + self.k_query

        # Group the examples by domain once instead of rescanning the whole
        # list for every task; the order inside each group is unchanged.
        by_domain = collections.defaultdict(list)
        for example in self.examples:
            by_domain[example['domain']].append(example)

        for _ in range(num_task):
            # 1. pick a domain by drawing a random example, so domains are
            #    chosen in proportion to how many examples they have
            domain = random.choice(self.examples)['domain']
            domain_examples = by_domain[domain]
            if len(domain_examples) < task_size:
                raise ValueError(
                    "domain %r has %d examples, but k_support + k_query = %d are required"
                    % (domain, len(domain_examples), task_size))

            # 2. draw k_support + k_query distinct examples from that domain
            selected_examples = random.sample(domain_examples, task_size)
            random.shuffle(selected_examples)

            self.supports.append(selected_examples[:self.k_support])
            self.queries.append(selected_examples[self.k_support:])

    def create_feature_set(self, examples):
        """Encode a list of examples into fixed-length tensors.

        :param examples: list of dicts with 'text' and 'label' keys
        :return: ``TensorDataset(input_ids, attention_mask, segment_ids, label_ids)``;
            the first three tensors have shape ``(len(examples), max_seq_length)``
        """
        num_examples = len(examples)
        max_len = self.max_seq_length

        # Zero-initialised buffers double as padding: padded positions keep
        # token id 0, attention mask 0 and segment id 0.
        all_input_ids      = torch.zeros(num_examples, max_len, dtype = torch.long)
        all_attention_mask = torch.zeros(num_examples, max_len, dtype = torch.long)
        all_segment_ids    = torch.zeros(num_examples, max_len, dtype = torch.long)
        all_label_ids      = torch.zeros(num_examples, dtype = torch.long)

        for row, example in enumerate(examples):
            # Use the tokenizer handed to the constructor, not a global one.
            input_ids = list(self.tokenizer.encode(example['text']))

            if len(input_ids) > max_len:
                # Truncate over-long texts but keep the final token, so the
                # sequence still ends with whatever closing token the
                # tokenizer appended.
                input_ids = input_ids[:max_len - 1] + input_ids[-1:]

            seq_len = len(input_ids)
            all_input_ids[row, :seq_len] = torch.tensor(input_ids, dtype = torch.long)
            all_attention_mask[row, :seq_len] = 1  # real tokens only
            all_label_ids[row] = LABEL_MAP[example['label']]

        return TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)

    def __getitem__(self, index):
        """Return the encoded ``(support_set, query_set)`` of task ``index``."""
        support_set = self.create_feature_set(self.supports[index])
        query_set   = self.create_feature_set(self.queries[index])
        return support_set, query_set

    def __len__(self):
        """Number of tasks held by this dataset."""
        return self.num_task

Split the data into meta-training and meta-testing sets


In [6]:
# Domains with very few reviews are held out as the meta-test set;
# every other domain is used for meta-training.
low_resource_domains = ["office_products", "automotive", "computer_&_video_games"]
train_examples = list(filter(lambda r: r['domain'] not in low_resource_domains, reviews))
test_examples = list(filter(lambda r: r['domain'] in low_resource_domains, reviews))
print(len(train_examples), len(test_examples))

21555 300


In [7]:
import torch
from transformers import BertModel, BertTokenizer
# Load the tokenizer that belongs to the uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True,
)

# Sample 100 tasks from the training domains, each with 100 support
# and 30 query reviews.
train = MetaTask(
    train_examples,
    num_task = 100,
    k_support = 100,
    k_query = 30,
    tokenizer = tokenizer,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Take a glance at the first two raw review dicts in the support set of the first meta-task.
train.supports[0][:2]

[{'text': 'Ordered 2 cables from DLN. They shipped quickly, keep me updated and the cables arrived well packaged. They are good quality and work well. I like that it charges the phone as you Sync up.Highly recommended - and at such a good price',
  'label': 'positive',
  'domain': 'cell_phones_&_service'},
 {'text': "The silicone protective covering for the Treo was exactly what I wanted and just what I expected it to be. It fits my Treo 650 perfectly and I couldn't be happier. I am going to buy another in a different color",
  'label': 'positive',
  'domain': 'cell_phones_&_service'}]

In [9]:
# Indexing a MetaTask encodes the task and returns a (support_set, query_set) pair of TensorDatasets.
train[0]

(<torch.utils.data.dataset.TensorDataset at 0x7ff5e8406a90>,
 <torch.utils.data.dataset.TensorDataset at 0x7ff5e835a310>)

In [10]:
# Let's take a look at the first two encoded samples of the first task's support set.
train[0][0][:2]

(tensor([[  101,  3641,  1016, 15196,  2013, 21469,  2078,  1012,  2027, 12057,
           2855,  1010,  2562,  2033,  7172,  1998,  1996, 15196,  3369,  2092,
          21972,  1012,  2027,  2024,  2204,  3737,  1998,  2147,  2092,  1012,
           1045,  2066,  2008,  2009,  5571,  1996,  3042,  2004,  2017, 26351,
           2039,  1012,  3811,  6749,  1011,  1998,  2012,  2107,  1037,  2204,
           3976,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

Meta-training setup


In [17]:
import time
import logging
# CUDA debugging switch, set through the process environment.
# NOTE(review): presumably left over from debugging; it is expected to slow
# GPU execution — confirm it is still wanted.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

# Raise the root logger's threshold so only CRITICAL records get through.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

def random_seed(value):
    """Seed every random number generator used in this notebook.

    Switches cuDNN to deterministic mode, then seeds PyTorch (CPU and CUDA),
    NumPy and Python's ``random`` module with the same ``value``.

    :param value: integer seed
    """
    torch.backends.cudnn.deterministic = True
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(value)

def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
    """Yield the tasks of ``taskset`` in lists of at most ``batch_size`` items.

    :param taskset: indexable collection of tasks supporting ``len()``
    :param is_shuffle: visit the tasks in random order when true
    :param batch_size: maximum number of tasks per yielded list
    :return: generator of lists; the last list may be shorter
    """
    order = list(range(len(taskset)))
    if is_shuffle:
        random.shuffle(order)
    for start in range(0, len(order), batch_size):
        chunk = order[start:start + batch_size]
        yield [taskset[position] for position in chunk]

class TrainingArgs:
    """Container for the hyper-parameters of the meta-learning experiment."""

    def __init__(self):
        settings = {
            'num_labels': 2,                 # number of output classes of the classifier
            'meta_epoch': 5,                 # passes of the outer training loop
            'k_spt': 80,                     # support examples per task
            'k_qry': 20,                     # query examples per task
            'outer_batch_size': 2,           # tasks per meta-update
            'inner_batch_size': 12,          # support examples per inner batch
            'outer_update_lr': 5e-5,         # learning rate of the outer optimizer
            'inner_update_lr': 5e-5,         # learning rate of the per-task optimizer
            'inner_update_step': 10,         # support-set passes per task when training
            'inner_update_step_eval': 40,    # support-set passes per task when evaluating
            'bert_model': 'bert-base-uncased',  # pretrained checkpoint name
            'num_task_train': 10,            # NOTE(review): not read by the visible code
            'num_task_test': 3,              # NOTE(review): not read by the visible code
        }
        for name, value in settings.items():
            setattr(self, name, value)

args = TrainingArgs()

Create Meta Learner

In [18]:
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from transformers import BertForSequenceClassification
from copy import deepcopy
import gc
from sklearn.metrics import accuracy_score
import torch
import numpy as np

class Learner(nn.Module):
    """
    Meta Learner

    Holds one meta-model. For every task a deep copy of it is fine-tuned on
    the support set and scored on the query set. In training mode the
    difference between the meta-weights and the fine-tuned weights, averaged
    over the tasks of the batch, is handed to the outer optimizer as the
    gradient of the meta-model.
    """
    def __init__(self, args):
        """
        :param args: object exposing ``num_labels``, ``outer_batch_size``,
            ``inner_batch_size``, ``outer_update_lr``, ``inner_update_lr``,
            ``inner_update_step``, ``inner_update_step_eval`` and ``bert_model``
        """
        super(Learner, self).__init__()

        self.num_labels = args.num_labels
        self.outer_batch_size = args.outer_batch_size
        self.inner_batch_size = args.inner_batch_size
        self.outer_update_lr  = args.outer_update_lr
        self.inner_update_lr  = args.inner_update_lr
        self.inner_update_step = args.inner_update_step
        self.inner_update_step_eval = args.inner_update_step_eval
        self.bert_model = args.bert_model
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.model = BertForSequenceClassification.from_pretrained(self.bert_model, num_labels = self.num_labels)
        self.outer_optimizer = Adam(self.model.parameters(), lr=self.outer_update_lr)
        self.model.train()

    def _adapt(self, support, num_steps):
        """Fine-tune a copy of the meta-model on one support set and return the copy."""
        fast_model = deepcopy(self.model)
        fast_model.to(self.device)
        support_dataloader = DataLoader(support, sampler=RandomSampler(support),
                                        batch_size=self.inner_batch_size)
        inner_optimizer = Adam(fast_model.parameters(), lr=self.inner_update_lr)
        fast_model.train()

        for step in range(num_steps):
            all_loss = []
            for batch in support_dataloader:
                input_ids, attention_mask, segment_ids, label_id = (
                    t.to(self.device) for t in batch)
                # Keyword arguments keep the call independent of the
                # positional order of the model's forward signature.
                outputs = fast_model(input_ids=input_ids,
                                     attention_mask=attention_mask,
                                     token_type_ids=segment_ids,
                                     labels=label_id)

                loss = outputs[0]
                loss.backward()
                inner_optimizer.step()
                inner_optimizer.zero_grad()

                all_loss.append(loss.item())

            if step % 4 == 0:
                print("Inner Loss: ", np.mean(all_loss))

        return fast_model

    def _query_accuracy(self, fast_model, query):
        """Return the accuracy of ``fast_model`` on a query set, evaluated as one batch."""
        fast_model.eval()
        with torch.no_grad():
            query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
            # Built-in next() works for any iterator; the loader iterator's
            # own `.next()` method is not part of the iterator protocol.
            query_batch = next(iter(query_dataloader))
            q_input_ids, q_attention_mask, q_segment_ids, q_label_id = (
                t.to(self.device) for t in query_batch)
            q_outputs = fast_model(input_ids=q_input_ids,
                                   attention_mask=q_attention_mask,
                                   token_type_ids=q_segment_ids,
                                   labels=q_label_id)

            # With labels supplied, element 1 of the output holds the logits;
            # argmax over logits equals argmax over their softmax.
            pre_label_id = torch.argmax(q_outputs[1], dim=1)
            # float64 mean so that e.g. 16/20 comes out as exactly 0.8
            return (pre_label_id == q_label_id).double().mean().item()

    def forward(self, batch_tasks, training = True):
        """
        Adapt to every task of the batch and, when training, update the meta-model.

        batch = [(support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset)]

        # support = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)

        :param batch_tasks: list of (support, query) pairs as described above
        :param training: when true, run ``inner_update_step`` passes per task
            and update the meta-model; otherwise run ``inner_update_step_eval``
            passes and leave the meta-model untouched
        :return: mean query accuracy over the tasks of the batch
        """
        task_accs = []
        sum_gradients = []
        num_task = len(batch_tasks)
        num_inner_update_step = self.inner_update_step if training else self.inner_update_step_eval

        for task_id, task in enumerate(batch_tasks):
            support = task[0]
            query   = task[1]

            print('----Task',task_id, '----')
            fast_model = self._adapt(support, num_inner_update_step)

            if training:
                # Accumulate (meta - adapted) per parameter. no_grad keeps the
                # deltas out of the autograd graph, and moving each adapted
                # parameter to the meta-parameter's device avoids shuttling
                # the whole model between devices.
                with torch.no_grad():
                    pairs = zip(self.model.parameters(), fast_model.parameters())
                    for i, (meta_params, fast_params) in enumerate(pairs):
                        gradient = meta_params - fast_params.to(meta_params.device)
                        if task_id == 0:
                            sum_gradients.append(gradient)
                        else:
                            sum_gradients[i] += gradient

            task_accs.append(self._query_accuracy(fast_model, query))

            # Drop the adapted copy before the next task to release its memory.
            del fast_model
            torch.cuda.empty_cache()

        if training:
            # Average the accumulated deltas across tasks and install them as
            # the gradient of the meta-model, then let the optimizer apply them.
            for i, params in enumerate(self.model.parameters()):
                params.grad = sum_gradients[i] / float(num_task)

            self.outer_optimizer.step()
            self.outer_optimizer.zero_grad()

            del sum_gradients
            gc.collect()

        return np.mean(task_accs)

In [19]:
# Build the meta-learner from the hyper-parameters in `args`; the constructor
# loads the pretrained checkpoint named by `args.bert_model`.
learner = Learner(args)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
# Fix every RNG seed before sampling so the meta-test tasks are reproducible.
random_seed(123)
test = MetaTask(test_examples, num_task = 3, k_support=80, k_query=20, tokenizer = tokenizer)
# Re-seed from the clock so that later sampling is not tied to seed 123.
# NOTE(review): `time.time() % 10` can only produce the seeds 0-9 — confirm
# that such a small seed range is intended.
random_seed(int(time.time() % 10))

In [21]:
# Show the raw support examples of the third meta-test task.
test.supports[2]

[{'text': 'TO REPLACE THE BATTERY YOUR HAVE TO OPEN THE BACK, THAT MEANS YOU HAVE REMOVE IT FROM WHERE ITS STUCK, OPEN THE BACK AND RE-STICK IT AGAIN THUS LOOSING THE STRENGTH OF THE ORIGINAL GLU',
  'label': 'negative',
  'domain': 'automotive'},
 {'text': 'This is a very good duster I have owned one for 5 years and now I have to replace',
  'label': 'positive',
  'domain': 'automotive'},
 {'text': "I have had 2 faulty units right out of the box. The first would not hold a charge for more than a couple of days and that did not improve after cycling the battery a number of times. I returned it for a replacement and that 2nd unit never indicated a full charge - even after being on charge for several days. I am moving on to another brand.Don't waste your time and money on these poor quality items",
  'label': 'negative',
  'domain': 'automotive'},
 {'text': '* Reasonably priced* Used it once so far to jump start; worked perfectly* Very well built; feels sturdy(the LED charge indicator is

Start training

In [22]:
# Outer meta-training loop. Every epoch samples a fresh set of training tasks;
# every 20th global step the learner is also evaluated on the meta-test tasks.
global_step = 0

for epoch in range(args.meta_epoch):

    # k_support / k_query come from `args` so they stay in sync with the
    # configured task sizes (80 and 20).
    train = MetaTask(train_examples, num_task = 50, k_support=args.k_spt, k_query=args.k_qry, tokenizer = tokenizer)
    db = create_batch_of_tasks(train, is_shuffle = True, batch_size = args.outer_batch_size)

    for step, task_batch in enumerate(db):

        # `with` closes the log file even when a training step raises.
        with open('log.txt', 'a') as f:

            acc = learner(task_batch)

            print('Step:', step, '\ttraining Acc:', acc)
            f.write(str(acc) + '\n')

            if global_step % 20 == 0:
                # Fixed seed so each evaluation adapts to the test tasks
                # under the same random conditions.
                random_seed(123)
                print("\n-----------------Testing Mode-----------------\n")
                db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1)
                acc_all_test = []

                for test_batch in db_test:
                    # separate name: do not clobber the training accuracy
                    test_acc = learner(test_batch, training = False)
                    acc_all_test.append(test_acc)

                mean_test_acc = np.mean(acc_all_test)
                # The learner returns accuracy, so label it as accuracy.
                print('Step:', step, 'Test Acc:', mean_test_acc)
                f.write('Test' + str(mean_test_acc) + '\n')

                # Return to a clock-based seed for the following training steps.
                random_seed(int(time.time() % 10))

        global_step += 1

----Task 0 ----
Inner Loss:  0.6705008830342974
Inner Loss:  0.026188497032438005
Inner Loss:  0.0035540771537593435
----Task 1 ----
Inner Loss:  0.694626910345895
Inner Loss:  0.03510607087186405
Inner Loss:  0.004872651238526616
Step: 0 	training Acc: 0.8

-----------------Testing Mode-----------------

----Task 0 ----
Inner Loss:  0.603544111762728
Inner Loss:  0.018962965479918888
Inner Loss:  0.0038936012757143806
Inner Loss:  0.002167785506961601
Inner Loss:  0.0016486313959051455
Inner Loss:  0.0012128052434750966
Inner Loss:  0.0009087345346675388
Inner Loss:  0.0007802058992508266
Inner Loss:  0.0006276136430512581
Inner Loss:  0.000529784314234608
----Task 0 ----
Inner Loss:  0.7159415994371686
Inner Loss:  0.04298292366521699
Inner Loss:  0.004384980204382113
Inner Loss:  0.002289766595432801
Inner Loss:  0.0016442898527852126
Inner Loss:  0.0012598024914041162
Inner Loss:  0.0009344997982095395
Inner Loss:  0.0007940684112587146
Inner Loss:  0.0006482180989613491
Inner Loss