<a href="https://colab.research.google.com/github/mkhfring/Tutorial/blob/main/maml-code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes reporting 20 GB or more are treated as "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20
      else 'Not using a high-RAM runtime')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd drive/MyDrive/meta-learning-bert/

In [None]:
!pip install transformers
!pip install torch
!pip install datasets



In [None]:
import datasets
train_data = datasets.load_dataset("code_x_glue_cc_clone_detection_big_clone_bench")
#test_data = datasets.load_dataset("code_x_glue_cc_clone_detection_poj104")


In [None]:
import datasets

# Load the local JSON-lines file as a DatasetDict whose only split is named 'test'.
# load_dataset reports a missing file on its own, so no separate open() probe is needed.
# NOTE(review): the evaluation split is read from a file called 'train_clone.jsonl' --
# confirm this is the intended held-out file.
test_dat = datasets.load_dataset('json', data_files={'test': 'train_clone.jsonl'})

In [None]:
test_dat['test'][0]


In [None]:
train_data['train'][0]

In [None]:
import pandas as pd

# Only the first 10,000 rows of the training split are used.
train = train_data["train"][:10000]
train_df = pd.DataFrame(train)

# Model input: func1 immediately followed by func2 (plain concatenation, no separator).
train_df['text'] = text = train_df["func1"] + train_df["func2"]


In [None]:
import pandas as pd

# Only the first 1,000 rows of the 'test' split are used.
test = test_dat["test"][:1000]
test_df = pd.DataFrame(test)

# Model input: func1 immediately followed by func2 (plain concatenation, no separator).
test_df['text'] = text = test_df["func1"] + test_df["func2"]

In [None]:
train_df['label'] = train_df["label"].replace({True:"T", False:"F"})


In [None]:
test_df['label'] = test_df["label"].replace({True:"T", False:"F"})

In [None]:
test_df

## Creating meta learning tasks

In [None]:
import os
import torch
from torch.utils.data import Dataset
import numpy as np
import collections
import random
import json, pickle
from torch.utils.data import TensorDataset

# Two-way mapping between the string labels 'T'/'F' carried by the examples
# and the integer class ids stored in the label tensors.
LABEL_MAP  = { 'T':0, 'F':1, 0:'T', 1:'F'}


class MetaTask(Dataset):
    """Dataset of randomly sampled meta-learning tasks.

    Item ``i`` is a ``(support_set, query_set)`` pair of ``TensorDataset``s.
    Each TensorDataset holds ``(input_ids, attention_mask, segment_ids, label_id)``
    tensors for the examples sampled for task ``i``.
    """

    def __init__(self, examples, num_task, k_support, k_query, tokenizer,
                 max_seq_length=512, max_encode_length=128):
        """
        :param examples: list of dict samples, each with a 'text' and a 'label' key
        :param num_task: number of tasks to sample.
        :param k_support: number of support samples per task
        :param k_query: number of query samples per task
        :param tokenizer: object with ``encode(text, max_length=..., truncation=...)``
            returning a list of token ids; its ``pad_token_id`` (when present) is
            used for padding
        :param max_seq_length: length every sequence is padded to
        :param max_encode_length: maximum number of token ids kept per example
        """
        # Shuffle a private copy so the caller's list keeps its original order.
        self.examples = list(examples)
        random.shuffle(self.examples)

        self.num_task = num_task
        self.k_support = k_support
        self.k_query = k_query
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.max_encode_length = max_encode_length

        self.create_batch(self.num_task)

    def create_batch(self, num_task):
        """Sample ``num_task`` tasks and store them in ``self.supports`` / ``self.queries``.

        :raises ValueError: (from ``random.sample``) when fewer than
            ``k_support + k_query`` examples are available
        """
        self.supports = []  # support set
        self.queries = []  # query set
        task_size = self.k_support + self.k_query

        for _ in range(num_task):  # for each task
            # Draw k_support + k_query distinct examples from the whole pool
            # (there is no per-domain split).
            selected_examples = random.sample(self.examples, task_size)
            random.shuffle(selected_examples)

            self.supports.append(selected_examples[:self.k_support])
            self.queries.append(selected_examples[self.k_support:])

    def create_feature_set(self, examples):
        """Encode ``examples`` into a padded ``TensorDataset``.

        :param examples: list of dict samples with 'text' and 'label' keys
        :return: TensorDataset(input_ids, attention_mask, segment_ids, label_ids);
            the first three have shape (len(examples), max_seq_length)
        :raises KeyError: when a label is not a key of ``LABEL_MAP``
        """
        num_examples = len(examples)

        # Pad with the tokenizer's own pad id. Id 0 is only right for BERT-style
        # vocabularies, so it is used just as a fallback.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        # Never keep more tokens than a padded row can hold.
        encode_limit = min(self.max_encode_length, self.max_seq_length)

        all_input_ids      = torch.full((num_examples, self.max_seq_length), pad_id, dtype = torch.long)
        all_attention_mask = torch.zeros(num_examples, self.max_seq_length, dtype = torch.long)
        all_segment_ids    = torch.zeros(num_examples, self.max_seq_length, dtype = torch.long)
        all_label_ids      = torch.empty(num_examples, dtype = torch.long)

        for row, example in enumerate(examples):
            input_ids = self.tokenizer.encode(example['text'], max_length=encode_limit, truncation=True)
            # Guard against tokenizers that ignore the truncation request.
            input_ids = list(input_ids)[:self.max_seq_length]
            length = len(input_ids)

            # Rows are pre-filled with padding; only the real tokens are written.
            all_input_ids[row, :length] = torch.tensor(input_ids, dtype = torch.long)
            all_attention_mask[row, :length] = 1
            all_label_ids[row] = LABEL_MAP[example['label']]

        return TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)

    def __getitem__(self, index):
        """Return the ``(support_set, query_set)`` TensorDatasets of task ``index``."""
        support_set = self.create_feature_set(self.supports[index])
        query_set   = self.create_feature_set(self.queries[index])
        return support_set, query_set

    def __len__(self):
        """Return the number of sampled tasks."""
        return self.num_task

## Split meta training and meta testing

In [None]:
#low_resource_domains = ["office_products", "automotive", "computer_&_video_games"]
#train_examples = [r for r in reviews if r['domain'] not in low_resource_domains]
#test_examples = [r for r in reviews if r['domain'] in low_resource_domains]
#print(len(train_examples), len(test_examples))
#train_examples[:4]

train_examples = train_df.to_dict('records')
test_examples = test_df.to_dict('records')
#test_examples = train_examples[:1000]
#train_examples = train_examples[1000:]


In [None]:
type(test_examples)
random.shuffle(test_examples)

In [None]:
import torch
from transformers import BertModel, BertTokenizer, AutoTokenizer, RobertaConfig, RobertaTokenizer, RobertaModel
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
train = MetaTask(train_examples, num_task = 10, k_support=100, k_query=30, tokenizer = tokenizer)

In [None]:
# Number of examples in the support set of the 1st meta-task
len(train.supports[0])

In [None]:
# The MetaTask dataset itself. Each item (e.g. train[0]) contains two TensorDatasets: support set and query set
train

In [None]:
# Let's take a look at the first two samples from the support set of the 1st meta-task
train[0][0][:2]

## Training meta

In [None]:
import time
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

def random_seed(value):
    """Seed every random number generator used by the notebook with ``value``.

    Turns on cuDNN deterministic mode, then seeds the PyTorch CPU and CUDA
    generators, NumPy's global generator and Python's ``random`` module.
    """
    torch.backends.cudnn.deterministic = True
    seeders = (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed)
    for seed_rng in seeders:
        seed_rng(value)

def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
    """Yield the tasks of ``taskset`` in lists of at most ``batch_size`` items.

    :param taskset: indexable collection supporting ``len()``
    :param is_shuffle: visit the tasks in a random order when True
    :param batch_size: maximum number of tasks per yielded list
    """
    order = list(range(len(taskset)))
    if is_shuffle:
        random.shuffle(order)
    for start in range(0, len(order), batch_size):
        # Slicing clips at the end of the list, so the last batch may be shorter.
        yield [taskset[position] for position in order[start:start + batch_size]]

class TrainingArgs:
    """Hyper-parameters for meta-training.

    ``TrainingArgs()`` yields the defaults below; any of them can be replaced
    by keyword, e.g. ``TrainingArgs(meta_epoch=1, inner_batch_size=4)``.
    """

    def __init__(self, **overrides):
        """
        :param overrides: optional replacement values keyed by attribute name
        :raises TypeError: when a keyword does not name an existing setting
        """
        self.num_labels = 2                # number of output classes
        self.meta_epoch = 4                # number of passes of the outer training loop
        self.k_spt = 80                    # support examples per task
        self.k_qry = 20                    # query examples per task
        self.outer_batch_size = 2          # tasks per meta-update
        self.inner_batch_size = 12         # examples per inner-loop batch
        self.outer_update_lr = 5e-5        # learning rate of the outer optimizer
        self.inner_update_lr = 5e-5        # learning rate of the inner optimizer
        self.inner_update_step = 10        # inner-loop passes while training
        self.inner_update_step_eval = 40   # inner-loop passes while evaluating
        self.bert_model = 'bert-base-uncased'
        self.num_task_train = 500
        self.num_task_test = 5

        for name, value in overrides.items():
            # Reject typos instead of silently creating an unused attribute.
            if not hasattr(self, name):
                raise TypeError(
                    "TrainingArgs got an unexpected keyword argument {!r}".format(name))
            setattr(self, name, value)

args = TrainingArgs()

## Create Meta Learner

In [None]:
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from transformers import BertForSequenceClassification, RobertaModel, RobertaForSequenceClassification
from copy import deepcopy
import gc
from sklearn.metrics import accuracy_score
import torch
import numpy as np

class Learner(nn.Module):
    """
    Meta Learner

    Holds the meta model and an outer Adam optimizer. For every task a deep
    copy of the meta model is fine-tuned on the support set and scored on the
    query set. While training, the difference between the meta weights and the
    fine-tuned weights, averaged over the tasks of the batch, is handed to the
    outer optimizer as the gradient of the meta model.
    """
    def __init__(self, args):
        """
        :param args: object exposing num_labels, outer_batch_size,
            inner_batch_size, outer_update_lr, inner_update_lr,
            inner_update_step, inner_update_step_eval and bert_model
        """
        super(Learner, self).__init__()

        self.num_labels = args.num_labels
        self.outer_batch_size = args.outer_batch_size
        self.inner_batch_size = args.inner_batch_size
        self.outer_update_lr  = args.outer_update_lr
        self.inner_update_lr  = args.inner_update_lr
        self.inner_update_step = args.inner_update_step
        self.inner_update_step_eval = args.inner_update_step_eval
        # Stored for reference only: the checkpoint loaded below is fixed.
        self.bert_model = args.bert_model
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Size the classification head from the configured number of labels
        # instead of relying on the checkpoint's default.
        self.model = RobertaForSequenceClassification.from_pretrained(
            "microsoft/codebert-base", num_labels = self.num_labels)
        self.outer_optimizer = Adam(self.model.parameters(), lr=self.outer_update_lr)
        self.model.train()

    def _adapt(self, support, num_inner_update_step):
        """Return a copy of the meta model fine-tuned on ``support``, left on ``self.device``."""
        fast_model = deepcopy(self.model)
        fast_model.to(self.device)
        support_dataloader = DataLoader(support, sampler=RandomSampler(support),
                                        batch_size=self.inner_batch_size)

        inner_optimizer = Adam(fast_model.parameters(), lr=self.inner_update_lr)
        fast_model.train()

        for i in range(0, num_inner_update_step):
            all_loss = []
            for batch in support_dataloader:
                input_ids, attention_mask, segment_ids, label_id = (t.to(self.device) for t in batch)
                outputs = fast_model(input_ids, attention_mask, segment_ids, labels = label_id)

                loss = outputs[0]
                loss.backward()
                inner_optimizer.step()
                inner_optimizer.zero_grad()

                all_loss.append(loss.item())

            if i % 4 == 0:
                print("Inner Loss: ", np.mean(all_loss))

        return fast_model

    def _evaluate(self, fast_model, query):
        """Return the accuracy of ``fast_model`` on ``query``, scored as a single batch."""
        fast_model.eval()
        with torch.no_grad():
            query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
            query_batch = next(iter(query_dataloader))
            q_input_ids, q_attention_mask, q_segment_ids, q_label_id = (
                t.to(self.device) for t in query_batch)
            q_outputs = fast_model(q_input_ids, q_attention_mask, q_segment_ids, labels = q_label_id)

            # argmax over the raw logits picks the same class as argmax over
            # their softmax, so the softmax is skipped.
            pre_label_id = torch.argmax(q_outputs[1], dim=1)
            correct = (pre_label_id == q_label_id).sum().item()
            return correct / q_label_id.numel()

    def _accumulate(self, sum_gradients, fast_model):
        """Add (meta weights - adapted weights) for one task to ``sum_gradients`` in place."""
        # no_grad keeps the deltas out of the autograd graph: they are plain
        # tensors that later become the .grad of the meta parameters.
        with torch.no_grad():
            pairs = zip(self.model.parameters(), fast_model.parameters())
            for i, (meta_params, fast_params) in enumerate(pairs):
                gradient = meta_params - fast_params.to(meta_params.device)
                if i < len(sum_gradients):
                    sum_gradients[i] += gradient
                else:
                    sum_gradients.append(gradient)

    def forward(self, batch_tasks, training = True):
        """
        batch = [(support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset)]

        # support = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)

        :param batch_tasks: list of (support, query) pairs as described above
        :param training: when True the meta model is updated after all tasks
            are processed; when False it is left untouched and
            inner_update_step_eval inner passes are used instead of
            inner_update_step
        :return: mean query accuracy over the tasks of the batch
        """
        task_accs = []
        sum_gradients = []
        num_task = len(batch_tasks)
        num_inner_update_step = self.inner_update_step if training else self.inner_update_step_eval

        for task_id, task in enumerate(batch_tasks):
            support = task[0]
            query   = task[1]

            print('----Task',task_id, '----')
            fast_model = self._adapt(support, num_inner_update_step)

            # Score while the copy is still on the compute device, then move it
            # off once; evaluation does not modify the weights.
            task_accs.append(self._evaluate(fast_model, query))
            fast_model.to(torch.device('cpu'))

            if training:
                self._accumulate(sum_gradients, fast_model)

            del fast_model
            torch.cuda.empty_cache()

        # An empty batch leaves nothing to apply, so the update is skipped.
        if training and sum_gradients:
            # Assign the task-averaged delta as the gradient of the original
            # model, then let the outer optimizer update its weights.
            for params, gradient in zip(self.model.parameters(), sum_gradients):
                params.grad = gradient / float(num_task)

            self.outer_optimizer.step()
            self.outer_optimizer.zero_grad()

            del sum_gradients
            gc.collect()

        return np.mean(task_accs)

In [None]:
learner = Learner(args)

In [None]:
random_seed(123)
test = MetaTask(test_examples, num_task =4, k_support=80, k_query=20, tokenizer = tokenizer)
random_seed(int(time.time() % 10))

## Start training

In [None]:
global_step = 0
torch.cuda.empty_cache()

for epoch in range(args.meta_epoch):

    # A fresh set of training tasks is sampled at the start of every epoch.
    train = MetaTask(train_examples, num_task = 5, k_support=80, k_query=20, tokenizer = tokenizer)
    db = create_batch_of_tasks(train, is_shuffle = True, batch_size = args.outer_batch_size)

    for step, task_batch in enumerate(db):
        print(len(task_batch))

        # The log is re-opened in append mode every step, so earlier results
        # are already on disk if a later step fails. The context manager closes
        # the handle even when the learner raises.
        with open('log.txt', 'a') as f:
            acc = learner(task_batch)

            print('Step:', step, '\ttraining Acc:', acc)
            f.write(str(acc) + '\n')

            # Evaluate on the fixed test tasks every 20 meta-updates (including the first).
            if global_step % 20 == 0:
                # Fixed seed so every evaluation uses the same random draws.
                random_seed(123)
                print("\n-----------------Testing Mode-----------------\n")
                db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1)
                acc_all_test = [learner(test_batch, training = False) for test_batch in db_test]
                test_acc = np.mean(acc_all_test)

                # The learner returns a mean accuracy, so it is reported as such.
                print('Step:', step, 'Test Acc:', test_acc)
                f.write('Test' + str(test_acc) + '\n')

                # NOTE(review): this re-seeds with one of only ten values (0-9);
                # confirm whether a wider range of seeds is wanted.
                random_seed(int(time.time() % 10))

        global_step += 1