In [None]:
import os
# Expose only CUDA device 2 to this process and point every Hugging Face cache
# at /workspace/HF_cache. The trailing slash on TRANSFORMERS_CACHE matters:
# other cells append 'models' directly to this value to build cache paths.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": '2',
    'HF_HOME': '/workspace/HF_cache/',
    'HF_DATASETS_CACHE': '/workspace/HF_cache/datasets',
    'TRANSFORMERS_CACHE': '/workspace/HF_cache/transformers_cache/',
})

In [None]:
import arxiv
import sys
import torch
import transformers
import copy
import numpy as np

from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoModelForSequenceClassification 
from collections import defaultdict,OrderedDict

import sys
# MGIT_PATH is the directory three levels above the current working directory
# (presumably the repository root -- the notebook must be launched from its own
# folder for this to resolve correctly). It is appended to sys.path so that the
# `utils` package imported just below can be found.
MGIT_PATH=os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.append(MGIT_PATH)
from utils.lineage.graph import *
from utils import meta_functions

In [None]:
# To control logging level for various modules used in the application:
import logging
import re
def set_global_logging_level(level=logging.ERROR, prefices=[""]):
    """
    Override logging levels of different modules based on their name as a prefix.
    It needs to be invoked after the modules have been loaded so that their loggers have been initialized.

    Args:
        - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR
        - prefices: list of one or more str prefices to match (e.g. ["transformers", "torch"]). Optional.
          Default is `[""]` to match all active loggers.
          The match is a case-sensitive, literal `module_name.startswith(prefix)`;
          regex metacharacters in a prefix (such as the `.` in "a.b") are not
          interpreted. An empty list matches every logger, same as `[""]`.

    Returns:
        None. Matching loggers are updated in place via `Logger.setLevel`.
    """
    # The default list is only read, never mutated, so sharing it across calls is safe.
    # Each prefix is escaped so that it is matched literally: without escaping,
    # "a.b" would also match a logger named "aXb", contradicting the documented
    # startswith semantics.
    prefix_re = re.compile(fr'^(?:{ "|".join(map(re.escape, prefices)) })')
    # Iterate over a snapshot of the names: getLogger() may write to the registry
    # (e.g. replacing a placeholder entry) while we walk it.
    for name in list(logging.root.manager.loggerDict):
        if prefix_re.match(name):
            logging.getLogger(name).setLevel(level)
            
# Raise every already-registered logger whose name starts with "transformers"
# to ERROR. This must run after `transformers` has been imported, otherwise
# its loggers do not exist yet.
set_global_logging_level(logging.ERROR, ["transformers"])

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Locations, relative to MGIT_PATH, of the helper modules this notebook refers to by path.
preprocess_file = f'{MGIT_PATH}/utils/preprocess_utils.py'
user_file = f'{MGIT_PATH}/experiments/creation/user_functions.py'
perturbation_file = f'{MGIT_PATH}/utils/perturbation_utils.py'

# Random Shuffle Model Pool

In [None]:
# Checkpoint identifiers, grouped by model family. The trailing comment on each
# line is the date recorded for that checkpoint (the same dates appear in the
# `timestamps` list; presumably the Hub release/upload date).

# --- BERT ---
bert_mlm_cased = 'bert-base-cased'  # Nov 15, 2018
bert_mlm_uncased = 'bert-base-uncased'  # Nov 15, 2018
bert_squad2_uncased_frozen = 'ericRosello/bert-base-uncased-finetuned-squad-frozen-v2'  # Jan 4, 2022
bert_mnli = 'aloxatel/bert-base-mnli'  # Jul 28, 2020
bert_mlm_large_cased = 'bert-large-cased'  # Nov 30, 2018
bert_mlm_large_uncased = 'bert-large-uncased'  # Nov 15, 2018
bert_mnli_large = 'TehranNLP-org/bert-large-mnli'  # Apr 30, 2022
bert_squad2_uncased = 'deepset/bert-base-uncased-squad2'  # Jan 14, 2022

# --- RoBERTa ---
roberta_mlm_large = 'roberta-large'  # Aug 5, 2019
roberta_mnli_large = 'roberta-large-mnli'  # Aug 5, 2019
roberta_squad2_large = 'deepset/roberta-large-squad2'  # Mar 11, 2021
roberta_mlm = 'roberta-base'  # Aug 4, 2019
roberta_mnli = 'textattack/roberta-base-MNLI'  # Jun 7, 2020
roberta_squad2 = 'deepset/roberta-base-squad2'  # Jan 22, 2020

# --- ALBERT ---
albert_mlm = 'albert-base-v2'  # Nov 4, 2019
albert_mnli = 'prajjwal1/albert-base-v2-mnli'  # May 26, 2020
albert_squad2 = 'twmkn9/albert-base-v2-squad2'  # Mar 9, 2020

# --- DistilBERT ---
distilbert_mlm_cased = 'distilbert-base-cased'  # Feb 7, 2020
distilbert_mlm_uncased = 'distilbert-base-uncased'  # Aug 28, 2019
distilbert_squad2_uncased = 'twmkn9/distilbert-base-uncased-squad2'  # Mar 23, 2020
distilbert_squad2_uncased_frozen = 'ericRosello/distilbert-base-uncased-finetuned-squad-frozen-v2'  # Jan 4, 2022

# --- ELECTRA ---
electra_mlm_small = 'google/electra-small-generator'  # Mar 24, 2020
electra_mnli_small = 'howey/electra-small-mnli'  # Apr 15, 2021

In [None]:
# (checkpoint, 'MM/DD/YYYY') pairs: the date recorded for every checkpoint in the pool.
timestamps = [
    (bert_mlm_cased, '11/15/2018'),
    (bert_mlm_uncased, '11/15/2018'),
    (bert_squad2_uncased_frozen, '01/04/2022'),
    (bert_mnli, '07/28/2020'),
    (bert_mlm_large_cased, '11/30/2018'),
    (bert_mlm_large_uncased, '11/15/2018'),
    (bert_mnli_large, '04/30/2022'),
    (bert_squad2_uncased, '01/14/2022'),
    (roberta_mlm_large, '08/05/2019'),
    (roberta_mnli_large, '08/05/2019'),
    (roberta_squad2_large, '03/11/2021'),
    (roberta_mlm, '08/04/2019'),
    (roberta_mnli, '06/07/2020'),
    (roberta_squad2, '01/22/2020'),
    (albert_mlm, '11/04/2019'),
    (albert_mnli, '05/26/2020'),
    (albert_squad2, '03/09/2020'),
    (distilbert_mlm_cased, '02/07/2020'),
    (distilbert_mlm_uncased, '08/28/2019'),
    (distilbert_squad2_uncased, '03/23/2020'),
    (distilbert_squad2_uncased_frozen, '01/04/2022'),
    (electra_mlm_small, '03/24/2020'),
    (electra_mnli_small, '04/15/2021'),
]

In [None]:
# Checkpoint -> model type label ('mlm', 'squad' or 'mnli'). The label is passed
# as `model_type` when a node is created and is the key tests are registered under.
model_types = {
    bert_mlm_cased: 'mlm',
    bert_mlm_uncased: 'mlm',
    bert_squad2_uncased_frozen: 'squad',
    bert_mnli: 'mnli',
    bert_mlm_large_cased: 'mlm',
    bert_mlm_large_uncased: 'mlm',
    bert_mnli_large: 'mnli',
    bert_squad2_uncased: 'squad',
    roberta_mlm_large: 'mlm',
    roberta_mnli_large: 'mnli',
    roberta_squad2_large: 'squad',
    roberta_mlm: 'mlm',
    roberta_mnli: 'mnli',
    roberta_squad2: 'squad',
    albert_mlm: 'mlm',
    albert_mnli: 'mnli',
    albert_squad2: 'squad',
    distilbert_mlm_cased: 'mlm',
    distilbert_mlm_uncased: 'mlm',
    distilbert_squad2_uncased: 'squad',
    distilbert_squad2_uncased_frozen: 'squad',
    electra_mlm_small: 'mlm',
    electra_mnli_small: 'mnli',
}

In [None]:
# Checkpoint -> task head label, passed as `task_type` when a node is created.
task_types = {
    bert_mlm_cased: 'MaskedLM',
    bert_mlm_uncased: 'MaskedLM',
    bert_squad2_uncased_frozen: 'question_answering',
    bert_mnli: 'sequence_classification',
    bert_mlm_large_cased: 'MaskedLM',
    bert_mlm_large_uncased: 'MaskedLM',
    bert_mnli_large: 'sequence_classification',
    bert_squad2_uncased: 'question_answering',
    roberta_mlm_large: 'MaskedLM',
    roberta_mnli_large: 'sequence_classification',
    roberta_squad2_large: 'question_answering',
    roberta_mlm: 'MaskedLM',
    roberta_mnli: 'sequence_classification',
    roberta_squad2: 'question_answering',
    albert_mlm: 'MaskedLM',
    albert_mnli: 'sequence_classification',
    albert_squad2: 'question_answering',
    distilbert_mlm_cased: 'MaskedLM',
    distilbert_mlm_uncased: 'MaskedLM',
    distilbert_squad2_uncased: 'question_answering',
    distilbert_squad2_uncased_frozen: 'question_answering',
    electra_mlm_small: 'MaskedLM',
    electra_mnli_small: 'sequence_classification',
}

In [None]:
import random
# Every checkpoint that will be inserted into the graph, listed family by family.
model_pool = [
    # BERT: pretrained checkpoints, then fine-tuned ones
    bert_mlm_cased,
    bert_mlm_uncased,
    bert_mlm_large_cased,
    bert_mlm_large_uncased,
    bert_mnli,
    bert_squad2_uncased_frozen,
    bert_mnli_large,
    bert_squad2_uncased,
    # RoBERTa
    roberta_mlm_large,
    roberta_mlm,
    roberta_mnli_large,
    roberta_squad2_large,
    roberta_mnli,
    roberta_squad2,
    # DistilBERT
    distilbert_mlm_cased,
    distilbert_mlm_uncased,
    distilbert_squad2_uncased_frozen,
    distilbert_squad2_uncased,
    # ALBERT
    albert_mlm,
    albert_mnli,
    albert_squad2,
    # ELECTRA
    electra_mlm_small,
    electra_mnli_small,
]

# Scramble the listing in place (unseeded, so the printed order differs per run).
random.shuffle(model_pool)
print(model_pool)

# Insertion Order: model family + number of hidden layers + timestamp

In [None]:
import time
import datetime
def _timestamp_epoch(entry):
    """Turn the 'MM/DD/YYYY' date of a (checkpoint, date) pair into local-time epoch seconds."""
    parsed = datetime.datetime.strptime(entry[1], "%m/%d/%Y")
    return time.mktime(parsed.timetuple())


# Tuple of checkpoint names ordered by recorded date, oldest first. The sort is
# stable, so checkpoints sharing a date keep their order from `timestamps`.
golden_model_pool = tuple(zip(*sorted(timestamps, key=_timestamp_epoch)))[0]

In [None]:
# Touch each checkpoint's cache entry in date order, one second apart, so that
# the filesystem modification times increase in `golden_model_pool` order.
# NOTE(review): this shells out to `touch`; if an entry is missing, `touch`
# creates an empty file at that path and any failure is ignored -- confirm the
# checkpoints are already cached before running this cell.
for model in golden_model_pool:
    cache_entry = '--'.join([os.environ['TRANSFORMERS_CACHE'] + 'models', *model.split('/')])
    os.system(f'touch {cache_entry}')
    time.sleep(1)

In [None]:
def load_config(checkpoint_filepath):
    """Return the configuration that `AutoConfig.from_pretrained` loads for `checkpoint_filepath`."""
    config = AutoConfig.from_pretrained(checkpoint_filepath)
    return config

In [None]:
def config_sort(model_ckpt):
    """Build the sort key that decides a checkpoint's insertion position.

    The key is the tuple ``(arch_index, n_layers, mtime)``:

    * ``arch_index`` -- position, within the module-level ``ordered_arch``
      sequence, of the longest family name that occurs as a substring of the
      config's first architecture name (compared case-folded). Preferring the
      longest match keeps e.g. "distilbert" from being classified as "bert".
      ``float('inf')`` when no family matches or the config lists no
      architectures, so unknown families sort last.
    * ``n_layers`` -- the first of ``num_hidden_layers``, ``n_layers`` or
      ``num_layers`` found among the config's instance attributes.
    * ``mtime`` -- modification time of the checkpoint's cache entry, located
      at ``os.environ['TRANSFORMERS_CACHE'] + 'models--' + '--'.join(parts)``
      where ``parts`` is ``model_ckpt`` split on ``/``.

    Args:
        model_ckpt: checkpoint identifier such as ``'org/name'``.

    Returns:
        tuple: ``(arch_index, n_layers, mtime)``.

    Raises:
        KeyError: if the config has none of the layer-count attributes.
        OSError: if the cache entry does not exist (from ``os.path.getmtime``).
    """
    print(model_ckpt)
    config_attrs = load_config(model_ckpt).__dict__

    # A config without (or with an empty/None) `architectures` is treated as an
    # unknown family instead of crashing the whole sort.
    architectures = config_attrs.get("architectures") or [""]
    arch_name = architectures[0].casefold()

    arch_index = float('inf')
    longest_len = 0
    for index, arch in enumerate(ordered_arch):
        if arch in arch_name and len(arch) > longest_len:
            arch_index = index
            longest_len = len(arch)

    # Different config classes store the depth under different attribute names;
    # probe them in the same priority order as before, without a bare `except`.
    for layer_key in ("num_hidden_layers", "n_layers", "num_layers"):
        if layer_key in config_attrs:
            n_layers = config_attrs[layer_key]
            break
    else:
        raise KeyError(
            "config for {0!r} has none of: num_hidden_layers, n_layers, num_layers".format(model_ckpt)
        )

    cache_entry = '--'.join([os.environ['TRANSFORMERS_CACHE'] + 'models'] + model_ckpt.split('/'))
    # Read the mtime once so the printed value and the returned value always agree.
    mtime = os.path.getmtime(cache_entry)
    print(cache_entry, arch_index, n_layers, mtime)
    return (arch_index, n_layers, mtime)

In [None]:
# Model family -> arXiv identifier passed to the arXiv search below.
family_doi = {
    'bert': '1810.04805',
    'distilbert': '1910.01108',
    'roberta': '1907.11692',
    'albert': '1909.11942',
    'electra': '2003.10555',
}
search = arxiv.Search(id_list=list(family_doi.values()))

# Collect each result's `published` value, echoing the title as a sanity check.
published_time = []
for result in search.results():
    print(result.title)
    published_time.append(result.published)

# Family names ordered by publication time, earliest first.
# NOTE(review): families and publication times are paired purely by position,
# which assumes the results come back in `id_list` order -- confirm.
ordered_arch = tuple(zip(*sorted(zip(family_doi, published_time), key=lambda pair: pair[1])))[0]
print(ordered_arch)

In [None]:
# Order the pool by the (family index, layer count, cache mtime) key from config_sort.
model_pool = sorted(model_pool, key=config_sort)

# Create Tests

In [None]:
# Evaluation data for the masked-LM test: wikitext-103-raw-v1, validation split.
mlm_lineage_eval_dataset = LineageDataset(
    'wikitext',
    'wikitext-103-raw-v1',
    split='validation',
    feature_keys=['text'],
)

In [None]:
# Evaluation data for the SQuAD v2 test: squad_v2, validation split.
squad_lineage_eval_dataset = LineageDataset(
    "squad_v2",
    split="validation",
    feature_keys=['context', 'question'],
)

In [None]:
# Evaluation data for the SQuAD v1 test: squad, validation split.
squadv1_lineage_eval_dataset = LineageDataset(
    "squad",
    split="validation",
    feature_keys=['context', 'question'],
)

In [None]:
# Evaluation data for the MNLI test: GLUE mnli, validation_matched split.
mnli_lineage_eval_dataset = LineageDataset(
    "glue",
    "mnli",
    split="validation_matched",
    feature_keys=['premise', 'hypothesis'],
)

In [None]:
# Masked-LM test: preprocesses with `mlm_preprocess_function` from the
# preprocess helper file and selects on 'loss'.
mlm_test = LineageTest(
    name='mlm',
    eval_dataset=mlm_lineage_eval_dataset,
    preprocess_function_path=preprocess_file,
    preprocess_function_name='mlm_preprocess_function',
    metric_for_best_model='loss',
)

In [None]:
# MNLI test: delegates to `mnli_custom_test_function` from the user-functions
# file and selects on 'accuracy'.
mnli_test = LineageTest(
    name='mnli',
    eval_dataset=mnli_lineage_eval_dataset,
    custom_test_function_path=user_file,
    custom_test_function_name='mnli_custom_test_function',
    metric_for_best_model='accuracy',
)

In [None]:
# SQuAD v2 test: pre- and post-processing both come from the preprocess helper
# file; selects on 'f1'.
squadv2_test = LineageTest(
    name='squad_v2',
    eval_dataset=squad_lineage_eval_dataset,
    preprocess_function_path=preprocess_file,
    preprocess_function_name='squad_preprocess_validation_function',
    postprocess_function_path=preprocess_file,
    postprocess_function_name='postprocess_squad2_predictions',
    metric_for_best_model='f1',
)

In [None]:
# SQuAD v1 test: same preprocessing as the v2 test, but evaluated on the v1
# dataset with `postprocess_squad_predictions`; selects on 'f1'.
squadv1_test = LineageTest(
    name='squad_v1',
    eval_dataset=squadv1_lineage_eval_dataset,
    preprocess_function_path=preprocess_file,
    preprocess_function_name='squad_preprocess_validation_function',
    postprocess_function_path=preprocess_file,
    postprocess_function_name='postprocess_squad_predictions',
    metric_for_best_model='f1',
)

# Build Graph

In [None]:
# Create the lineage graph and attach each test to the model type it applies to.
# Both SQuAD tests (v2 first, then v1) are registered under the 'squad' type.
g = LineageGraph(compression_mode='lzma', single_model_compression=False)
for registered_test, registered_type in (
    (mlm_test, 'mlm'),
    (mnli_test, 'mnli'),
    (squadv2_test, 'squad'),
    (squadv1_test, 'squad'),
):
    g.register_test_to_type(registered_test, registered_type)

In [None]:
# Scale factors for the insertion experiment: for each entry `i`, the loop
# below runs `i` rounds, and every round inserts the whole model pool.
scales = [1]

In [None]:
import time
from tqdm import tqdm
# Accumulated insertion time, in seconds, of each completed round.
time_list = []

for i in scales:
    for j in tqdm(range(i)):
        time_cost = 0
        print(f'current scale factor: {j + 1}, current time list: {time_list}')
        for model in model_pool:
            print(f'Inserting: {model}')
            node = LineageNode(
                output_dir=f'{model}_v{j}',
                init_checkpoint=model,
                model_type=model_types[model],
                task_type=task_types[model],
                is_delta=True,
            )

            # Only the graph insertion itself is timed; when `g.add` returns a
            # falsy value the node is registered as a root instead.
            start = time.time()
            if not g.add(node):
                g.add_root(node)
            end = time.time()

            # Unload every node flagged `is_unload`, without saving it
            # (presumably to release memory between insertions -- confirm).
            for ex_node in g.nodes.values():
                if ex_node.is_unload:
                    ex_node.unload_model(save_model=False)
            print('\n')
            time_cost += end - start

        # Render the graph reached after this round, then record the round's total.
        n = g.show(save_path=f"./LineageGraph_v{j}.html")
        time_list.append(time_cost)

In [None]:
# Persist the graph to the current directory; `save_models=False` is passed,
# so presumably only the graph structure (not model weights) is written -- confirm.
g.save('./',save_models=False)