In [2]:
from chop.tools import get_tokenized_dataset

checkpoint = "prajjwal1/bert-tiny"
tokenizer_checkpoint = "bert-base-uncased"
dataset_name = "imdb"

dataset, tokenizer = get_tokenized_dataset(
    dataset=dataset_name,
    checkpoint=tokenizer_checkpoint,
    return_tokenizer=True
)

[32mINFO    [0m [34mTokenizing dataset imdb with AutoTokenizer for bert-base-uncased.[0m
Map: 100%|██████████| 25000/25000 [00:04<00:00, 5377.23 examples/s]


In [3]:
from transformers import AutoConfig, AutoModelForSequenceClassification
from chop.tools import get_trainer

config = AutoConfig.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_config(config)

trainer = get_trainer(
    model = model,
    tokenized_dataset = dataset,
    tokenizer = tokenizer,
    evaluate_metric = "accuracy",
    num_train_epochs = 1
)

trainer.train()

eval_results = trainer.evaluate()

baseline = eval_results["eval_accuracy"]
print(baseline)


  trainer = Trainer(


Step,Training Loss
500,0.6938
1000,0.6325
1500,0.4591
2000,0.4054
2500,0.3784
3000,0.3813


0.84156


In [4]:
import torch.nn as nn
from chop.nn.modules import Identity

search_space = {
    "num_layers": [2, 4, 8],
    "num_heads": [2, 4, 8, 16],
    "hidden_size": [128, 192, 256, 384, 512],
    "intermediate_size": [512, 768, 1024, 1536, 2048],
    "linear_layer_choices" : ["linear", "identity"]
}


In [5]:
from transformers import AutoConfig, AutoModelForSequenceClassification
from chop.tools.utils import deepsetattr

track = set({})

def construct_model(trial):
    config = AutoConfig.from_pretrained(checkpoint)
    config.problem_type = "single_label_classification"
    ss_template = search_space

    for param in [
        "num_layers",
        "num_heads",
        "hidden_size",
        "intermediate_size"
    ]:
        chosen_idex = trial.suggest_int(param, 0, len(ss_template[param]) - 1)
        setattr(config, param, ss_template[param][chosen_idex])

    trial_model = AutoModelForSequenceClassification.from_config(config)

    for name, layer in trial_model.named_modules():
        if isinstance(layer, nn.Linear) and layer.in_features == layer.out_features:
            track.add(name)
            new_layer_cls = trial.suggest_categorical(
                f"{name}_type",
                ss_template["linear_layer_choices"],
            )

            if new_layer_cls == "linear":
                continue
            elif new_layer_cls == "identity":
                new_layer = Identity()
                deepsetattr(trial_model, name, new_layer)
            else:
                raise ValueError(f"Unkown layer type: {new_layer_cls}")

    return trial_model

In [6]:
from chop.tools import get_trainer

def objective(trial):
    model = construct_model(trial)

    trainer = get_trainer(
        model = model,
        tokenized_dataset = dataset,
        tokenizer = tokenizer,
        evaluate_metric = "accuracy",
        num_train_epochs = 1
    )

    trainer.train()

    eval_results = trainer.evaluate()

    trial.set_user_attr("model", model)

    return eval_results["eval_accuracy"]

In [7]:
import optuna
from optuna import Study
from optuna.samplers import GridSampler, RandomSampler, TPESampler

def getGridSearchSpace():
    grid_search_space = {
        "num_layers": [i for i in range(0, len(search_space['num_layers']))],
        "num_heads": [i for i in range(0, len(search_space['num_heads']))],
        "hidden_size": [i for i in range(0, len(search_space['hidden_size']))],
        "intermediate_size": [i for i in range(0, len(search_space['intermediate_size']))],
    }

    sampler = RandomSampler()

    study = optuna.create_study(
        direction="maximize",
        study_name="bert-tiny-nas-study",
        sampler=sampler
    )

    study.optimize(
        lambda trial: construct_model(trial=trial),
        n_trials = 100,
        timeout=60*60*24
    )

    for name in track :
        grid_search_space[f'{name}_type'] = ["linear", "identity"]

    return grid_search_space


In [8]:
# sampler = GridSampler(search_space=getGridSearchSpace()) # The sampler to use below
sampler = RandomSampler()
# sampler = TPESampler()

In [9]:
study = optuna.create_study(
    direction="maximize",
    study_name="bert-tiny-nas-study",
    sampler=sampler
)

study.optimize(
    objective,
    n_trials = 1,
    timeout=60*60*24
)

[I 2025-02-03 19:01:08,907] A new study created in memory with name: bert-tiny-nas-study
  trainer = Trainer(


Step,Training Loss
500,0.6913
1000,0.538
1500,0.4647
2000,0.4215
2500,0.392
3000,0.3839


[I 2025-02-03 19:02:04,117] Trial 0 finished with value: 0.84084 and parameters: {'num_layers': 1, 'num_heads': 1, 'hidden_size': 2, 'intermediate_size': 2, 'bert.encoder.layer.0.attention.self.query_type': 'identity', 'bert.encoder.layer.0.attention.self.key_type': 'identity', 'bert.encoder.layer.0.attention.self.value_type': 'linear', 'bert.encoder.layer.0.attention.output.dense_type': 'identity', 'bert.encoder.layer.1.attention.self.query_type': 'identity', 'bert.encoder.layer.1.attention.self.key_type': 'linear', 'bert.encoder.layer.1.attention.self.value_type': 'identity', 'bert.encoder.layer.1.attention.output.dense_type': 'identity', 'bert.pooler.dense_type': 'identity'}. Best is trial 0 with value: 0.84084.


In [11]:
from pathlib import Path
import dill

model = study.best_trial.user_attrs["model"].cpu()

# with open(f"{Path.home()}/mase/tasks/tutorial5/t5_best_model.pkl", "wb") as f:
    # dill.dump(model, f)

In [17]:
from chop.pipelines import CompressionPipeline
from chop import MaseGraph

mg = MaseGraph(model)
pipe = CompressionPipeline()

quantization_config = {
    "by": "type",
    "default": {
        "config": {
            "name": None,
        }
    },
    "linear": {
        "config": {
            "name": "integer",
            # data
            "data_in_width": 8,
            "data_in_frac_width": 4,
            # weight
            "weight_width": 8,
            "weight_frac_width": 4,
            # bias
            "bias_width": 8,
            "bias_frac_width": 4,
        }
    },
}

pruning_config = {
    "weight": {
        "sparsity": 0.5,
        "method": "l1-norm",
        "scope": "local",
    },
    "activation": {
        "sparsity": 0.5,
        "method": "l1-norm",
        "scope": "local",
    },
}

mg, _ = pipe(
    mg,
    pass_args={
        "quantize_transform_pass": quantization_config,
        "prune_transform_pass": pruning_config,
    },
)

`past_key_values` were not specified as input names, but model.config.use_cache = True. Setting model.config.use_cache = False.
[32mINFO    [0m [34mGetting dummy input for prajjwal1/bert-tiny.[0m


tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],


        [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])
tensor([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
   

[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_attention_self_value[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_intermediate_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_output_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_attention_self_key[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_intermediate_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_output_dense[0m
[32mINFO    [0m [34mPruning module: classifier[0m


In [12]:
import pandas as pd
from optuna import Study
from optuna.trial import FrozenTrial

def samplerTrial(sampler_name:str, trials:int, sampler, f=objective) -> None:

    data = [0]*trials

    def record_accuracy_callback(stud:Study, fzt:FrozenTrial):
        print(f"Trial: {fzt.number}, Accuracy: {fzt.value}")
        data[fzt.number-1] ={"n":fzt.number, "accuracy":fzt.value}

    if (sampler == None):
        raise RuntimeError("No Sampler Provided")

    study = optuna.create_study(
        direction="maximize",
        study_name=f"bert-tiny-nas-{sampler_name}-study",
        sampler=sampler
    )

    study.optimize(
        f,
        n_trials=trials,
        timeout=60*60*24,
        callbacks=[record_accuracy_callback]
    )

    df = pd.DataFrame(data)
    df.to_csv(f"{Path.home()}/mase/tasks/tutorial5/sampler_run_{sampler_name}.csv", index=False)


In [11]:
# samplerTrial(sampler_name="grid", trials=100, sampler=sampler)

In [89]:
# import matplotlib.pyplot as plt
# import pandas as pd
# from pathlib import Path
# import numpy as np

# df_r = pd.read_csv(f"{Path.home()}/mase/tasks/tutorial5/random.csv")
# df_t = pd.read_csv(f"{Path.home()}/mase/tasks/tutorial5/tpes.csv")
# df_g = pd.read_csv(f"{Path.home()}/mase/tasks/tutorial5/grid.csv")

# # Function to compute cumulative max accuracy
# def compute_cumulative_max(df):
#     df_sorted = df.sort_values(by='n')
#     df_sorted['max_accuracy'] = np.maximum.accumulate(df_sorted['accuracy'])
#     return df_sorted

# # Process data
# df_r = compute_cumulative_max(df_r)
# df_t = compute_cumulative_max(df_t)
# df_g = compute_cumulative_max(df_g)

# fig_pretrain = plt.figure(dpi=200)

# plt.scatter(x=df_r['n'], y=df_r['max_accuracy'] * 100, marker='x', s=20)
# plt.scatter(x=df_t['n'], y=df_t['max_accuracy'] * 100, marker='x', s=20)
# plt.scatter(x=df_g['n'], y=df_g['max_accuracy'] * 100, marker='x', s=20)
# plt.axhline(y=baseline * 100, linestyle='--', linewidth=0.8)
# lgd = plt.legend(['Random Sampler', 'TPES Sampler', 'Grid Sampler', 'Baseline'], bbox_to_anchor=(1.04, 1), loc="upper left")
# plt.xlabel("Trials")
# plt.ylabel("Accuracy %")
# plt.title("Trials vs Accuracy")
# plt.savefig(f"{Path.home()}/mase/tasks/tutorial5/samplers", bbox_extra_artists=(lgd,), bbox_inches='tight')
# plt.show()

In [18]:
import chop.passes as passes
from chop import MaseGraph

trials = 2

def objTrain(model, pre_evaluation=True):

    trainer = get_trainer(
        model = model,
        tokenized_dataset = dataset,
        tokenizer = tokenizer,
        evaluate_metric = "accuracy",
        num_train_epochs = 1
    )

    pre = {"eval_accuracy": 0 }

    if (pre_evaluation):
        pre = trainer.evaluate()

    trainer.train()
    eval_results = trainer.evaluate()

    return pre["eval_accuracy"], eval_results["eval_accuracy"]

data = [0] * trials

def pipeObjective(trial):
    model = construct_model(trial)

    mg = MaseGraph(
        model,
        hf_input_names=[
            "input_ids",
            "attention_mask",
            "labels"
        ]
    )

    mg, _ = passes.init_metadata_analysis_pass(mg)
    mg, _ = passes.add_common_metadata_analysis_pass(mg)

    _, post_t = objTrain(mg.model, False)

    mg.model = mg.model.cpu()
    quantization_config['by'] = "type"

    mg, _ = pipe(
            mg,
            pass_args={
                "quantize_transform_pass": quantization_config,
                "prune_transform_pass": pruning_config,
            }
        )

    post_comp_no_t, eval_acc = objTrain(mg.model)

    print(f"Trial:{trial.number}, FstTrain: {post_t}, Compressa: {post_comp_no_t}, SndTrain: {eval_acc}")

    data[trial.number] = {
        "n": trial.number,
        "fst_train": post_t,
        "compress": post_comp_no_t,
        "snd_train": eval_acc
    }

    return eval_acc

In [19]:

study = optuna.create_study(
    direction="maximize",
    study_name=f"bert-tiny-nas-study",
    sampler=sampler
)

study.optimize(
    pipeObjective,
    n_trials=trials,
    timeout=60*60*24
)

df = pd.DataFrame(data)
df.to_csv(f"/vol/bitbucket/oa321/mase/tasks/tutorial5/random_cp.csv", index=False)

[I 2025-02-03 19:05:32,752] A new study created in memory with name: bert-tiny-nas-study
`past_key_values` were not specified as input names, but model.config.use_cache = True. Setting model.config.use_cache = False.
[32mINFO    [0m [34mGetting dummy input for prajjwal1/bert-tiny.[0m


tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])
tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       

  trainer = Trainer(


Step,Training Loss
500,0.6946
1000,0.6194
1500,0.4714
2000,0.4176
2500,0.3925
3000,0.392


[32mINFO    [0m [34mGetting dummy input for prajjwal1/bert-tiny.[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_attention_self_key[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_attention_self_value[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_attention_output_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_intermediate_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_output_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_attention_self_value[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_intermediate_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_output_dense[0m
[32mINFO    [0m [34mPruning module: classifier[0m


tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])
tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       

  trainer = Trainer(


Step,Training Loss
500,0.3824
1000,0.3077
1500,0.3022
2000,0.3089
2500,0.3011
3000,0.324


[I 2025-02-03 19:07:56,093] Trial 0 finished with value: 0.86888 and parameters: {'num_layers': 1, 'num_heads': 1, 'hidden_size': 1, 'intermediate_size': 0, 'bert.encoder.layer.0.attention.self.query_type': 'identity', 'bert.encoder.layer.0.attention.self.key_type': 'linear', 'bert.encoder.layer.0.attention.self.value_type': 'linear', 'bert.encoder.layer.0.attention.output.dense_type': 'linear', 'bert.encoder.layer.1.attention.self.query_type': 'identity', 'bert.encoder.layer.1.attention.self.key_type': 'identity', 'bert.encoder.layer.1.attention.self.value_type': 'linear', 'bert.encoder.layer.1.attention.output.dense_type': 'identity', 'bert.pooler.dense_type': 'identity'}. Best is trial 0 with value: 0.86888.
`past_key_values` were not specified as input names, but model.config.use_cache = True. Setting model.config.use_cache = False.
[32mINFO    [0m [34mGetting dummy input for prajjwal1/bert-tiny.[0m


Trial:0, FstTrain: 0.8372, Compressa: 0.8118, SndTrain: 0.86888
tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])
tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1

  trainer = Trainer(


Step,Training Loss
500,0.692
1000,0.5379
1500,0.4333
2000,0.385
2500,0.3493
3000,0.3647


[32mINFO    [0m [34mGetting dummy input for prajjwal1/bert-tiny.[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_attention_self_key[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_attention_self_value[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_intermediate_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_0_output_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_attention_self_query[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_attention_output_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_intermediate_dense[0m
[32mINFO    [0m [34mPruning module: bert_encoder_layer_1_output_dense[0m
[32mINFO    [0m [34mPruning module: bert_pooler_dense[0m
[32mINFO    [0m [34mPruning module: classifier[0m


tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[ 101, 9932, 2089, 2202, 2058, 1996, 2088, 2028, 2154,  102],
        [ 101, 2023, 2003, 2339, 2017, 2323, 4553, 4748, 4877,  102]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]],


        [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]])
tensor([[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       

  trainer = Trainer(


Step,Training Loss
500,0.3424
1000,0.2827
1500,0.2997
2000,0.2983
2500,0.2852
3000,0.3157


[I 2025-02-03 19:10:49,005] Trial 1 finished with value: 0.87256 and parameters: {'num_layers': 0, 'num_heads': 3, 'hidden_size': 1, 'intermediate_size': 4, 'bert.encoder.layer.0.attention.self.query_type': 'identity', 'bert.encoder.layer.0.attention.self.key_type': 'linear', 'bert.encoder.layer.0.attention.self.value_type': 'linear', 'bert.encoder.layer.0.attention.output.dense_type': 'identity', 'bert.encoder.layer.1.attention.self.query_type': 'linear', 'bert.encoder.layer.1.attention.self.key_type': 'identity', 'bert.encoder.layer.1.attention.self.value_type': 'identity', 'bert.encoder.layer.1.attention.output.dense_type': 'linear', 'bert.pooler.dense_type': 'linear'}. Best is trial 1 with value: 0.87256.


Trial:1, FstTrain: 0.85388, Compressa: 0.77484, SndTrain: 0.87256


In [21]:
def plot(df_name:str) -> None:
    df = pd.read_csv(f"{Path.home()}/mase/tasks/tutorial5/{df_name}.csv")

    plt.scatter(x=df['n'], y=df['fst_train'] * 100, marker='x', s=20)
    plt.scatter(x=df['n'], y=df['compress'] * 100, marker='x', s=20)
    plt.scatter(x=df['n'], y=df['snd_train'] * 100, marker='x', s=20)
    plt.axhline(y=baseline * 100, linestyle='--', linewidth=0.8)
    plt.legend(['Pre-Compress', 'After Compressing', 'Post Compress Training', 'Baseline'], bbox_to_anchor=(1.04, 1), loc="upper left")
    plt.xlabel("Trials")
    plt.ylabel("Accuracy %")
    plt.title("Trials vs Accuracy")
    plt.savefig(f"{Path.home()}/mase/tasks/tutorial5/cp_{df_name}")
    plt.show()

In [23]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path


# plot('random_cp_partial')
# plot('tpes_cp')
