# Tuning LoRA Hyperparameters - Layers/Memory

## Environment Setup

In [1]:
# Load the autoreload extension and set it to mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
%%capture
# The %%capture cell magic above captures this cell's output instead of displaying it.
import sys
# Add the local 'src' directory to the module search path.
sys.path.append('src')

import boto3

import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.tuner import HyperparameterTuner

import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Project-local helpers.
# NOTE(review): several of these names are not referenced in the visible cells — confirm before pruning.
from src.util import (
    count_parameters, 
    calc_combinations
)
from src.nb_helper import (
    display_tuning_jobs,
    get_default_estimator_parameters,
    p,
    capture_results,
    graph_results,
    roberta_total, 
    roberta_learnable
)
from amtviz import visualize_tuning_job
import altair as alt
# Low-level SageMaker API client; used below to describe the base tuning job.
sm = boto3.client('sagemaker')

## Experiments

Let's run some experiments. All of the experiments share one learning rate (and weight decay / dropout) as a common basis: the learning rate we found with the base LoRA tuning job.

In [3]:
# We continue to use the results of the base lora tuner
%store -r
assert base_lora_tuner_name
base_lora_tuner = HyperparameterTuner.attach(base_lora_tuner_name)
base_lora_tuner_name

'lora-base-tuning-231209-0038'

In [4]:
# Block until the base tuning job has finished, so a best training job can be read.
base_lora_tuner.wait()

# The tuner was attached by name, so that name can be passed directly; no extra
# describe() round trip is needed just to look the name up again.
desc = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=base_lora_tuner_name)
print(desc['HyperParameterTuningJobName'], '=>', desc['HyperParameterTuningJobStatus'])

# The tuned values arrive as strings, so keep the raw mapping under its own name.
raw_best_hyperparameters = desc['BestTrainingJob']['TunedHyperParameters']
print('Best parameters found:', raw_best_hyperparameters)

# Strip any double quotes and convert every value to float.
# Note that integer-valued parameters (e.g. the epoch count) become floats as well.
best_hyperparameters = {
    name: float(value.replace('"', ''))
    for name, value in raw_best_hyperparameters.items()
}
best_hyperparameters

!
lora-base-tuning-231209-0038 => Completed
Best parameters found: {'sst2-epochs': '14', 'sst2-learning-rate': '0.001446695925631095'}


{'sst2-epochs': 14.0, 'sst2-learning-rate': 0.001446695925631095}

In [21]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, TuningJobCompletionCriteriaConfig

# Hyperparameters that stay fixed for every training job of this tuning job.
fixed_hyperparameters = {
    'sst2-lora-config': 'none',
    'sst2-epochs': 10,
    'sst2-learning-rate': 3e-5,
}
vertical_estimator = PyTorch(**get_default_estimator_parameters())
vertical_estimator.set_hyperparameters(**fixed_hyperparameters)

# Search space: gradient checkpointing off/on, crossed with a 'dummy' dimension
# of n_trials values (presumably only there to repeat each setting n_trials times).
n_trials = 3
hpt_ranges = {
    'use-gradient-checkpointing': CategoricalParameter([0, 1]),
    # 'empty-cuda-cache': CategoricalParameter([0, 1]),  # currently disabled
    'dummy': CategoricalParameter(list(range(n_trials))),
}

# Start from the shared tuner defaults, then let the job-specific settings override them.
shared_tuner_parameters = p('tuner_parameters')
job_specific_parameters = {
    'strategy': 'Grid',
    'hyperparameter_ranges': hpt_ranges,
    'metric_definitions': p('metric_definitions'),
    'estimator': vertical_estimator,
    'base_tuning_job_name': 'lora-layers',
    'max_jobs': calc_combinations(hpt_ranges),
    'max_parallel_jobs': 10,
}
tuner_parameters = {**shared_tuner_parameters, **job_specific_parameters}

lora_layers_tuner = HyperparameterTuner(**tuner_parameters)
# Do not block here; a later cell waits for the job.
lora_layers_tuner.fit(wait=False)

Using provided s3_resource


### Analysis

#### LoRA Layers

In [22]:
# Block until the tuning job launched above has finished; progress is printed while waiting.
lora_layers_tuner.wait()

................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!


In [32]:
# Metrics to collect for every training job of the tuning job.
tracked_metrics = [
    "sst2_valid_acc",
    "train_loss",
    "learnable_parameters",
    "learning_rate",
    "gpu_memory",
    "epoch",
    "train_samples_sec",
]

# Returns the chart plus two data frames: one for the trials, one with the full metric data.
graph, trials_df, full_df = visualize_tuning_job(
    lora_layers_tuner, job_metrics=tracked_metrics, advanced=True, return_dfs=True
)

Tuning job lora-layers-231213-1504   status: Completed

Number of training jobs with valid objective: 6
Lowest: 0.9438073635101318 Highest 0.9506880640983582


Unnamed: 0,dummy,use-gradient-checkpointing,TrainingJobName,TrainingJobStatus,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds,TuningJobName,sst2_valid_acc
1,2,0,lora-layers-231213-1504-005-a167fa85,Completed,2023-12-13 15:05:58+01:00,2023-12-13 15:24:09+01:00,1091.0,lora-layers-231213-1504,0.950688
0,2,1,lora-layers-231213-1504-006-7842bbe8,Completed,2023-12-13 15:05:59+01:00,2023-12-13 15:27:06+01:00,1267.0,lora-layers-231213-1504,0.947248
2,1,1,lora-layers-231213-1504-004-5de18de4,Completed,2023-12-13 15:05:52+01:00,2023-12-13 15:27:09+01:00,1277.0,lora-layers-231213-1504,0.944954
5,0,0,lora-layers-231213-1504-001-b4d02132,Completed,2023-12-13 15:05:46+01:00,2023-12-13 15:43:04+01:00,2238.0,lora-layers-231213-1504,0.944954
3,1,0,lora-layers-231213-1504-003-784f4dcf,Completed,2023-12-13 15:05:50+01:00,2023-12-13 15:24:01+01:00,1091.0,lora-layers-231213-1504,0.943807
4,0,1,lora-layers-231213-1504-002-3814a68a,Completed,2023-12-13 15:05:50+01:00,2023-12-13 15:27:02+01:00,1272.0,lora-layers-231213-1504,0.943807


Cache Hit/Miss: HHHHHH


In [33]:
# Display the chart object returned by visualize_tuning_job.
graph

In [25]:
# Disabled: would add a 'layer' column holding the number extracted from 'sst2-lora-config', plus one.
# NOTE(review): this tuning job does not vary 'sst2-lora-config', so that column is presumably
# absent from full_df — confirm before re-enabling.
#full_df['layer'] =  full_df['sst2-lora-config'].str.extract('(\d+)').astype(int)+1

In [30]:
# One bar chart per objective: the metric, averaged over the repeated trials,
# for each gradient-checkpointing setting.
objectives = ["gpu_memory", "sst2_valid_acc", "train_samples_sec"]
for objective in objectives:
    # Reduce each training job to its maximum recorded value of this metric.
    # Only the two columns the chart reads are aggregated: a max over the whole
    # frame would also reduce unrelated columns and can fail on mixed-type ones.
    # No sorting is needed beforehand because max() does not depend on row order.
    per_job_max = (
        full_df[full_df.label == objective]
        .groupby('TrainingJobName')[['value', 'use-gradient-checkpointing']]
        .max()
    )

    display(f'---- {objective} ----')
    display(
        alt.Chart(per_job_max).mark_bar(color='orange', size=18).encode(
            # Average the per-job values within each checkpointing setting.
            y=alt.Y('mean(value):Q', title=objective),
            x=alt.X('use-gradient-checkpointing:N'),
        )
    )

'---- gpu_memory ----'

'---- sst2_valid_acc ----'

'---- train_samples_sec ----'

In [28]:
# Show the installed Altair version.
alt.__version__

'5.0.1'

In [31]:
# NOTE(review): altair is already imported in the setup cell, so this re-import is redundant.
import altair as alt
# Select Altair's 'default' renderer.
# NOTE(review): this cell sits after the chart cells, so on a top-to-bottom run it only takes
# effect after they have been displayed — consider moving it to the setup cell.
alt.renderers.enable('default')

RendererRegistry.enable('default')