# Tuning LoRA Hyperparameters - Experiments Rank, 2nd Attempt

## Environment Setup

In [2]:
# Reload imported modules automatically before each cell executes, so edits to the
# project's helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
%%capture
import sys
sys.path.append('src')

import boto3

import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.tuner import HyperparameterTuner

import warnings
warnings.filterwarnings('ignore')

from src.util import (
    count_parameters, 
    calc_combinations
)
from src.nb_helper import (
    display_tuning_jobs,
    get_default_estimator_parameters,
    p,
    capture_results,
    graph_results,
    roberta_total, 
    roberta_learnable
)
from amtviz import visualize_tuning_job
import altair as alt
sm = boto3.client('sagemaker')

## Experiments

Let's try some experiments. For all of the experiments we use one learning rate (and weight decay / dropout) as a basis. We use the learning rate we found with the tuning job.

In [4]:
# We continue to use the results of the base lora tuner
%store -r
assert base_lora_tuner_name
base_lora_tuner = HyperparameterTuner.attach(base_lora_tuner_name)
base_lora_tuner_name

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/sagemaker/config.yaml


'lora-base-tuning-231031-1241'

In [5]:
# Block until the base tuning job has finished, then fetch its description.
base_lora_tuner.wait()
tuning_job_name = base_lora_tuner.describe()['HyperParameterTuningJobName']
desc = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuning_job_name)
print(desc['HyperParameterTuningJobName'], '=>', desc['HyperParameterTuningJobStatus'])

# The tuned values of the best training job are reported as strings.
tuned_raw = desc['BestTrainingJob']['TunedHyperParameters']
print('Best parameters found:', tuned_raw)

# Strip any embedded double quotes and convert every value to float.
best_hyperparameters = {}
for name, raw_value in tuned_raw.items():
    best_hyperparameters[name] = float(raw_value.replace('"', ''))
best_hyperparameters

!
lora-base-tuning-231031-1241 => Completed
Best parameters found: {'sst2-learning-rate': '0.0017805837777105818'}


{'sst2-learning-rate': 0.0017805837777105818}

### Another Shot At Tuning Rank

One reason why we did not see much of an impact when tuning `r` could be that the `learning rate` is very sensitive to changes of `r`, but we always used the same learning rate. Let's find out what happens if we run separate tuning jobs for `r=2`, `r=4`, `r=8`, `r=16`, `r=32`, `r=64`.

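We use a hyperparameter search (the code below configures AMT's `Random` strategy) to find a good learning rate for each of these individual scenarios. We expect that the `learning rate` cannot be that different. Hence we use a search range from `95%` to `105%` of the `learning rate` from our LoRA base tuning. In addition to the learning rate, the search space below also includes the `clf-droput` hyperparameter.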

If it turns out that AMT clusters trials at the lower or upper bound of the learning rate range we specified, then we would need to run another experiment.

Also, if the performance improves dramatically as a result of tuning `r` and the best result lies at the upper or lower bound of the tested values, we should run another experiment to verify where exactly the point of saturation is.

In [17]:
from sagemaker.tuner import ContinuousParameter, CategoricalParameter

# Number of training jobs each tuning job is allowed to run (used as `max_jobs`).
n_trials = 7

# Search the learning rate between 95% and 105% of the value found by the base tuning job.
base_lr = best_hyperparameters['sst2-learning-rate']
lr_range = ContinuousParameter(base_lr * 0.95, base_lr * 1.05)

# NOTE(review): 'clf-droput' looks like a misspelling of "dropout". The key is kept
# unchanged because it has to match the hyperparameter name the training script
# reads -- confirm against the training code before renaming.
hpt_ranges = {
    'sst2-learning-rate': lr_range,
    'clf-droput': CategoricalParameter([0.0, 0.01, 0.1, 0.5]),
}

In [18]:
# Launch one tuning job per LoRA rank. The jobs are started without waiting for
# completion; each (rank, tuner) pair is remembered for the analysis cells.
r_tuning_jobs = []
for r in [2, 4, 8, 16, 32, 64]:
    estimator = PyTorch(**get_default_estimator_parameters())

    # Static hyperparameters: the LoRA settings for this rank plus the best
    # values found by the base tuning job.
    static_hyperparameters = {'sst2-lora-config': 'all', 'sst2-lora-r': r}
    static_hyperparameters.update(best_hyperparameters)
    estimator.set_hyperparameters(**static_hyperparameters)

    # Project-wide tuner defaults, overridden by the settings of this experiment.
    tuner_parameters = dict(p('tuner_parameters') | {
        'strategy': 'Random',
        'hyperparameter_ranges': hpt_ranges,
        'metric_definitions': p('metric_definitions'),
        'estimator': estimator,
        'base_tuning_job_name': f'r{r:02d}',
        'max_jobs': n_trials,
        'max_parallel_jobs': 1,
    })

    tuner = HyperparameterTuner(**tuner_parameters)
    tuner.fit(wait=False)
    r_tuning_jobs.append((r, tuner))

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/sagemaker/config.yaml
Using provided s3_resource
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/sagemaker/config.yaml
Using provided s3_resource
sagemaker.config INFO - Not applyi

In [24]:
# Print the name of every per-rank tuning job and block until it has finished.
for _rank, rank_tuner in r_tuning_jobs:
    job_description = rank_tuner.describe()
    print(job_description['HyperParameterTuningJobName'])
    rank_tuner.wait()

r02-231031-1457
!
r04-231031-1457
!
r08-231031-1457
!
r16-231031-1457
!
r32-231031-1457
!
r64-231031-1457
!


In [25]:
# Visualize all per-rank tuning jobs together and keep the returned data frames
# (one for the trials, one for the full metric data) for the analysis below.
tracked_metrics = ['train_loss', 'valid_loss', 'learnable_parameters']
rank_tuners = [tuner_for_rank for _rank, tuner_for_rank in r_tuning_jobs]

graph, trials_df, full_df = visualize_tuning_job(
    rank_tuners,
    return_dfs=True,
    job_metrics=tracked_metrics,
    advanced=True,
)
graph

Tuning job r08-231031-1457           status: Completed
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/sagemaker/config.yaml
Tuning job r32-231031-1457           status: Completed
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Preferences/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/mkamp/Library/Preferences/

Unnamed: 0,clf-droput,sst2-learning-rate,TrainingJobName,TrainingJobStatus,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds,TuningJobName,sst2_valid_acc
1,0.01,0.001792,r08-231031-1457-006-4e903bb7,Completed,2023-10-31 16:07:22+01:00,2023-10-31 16:20:27+01:00,785.0,r08-231031-1457,0.947248
4,0.1,0.001702,r16-231031-1457-003-f34a6849,Completed,2023-10-31 15:26:02+01:00,2023-10-31 15:37:31+01:00,689.0,r16-231031-1457,0.946101
4,0.1,0.001693,r04-231031-1457-003-e6c0628b,Completed,2023-10-31 15:24:07+01:00,2023-10-31 15:34:46+01:00,639.0,r04-231031-1457,0.943807
0,0.01,0.001858,r16-231031-1457-007-03e3a57a,Completed,2023-10-31 16:18:02+01:00,2023-10-31 16:29:36+01:00,694.0,r16-231031-1457,0.941514
1,0.1,0.00173,r16-231031-1457-006-282fdb19,Completed,2023-10-31 16:05:43+01:00,2023-10-31 16:16:22+01:00,639.0,r16-231031-1457,0.940367
0,0.1,0.001741,r08-231031-1457-007-1d380729,Completed,2023-10-31 16:22:47+01:00,2023-10-31 16:33:26+01:00,639.0,r08-231031-1457,0.93922
3,0.5,0.001798,r32-231031-1457-004-013464c1,Completed,2023-10-31 15:34:11+01:00,2023-10-31 15:50:52+01:00,1001.0,r32-231031-1457,0.93922
5,0.01,0.001726,r04-231031-1457-002-78ef0c1e,Completed,2023-10-31 15:09:23+01:00,2023-10-31 15:20:57+01:00,694.0,r04-231031-1457,0.93922
6,0.1,0.001694,r16-231031-1457-001-65bd80dd,Completed,2023-10-31 14:59:08+01:00,2023-10-31 15:10:19+01:00,671.0,r16-231031-1457,0.93922
2,0.1,0.001857,r08-231031-1457-005-690febea,Completed,2023-10-31 15:55:14+01:00,2023-10-31 16:05:54+01:00,640.0,r08-231031-1457,0.93922


Cache Hit/Miss: HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH


In [26]:
# Search space for the learning rates: summary statistics of the learning rates
# that were tried, per tuning job.
lr_statistics = ['median', 'mean', 'min', 'max', 'std']
trials_per_job = trials_df.groupby('TuningJobName')
trials_per_job.agg({'sst2-learning-rate': lr_statistics})

Unnamed: 0_level_0,sst2-learning-rate,sst2-learning-rate,sst2-learning-rate,sst2-learning-rate,sst2-learning-rate
Unnamed: 0_level_1,median,mean,min,max,std
TuningJobName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
r02-231031-1457,0.001835,0.001813,0.00173,0.001857,4.9e-05
r04-231031-1457,0.001768,0.001771,0.001693,0.001844,5.7e-05
r08-231031-1457,0.001758,0.001776,0.001733,0.001857,4.3e-05
r16-231031-1457,0.00173,0.001752,0.001694,0.001858,6.4e-05
r32-231031-1457,0.001799,0.001795,0.001709,0.001835,4.4e-05
r64-231031-1457,0.001759,0.001778,0.00172,0.001853,6e-05


In [27]:
# The resulting learning rates (p80) are all within one std.
# Quantiles are only defined for numeric columns, so select them explicitly instead
# of relying on pandas silently dropping the non-numeric ones (pandas >= 2.0 raises
# in that case). No sorting is needed: a quantile does not depend on row order.
numeric_columns = list(trials_df.select_dtypes('number').columns)
trials_df.groupby('TuningJobName')[numeric_columns].quantile(0.8)

Unnamed: 0_level_0,clf-droput,sst2-learning-rate,TrainingElapsedTimeSeconds,sst2_valid_acc
TuningJobName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
r02-231031-1457,0.082,0.001848,697.0,0.936468
r04-231031-1457,0.1,0.001824,698.0,0.938761
r08-231031-1457,0.1,0.001794,765.0,0.93922
r16-231031-1457,0.1,0.001807,685.4,0.941284
r32-231031-1457,0.42,0.001827,788.0,0.936468
r64-231031-1457,0.1,0.00184,532.4,0.906881


Maybe we would need to zoom out to see the effect?

In [28]:
# Keep only the per-rank tuning jobs (their names start with 'r') and, of those,
# only the rows that carry the 'learnable_parameters' metric. The .copy() makes the
# result an independent frame, so adding the 'r' column below does not write into a
# slice of the original data.
full_df = full_df[full_df['TuningJobName'].str.startswith('r')]
full_df = full_df[full_df.label == 'learnable_parameters'].copy()

# Extract r out of the name of the tuning job, e.g. 'r08-231031-1457' -> 8.
# Raw string: '\d' is not a valid escape sequence in a regular string literal.
full_df['r'] = full_df['TuningJobName'].str.extract(r'r(\d+)-', expand=False).astype('int')

# Maximum of every numeric column per r. The first positional argument of
# GroupBy.max is `numeric_only`, so it is passed explicitly by keyword here.
learnable_df = full_df.groupby('r').max(numeric_only=True).reset_index()

# Learnable parameters relative to `roberta_learnable` (original author's note: 125 Million parameters).
learnable_df['relative_percent'] = learnable_df['value'] / roberta_learnable * 100
learnable_df

Unnamed: 0,r,value,clf-droput,sst2-learning-rate,TrainingElapsedTimeSeconds,sst2_valid_acc,relative_percent
0,2,923906.0,0.1,0.001857,780.0,0.938073,0.741217
1,4,1255682.0,0.1,0.001844,700.0,0.943807,1.007389
2,8,1919234.0,0.1,0.001857,785.0,0.947248,1.539733
3,16,3246338.0,0.1,0.001858,694.0,0.946101,2.604422
4,32,5900546.0,0.5,0.001835,1001.0,0.93922,4.733799
5,64,11208962.0,0.5,0.001853,644.0,0.913991,8.992552


In [29]:
# Number of best trials to keep per r
k = 5

# Filter out the base tuning job. The .copy() makes t_df an independent frame, so
# adding the 'r' column below does not write into a slice of trials_df.
t_df = trials_df[trials_df['TuningJobName'].str.startswith('r')].copy()

# Extract r out of the name of the TuningJob, e.g. 'r08-231031-1457' -> 8.
# Raw string: '\d' is not a valid escape sequence in a regular string literal.
t_df['r'] = t_df['TuningJobName'].str.extract(r'r(\d+)-', expand=False).astype('int')

# Sort by r and performance
t_df = t_df.sort_values(by=['r', 'sst2_valid_acc'], ascending=[True, False])

# Just keep the top k trials: the k trials with the best objective metric per r.
# GroupBy.head keeps the rows in their (sorted) order and always leaves 'r' as a
# regular column, independent of the pandas version.
topk_df = t_df.groupby('r').head(k).reset_index(drop=True)
# Show 'r' as the first column.
topk_df.insert(0, 'r', topk_df.pop('r'))

# Summarize
summary_df = topk_df.groupby('r').agg({
    'sst2_valid_acc': ['mean', 'std'],
    'sst2-learning-rate': ['mean', 'std'],
})
summary_df

Unnamed: 0_level_0,sst2_valid_acc,sst2_valid_acc,sst2-learning-rate,sst2-learning-rate
Unnamed: 0_level_1,mean,std,mean,std
r,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,0.935321,0.002236,0.001816,5e-05
4,0.938073,0.003626,0.00178,6.6e-05
8,0.940826,0.00359,0.001788,4.5e-05
16,0.941055,0.003098,0.001736,6.9e-05
32,0.935321,0.002762,0.001794,5e-05
64,0.905734,0.005583,0.00178,6.5e-05


In [30]:
# Inspect the individual top-k trials per r that the summary and the charts are based on.
topk_df

Unnamed: 0,r,clf-droput,sst2-learning-rate,TrainingJobName,TrainingJobStatus,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds,TuningJobName,sst2_valid_acc
0,2,0.01,0.001835,r02-231031-1457-001-94e4461a,Completed,2023-10-31 14:58:59+01:00,2023-10-31 15:09:39+01:00,640.0,r02-231031-1457,0.938073
1,2,0.01,0.00173,r02-231031-1457-004-f61012e3,Completed,2023-10-31 15:38:05+01:00,2023-10-31 15:48:44+01:00,639.0,r02-231031-1457,0.936927
2,2,0.1,0.001821,r02-231031-1457-003-8288c140,Completed,2023-10-31 15:25:56+01:00,2023-10-31 15:36:40+01:00,644.0,r02-231031-1457,0.934633
3,2,0.1,0.001857,r02-231031-1457-005-ad6f390c,Completed,2023-10-31 15:50:22+01:00,2023-10-31 16:02:01+01:00,699.0,r02-231031-1457,0.934633
4,2,0.01,0.001838,r02-231031-1457-002-1a4dbf24,Completed,2023-10-31 15:11:13+01:00,2023-10-31 15:22:42+01:00,689.0,r02-231031-1457,0.932339
5,4,0.1,0.001693,r04-231031-1457-003-e6c0628b,Completed,2023-10-31 15:24:07+01:00,2023-10-31 15:34:46+01:00,639.0,r04-231031-1457,0.943807
6,4,0.01,0.001726,r04-231031-1457-002-78ef0c1e,Completed,2023-10-31 15:09:23+01:00,2023-10-31 15:20:57+01:00,694.0,r04-231031-1457,0.93922
7,4,0.0,0.001844,r04-231031-1457-005-71c1ab19,Completed,2023-10-31 15:50:56+01:00,2023-10-31 15:59:40+01:00,524.0,r04-231031-1457,0.936927
8,4,0.1,0.001808,r04-231031-1457-006-c7b33dad,Completed,2023-10-31 16:01:22+01:00,2023-10-31 16:12:01+01:00,639.0,r04-231031-1457,0.93578
9,4,0.01,0.001828,r04-231031-1457-007-c9586b83,Completed,2023-10-31 16:13:46+01:00,2023-10-31 16:25:25+01:00,699.0,r04-231031-1457,0.934633


In [31]:
def _layered_chart_vs_r(field, title):
    """Build a layered chart of `field` over r from the top-k trials in `topk_df`.

    Layers: a line through the per-r mean, an error bar per r, and the individual
    trials as red circles.
    """
    mean_line = (
        alt.Chart(topk_df, title=title)
        .mark_line(opacity=0.5)
        .encode(
            x=alt.X('r:Q'),
            y=alt.Y(f'mean({field}):Q', scale=alt.Scale(zero=False)),
        )
    )
    # NOTE(review): no `extent` is passed, so Altair's default error-bar extent
    # applies (documented as 'stderr'); pass extent='stdev' if the standard
    # deviation was intended -- confirm.
    error_bars = mean_line.mark_errorbar(color='grey', opacity=0.75).encode(
        y=alt.Y(f'{field}:Q', scale=alt.Scale(zero=False)),
    )
    trial_points = error_bars.mark_circle(color='red')
    return mean_line + error_bars + trial_points


# One layered chart per metric of interest.
graphs = [
    _layered_chart_vs_r(field, title)
    for field, title in [
        ('sst2_valid_acc', 'Performance vs r'),
        ('sst2-learning-rate', 'Learning Rate vs r'),
    ]
]

# Bar chart of the learnable parameters relative to the model size, per r.
learnable_bars = alt.Chart(learnable_df, title='Tunable Parameters vs r', width=300).mark_bar(color='orange')
learnable_chart = learnable_bars.encode(
    x=alt.X('r:O'),
    y=alt.Y('relative_percent:Q', title='parameters (%)'),
)

# Show all charts side by side.
alt.hconcat(*graphs, learnable_chart)

In [32]:
import altair as alt

In [33]:
# Select Altair's 'default' renderer for displaying charts.
# NOTE(review): this runs after the chart cell; if the renderer choice matters for
# the chart, it presumably belongs in the setup section -- confirm.
alt.renderers.enable('default')

RendererRegistry.enable('default')