# cosmo_1b.yaml
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 use_trainer.py --config-file ./examples/loubna/config_llama_1b.yaml
general:
  project: train_synthetic
  ignore_sanity_checks: true
  run: 1b_final_150b_3
  seed: 1234
checkpoints:
  checkpoints_path: /fsx/loubna/checkpoints/1b_final_150b_v2_3
  checkpoint_interval: 5000
  resume_checkpoint_path: s3://synthetic-project-models/1b_final_150b_v2_3
  checkpoints_path_is_shared_file_system: false
  kill_switch_path: ./kill_1b_final_150b_v2_3
lighteval: null
profiler: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
s3_upload:
  upload_s3_path: s3://synthetic-project-models/1b_final_150b_v2_3
  remove_after_upload: true
  s5cmd_numworkers: 15
  s5cmd_concurrency: 6
  s5cmd_path: /fsx/nouamane/miniconda/envs/2-1-cu121/bin/s5cmd
parallelism:
  # 20 nodes
  dp: 160
  pp: 1
  tp: 1
  pp_engine: 1f1b
  tp_mode: REDUCE_SCATTER
  tp_linear_async_communication: true
  # recompute_granularity: SELECTIVE
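  # World size: dp * pp * tp = 160 * 1 * 1 = 160 processes, i.e. 20 nodes at 8 GPUs each,
  # consistent with the "20 nodes" note above and the torchrun --nproc_per_node=8 launch line.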
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.022
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 16
    num_hidden_layers: 24
    num_key_value_heads: 16
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: false
    use_cache: true
    vocab_size: 32000
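    # Rough parameter count for this Llama architecture (excluding norm weights):
    # 24 layers * (4 * 2048^2 attention + 3 * 2048 * 8192 MLP) ~= 1.61B,
    # plus untied embeddings + lm_head at 2 * 32000 * 2048 ~= 0.13B, so ~1.7B total.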
tokenizer:
  tokenizer_name_or_path: "mistralai/Mistral-7B-v0.1"
  tokenizer_max_length: null
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  val_check_interval: 1
  limit_val_batches: 1
  micro_batch_size: 4 # GBS=1.3M tokens
  sequence_length: 2048
  train_steps: 140000 # ~183B tokens (140000 * 160 * 4 * 2048)
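# Global batch size = dp * micro_batch_size * batch_accumulation_per_replica * sequence_length
#   = 160 * 4 * 1 * 2048 = 1,310,720 tokens per step (the "GBS=1.3M tokens" note above);
# over 140000 steps that amounts to roughly 183B training tokens.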
optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 3.0e-4
    lr_decay_steps: 140000
    lr_decay_style: cosine
    lr_warmup_steps: 2000
    lr_warmup_style: linear
    min_decay_lr: 3.0e-5
  torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
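# Schedule implied by the settings above: linear warmup from 0 to 3.0e-4 over the first
# 2000 steps, then cosine decay down to 3.0e-5 by step 140000, assuming the usual
# cosine form lr(t) = min_lr + 0.5 * (peak_lr - min_lr) * (1 + cos(pi * progress)).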
experiment_logger:
  tensorboard_logger:
    flush_secs: 30
    tensorboard_dir: /fsx/loubna/checkpoints/logs/tensorboard/1b_final_150b_v2_3
  wandb_logger:
    wandb_project: synthetic_data_models_4
    wandb_entity: loubnabnl
brrr_data:
  seed: 1234
  num_loading_workers: 1
  dataset:
    index_mapping_dir: null # path for saving index mapping .npy files; defaults to the same location as data_prefix
    splits_string: 9999,1,0 # train, val, test (normalized by their sum)
    skip_warmup: true
    dataloader_type: single # cyclic
    validation_drop_last: true # set to false if the last partial validation batch should be consumed
    eod_mask_loss: false # mask the loss on end-of-document tokens
    no_seqlen_plus_one_input_tokens: false # set to true to fetch (sequence_length) input tokens instead of (sequence_length + 1) and mask the last token
    pad_samples_to_global_batch_size: false # set to true to pad the last partial batch with -1's up to the global batch size
    # ultrachat upsampled 3 times
    data_prefix:
      - 20
      - /fsx/synthetic_data/tokenized_textbooks_20B_dedup_bigcode_decontaminated/tokenized_completion_document
      - 1.3
      - /fsx/synthetic_data/code_tokenized/amt_web/tokenized_text_document
      - 2.4
      - /fsx/synthetic_data/code_tokenized/amt_python/tokenized_text_document
      - 3
      - /fsx/synthetic_data/code_tokenized/notebooks/tokenized_script_document
      - 5.1
      - /fsx/synthetic_data/tokenized_extras/stories_ultrachat/tokenized_completion_document
      - 3.6
      - /fsx/synthetic_data/tokenized_extras/stories_openhermes/tokenized_completion_document
      - 2.4
      - /fsx/synthetic_data/tokenized_extras/amt_khanacademy/tokenized_completion_document
      - 0.54
      - /fsx/synthetic_data/tokenized_instruct/ultrachat/tokenized_train_prompt_document
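    # data_prefix alternates weight/path entries (Megatron-style); the weights are
    # presumably normalized by their sum (20 + 1.3 + 2.4 + 3 + 5.1 + 3.6 + 2.4 + 0.54 = 38.34),
    # so the synthetic textbooks account for roughly 20 / 38.34 ~= 52% of the mixture.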
lighteval:
  batch_size: 16
  checkpoints_path: null
  generation: null
  logging:
    hub_repo_details: null
    hub_repo_results: null
    hub_repo_tensorboard: loubnabnl/tensorboard_1b_final_150b_v2-3
    local_output_path: /scratch/loubna/lighteval/synthetic_model-1b_final_150b_v2-3
    push_details_to_hub: null
    push_results_to_hub: null
    push_results_to_tensorboard: true
    tensorboard_metric_prefix: e
  parallelism:
    dp: 8
    pp: 1
    pp_engine: 1f1b
    recompute_granularity: null
    tp: 1
    tp_linear_async_communication: false
    tp_mode: ALL_REDUCE
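  # Evaluation runs data-parallel over dp * pp * tp = 8 * 1 * 1 = 8 processes,
  # which fits on a single 8-GPU node per lighteval job.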
  slurm_script_dir: /fsx/loubna/logs/synthetic_training/logs_evals/1b_final_150b
  slurm_template: /fsx/loubna/projects/training/brrr/examples/loubna/run_eval.slurm.jinja
  tasks:
    custom_tasks: brrr.lighteval.custom_tasks
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: early-signal
  wandb:
    wandb_entity: loubnabnl
    wandb_project: synthetic_data_evaluations
    wandb_run_name: 1b_150b_3