Bump version to 0.6.0 (#1023)
dakinggg committed Mar 12, 2024
1 parent 4e43792 commit 257c25d
Showing 8 changed files with 14 additions and 60 deletions.
8 changes: 2 additions & 6 deletions README.md
@@ -132,9 +132,7 @@ We *strongly* recommend working with LLM Foundry inside a Docker container (see
```bash
git clone https://github.com/mosaicml/llm-foundry.git
cd llm-foundry
pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
```

### Without Docker (not recommended)
@@ -152,9 +150,7 @@ source llmfoundry-venv/bin/activate

pip install cmake packaging torch # setup.py requires these be installed

pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
```
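Both install paths now pull Flash Attention v2 through the `gpu` extra. A minimal post-install sanity check, sketched under the assumption that the installed `flash-attn` package exposes `__version__` (recent releases do):

```python
# Minimal sanity-check sketch: confirm the "gpu" extra installed Flash Attention v2.
# Assumes flash-attn exposes __version__; adjust if your installed version does not.
import flash_attn

print(flash_attn.__version__)  # expect a 2.x release, e.g. 2.5.0
```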

### TransformerEngine and amp_fp8 support
13 changes: 2 additions & 11 deletions llmfoundry/__init__.py
@@ -19,11 +19,6 @@

hf_dynamic_modules_logger.addFilter(new_files_warning_filter)

-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-import transformers

from llmfoundry import optim, utils
from llmfoundry.data import (ConcatTokensDataset, MixtureOfDenoisersCollator,
NoConcatDataset, Seq2SeqFinetuningCollator,
@@ -33,18 +28,14 @@
ComposerHFT5)
from llmfoundry.models.layers.attention import (
MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
-flash_attn_fn, is_flash_v1_installed,
-scaled_multihead_dot_product_attention, triton_flash_attn_fn)
+flash_attn_fn, scaled_multihead_dot_product_attention, triton_flash_attn_fn)
from llmfoundry.models.layers.blocks import MPTBlock
from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
MPTForCausalLM, MPTModel, MPTPreTrainedModel)
from llmfoundry.tokenizers import TiktokenTokenizerWrapper

-if is_flash_v1_installed():
-transformers.utils.is_flash_attn_available = lambda: False

__all__ = [
'build_text_denoising_dataloader',
'build_finetuning_dataloader',
@@ -77,4 +68,4 @@
'TiktokenTokenizerWrapper',
]

-__version__ = '0.5.0'
+__version__ = '0.6.0'
7 changes: 0 additions & 7 deletions llmfoundry/models/layers/attention.py
@@ -44,13 +44,6 @@ def check_alibi_support(attention_impl: str) -> bool:
v2_version='v2.4.2')


-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-if is_flash_v1_installed():
-import transformers
-transformers.utils.is_flash_attn_available = lambda: False

from transformers.models.llama.modeling_llama import apply_rotary_pos_emb


8 changes: 0 additions & 8 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -9,7 +9,6 @@
from transformers import PretrainedConfig

from llmfoundry.models.layers.attention import (check_alibi_support,
-is_flash_v1_installed,
is_flash_v2_installed)
from llmfoundry.models.layers.blocks import attn_config_defaults

@@ -230,13 +229,6 @@ def _validate_config(self) -> None:
raise NotImplementedError(
'prefix_lm only implemented with torch and triton attention.')

-if self.attn_config['attn_impl'] == 'flash' and is_flash_v1_installed():
-warnings.warn(
-VersionedDeprecationWarning(
-'Support for Flash Attention v1 is deprecated. Please upgrade to Flash Attention v2.4.2. To install Flash Attention v2.4.2, please run `pip install -e ".[gpu-flash2]"` from the root directory of the llm-foundry repository.',
-remove_version='0.6.0',
-))

if self.attn_config[
'attn_impl'] == 'triton' and not self.attn_config['prefix_lm']:
warnings.warn(
9 changes: 1 addition & 8 deletions llmfoundry/models/mpt/modeling_mpt.py
@@ -27,8 +27,7 @@
from composer.utils import dist

from llmfoundry.metrics import TokenAccuracy
-from llmfoundry.models.layers.attention import (is_flash_v1_installed,
-is_flash_v2_installed)
+from llmfoundry.models.layers.attention import is_flash_v2_installed
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY

if is_flash_v2_installed():
@@ -39,12 +38,6 @@
except Exception as e:
raise e

-if is_flash_v1_installed():
-try: # This try...except is needed because transformers requires it despite the 'if' statement above
-from flash_attn import bert_padding
-except Exception as e:
-raise e

from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from transformers import PreTrainedModel, PreTrainedTokenizerBase
4 changes: 2 additions & 2 deletions mcli/mcli-llama2-finetune.yaml
@@ -38,7 +38,7 @@ parameters:
pretrained: true
# Note: you must have set the HUGGING_FACE_HUB_TOKEN environment variable and have access to the llama2 models
use_auth_token: true
-attention_patch_type: triton
+use_flash_attention_2: true

# Tokenizer
tokenizer:
@@ -62,7 +62,7 @@ parameters:
# # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
# # to profile this run's optimal packing_ratio as it depends on GPU count,
# # batch size, sequence length
-# packing_ratio:
+# packing_ratio: auto
drop_last: true
num_workers: 8
pin_memory: false
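For context on the model-block change above (`attention_patch_type: triton` replaced by `use_flash_attention_2: true`): instead of monkeypatching Llama attention, the config now requests Hugging Face Transformers' native Flash Attention 2 integration. A rough sketch of what that flag corresponds to at model-construction time; the checkpoint name is an assumption for illustration, llm-foundry's HF builder does this wiring internally, and newer transformers releases spell the same request as `attn_implementation='flash_attention_2'`:

```python
# Illustrative only -- not llm-foundry's actual builder code.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-hf',   # assumed checkpoint for this finetune YAML
    use_flash_attention_2=True,   # replaces the old attention_patch_type: triton route
)
```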
10 changes: 4 additions & 6 deletions setup.py
@@ -3,6 +3,7 @@

"""MosaicML LLM Foundry package setup."""

+import copy
import os
import re

@@ -98,16 +99,13 @@
'mosaicml[tensorboard]>=0.20.1,<0.21',
]

-extra_deps['gpu'] = [
-'flash-attn==1.0.9',
-# PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
-'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.9#subdirectory=csrc/xentropy',
-]

+# Flash 2 group kept for backwards compatibility
extra_deps['gpu-flash2'] = [
'flash-attn==2.5.0',
]

+extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])

extra_deps['peft'] = [
'mosaicml[peft]>=0.20.1,<0.21',
]
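The net effect of the setup.py change is that the `gpu` extra becomes an alias of `gpu-flash2`, so both now install Flash Attention v2 and the Flash Attention v1 pins (flash-attn 1.0.9 and the xentropy kernel) are gone. A minimal sketch of the resulting wiring, not the full setup.py:

```python
# Sketch of the extras layout after this commit: "gpu-flash2" is kept only for
# backwards compatibility and "gpu" is a deep copy of it.
import copy

extra_deps = {}

extra_deps['gpu-flash2'] = [
    'flash-attn==2.5.0',
]

extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])

assert extra_deps['gpu'] == extra_deps['gpu-flash2'] == ['flash-attn==2.5.0']
```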
15 changes: 3 additions & 12 deletions tests/models/layers/test_huggingface_flash.py
@@ -12,23 +12,14 @@
from composer.utils import reproducibility
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
+from transformers.models.llama.modeling_llama import LlamaAttention

from llmfoundry import COMPOSER_MODEL_REGISTRY
from llmfoundry.models.hf.hf_fsdp import rgetattr
-from llmfoundry.models.layers.attention import (is_flash_v1_installed,
-is_flash_v2_installed)
-from llmfoundry.utils.builders import build_tokenizer

-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-if is_flash_v1_installed():
-transformers.utils.is_flash_attn_available = lambda: False

-from transformers.models.llama.modeling_llama import LlamaAttention

+from llmfoundry.models.layers.attention import is_flash_v2_installed
from llmfoundry.models.layers.llama_attention_monkeypatch import (
llama_attention_patch_torch, llama_attention_patch_triton)
+from llmfoundry.utils.builders import build_tokenizer


@pytest.mark.parametrize('patch_fn_name', ['torch', 'triton'])
