diff --git a/3rdparty/tvm b/3rdparty/tvm index 07648907e..240802497 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 07648907e1678ec2b84d8ec579b2ec8f4925d218 +Subproject commit 2408024972b9199683491871329829d98b59dc5c diff --git a/bitblas/gpu/matmul_analysis.py b/bitblas/gpu/matmul_analysis.py index 1d0889fa3..36cba1969 100644 --- a/bitblas/gpu/matmul_analysis.py +++ b/bitblas/gpu/matmul_analysis.py @@ -623,7 +623,8 @@ def check_last_trait(region: List[Range]): # Currently, we only support block reduction depth 2 for small M # When the func is a dequantize like ops, we should consider the M require_block_reduce = False - if hasattr(func.attrs, "dequantize_info"): + # And we only support float16 for now + if hasattr(func.attrs, "dequantize_info") and in_dtype == "float16": for arg in func.params: inp_shape = func.buffer_map[arg].shape M = inp_shape[0] diff --git a/bitblas/ops/general_matmul/__init__.py b/bitblas/ops/general_matmul/__init__.py index 16908dd41..dea4042e1 100644 --- a/bitblas/ops/general_matmul/__init__.py +++ b/bitblas/ops/general_matmul/__init__.py @@ -85,6 +85,8 @@ class MatmulConfig(OperatorConfig): None # propagate_b is a flag to control the ladder permutation ) + # TODO: This is a temporary solution to legalize the dynamic symbolic. + # Maybe we should remove this in the future. # optimize strategy, default is SingleBatchDecodeOnly optimize_stratety: Union[int, OptimizeStrategy] = OptimizeStrategy.SingleBatchDecodeOnly diff --git a/integration/BitNet/eval_correctness.py b/integration/BitNet/eval_correctness.py index 4017a6c17..6bd787535 100644 --- a/integration/BitNet/eval_correctness.py +++ b/integration/BitNet/eval_correctness.py @@ -72,18 +72,19 @@ def get_runtime(num_repeats=1): def main(): model = BitnetForCausalLM.from_pretrained( model_path, - use_flash_attention_2=True, + use_flash_attention_2=False, torch_dtype=torch.float16, ).cuda().half() - with torch.no_grad(): - model._post_process_weights() tokenizer = BitnetTokenizer.from_pretrained(model_path, use_fast=False) input_id = tokenizer("Hello")['input_ids'] input_id = torch.tensor(input_id).unsqueeze(0).cuda() - output = model(input_id) - print(output) + print("original model generated text:") + print(generate_text(model, tokenizer, "Hello", max_length=100)) + + model.quantize() + print("quantized model generated text:") print(generate_text(model, tokenizer, "Hello", max_length=100)) diff --git a/integration/BitNet/maint/create_bitblas_ckpt.py b/integration/BitNet/maint/create_bitblas_ckpt.py index 0bf603e0d..6ddb04cba 100644 --- a/integration/BitNet/maint/create_bitblas_ckpt.py +++ b/integration/BitNet/maint/create_bitblas_ckpt.py @@ -68,7 +68,7 @@ def main(): model = ( BitnetForCausalLM.from_pretrained( model_name_or_path, - use_flash_attention_2=True, + use_flash_attention_2=False, torch_dtype=torch.float16, ).cuda().half()) tokenizer = BitnetTokenizer.from_pretrained(model_name_or_path, use_fast=False) diff --git a/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh b/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh index 3ace58031..e265658ac 100755 --- a/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh +++ b/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh @@ -18,6 +18,9 @@ fi if [ -z "$SAVED_MODEL_DIR" ]; then python ./maint/create_bitblas_ckpt.py --model_name_or_path $MODEL_DIR else + if [ ! -d "$SAVED_MODEL_DIR" ]; then + mkdir -p $SAVED_MODEL_DIR + fi python ./maint/create_bitblas_ckpt.py --model_name_or_path $MODEL_DIR --saved_model_path $SAVED_MODEL_DIR fi diff --git a/integration/BitNet/requirements.txt b/integration/BitNet/requirements.txt index 7d4b14956..45952b615 100644 --- a/integration/BitNet/requirements.txt +++ b/integration/BitNet/requirements.txt @@ -1,2 +1,3 @@ lm_eval==0.3.0 flash_attn +transformers==4.40 \ No newline at end of file