diff --git a/3rdparty/tvm b/3rdparty/tvm
index 07648907e..240802497 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 07648907e1678ec2b84d8ec579b2ec8f4925d218
+Subproject commit 2408024972b9199683491871329829d98b59dc5c
diff --git a/bitblas/gpu/matmul_analysis.py b/bitblas/gpu/matmul_analysis.py
index 1d0889fa3..36cba1969 100644
--- a/bitblas/gpu/matmul_analysis.py
+++ b/bitblas/gpu/matmul_analysis.py
@@ -623,7 +623,8 @@ def check_last_trait(region: List[Range]):
         # Currently, we only support block reduction depth 2 for small M
         # When the func is a dequantize like ops, we should consider the M
         require_block_reduce = False
-        if hasattr(func.attrs, "dequantize_info"):
+        # Block reduction is also limited to a float16 in_dtype for now.
+        if hasattr(func.attrs, "dequantize_info") and in_dtype == "float16":
             for arg in func.params:
                 inp_shape = func.buffer_map[arg].shape
                 M = inp_shape[0]
diff --git a/bitblas/ops/general_matmul/__init__.py b/bitblas/ops/general_matmul/__init__.py
index 16908dd41..dea4042e1 100644
--- a/bitblas/ops/general_matmul/__init__.py
+++ b/bitblas/ops/general_matmul/__init__.py
@@ -85,6 +85,8 @@ class MatmulConfig(OperatorConfig):
         None  # propagate_b is a flag to control the ladder permutation
     )
 
+    # TODO: The optimize strategy below is a temporary solution to legalize
+    # dynamic symbolic shapes; we may want to remove it in the future.
     # optimize strategy, default is SingleBatchDecodeOnly
     optimize_stratety: Union[int, OptimizeStrategy] = OptimizeStrategy.SingleBatchDecodeOnly
 
diff --git a/integration/BitNet/eval_correctness.py b/integration/BitNet/eval_correctness.py
index 4017a6c17..6bd787535 100644
--- a/integration/BitNet/eval_correctness.py
+++ b/integration/BitNet/eval_correctness.py
@@ -72,18 +72,19 @@ def get_runtime(num_repeats=1):
 def main():
     model = BitnetForCausalLM.from_pretrained(
         model_path,
-        use_flash_attention_2=True,
+        use_flash_attention_2=False,
         torch_dtype=torch.float16,
     ).cuda().half()
-    with torch.no_grad():
-        model._post_process_weights()
 
     tokenizer = BitnetTokenizer.from_pretrained(model_path, use_fast=False)
     input_id = tokenizer("Hello")['input_ids']
     input_id = torch.tensor(input_id).unsqueeze(0).cuda()
-    output = model(input_id)
-    print(output)
 
+    print("original model generated text:")
+    print(generate_text(model, tokenizer, "Hello", max_length=100))
+
+    model.quantize()
+    print("quantized model generated text:")
     print(generate_text(model, tokenizer, "Hello", max_length=100))
 
 
diff --git a/integration/BitNet/maint/create_bitblas_ckpt.py b/integration/BitNet/maint/create_bitblas_ckpt.py
index 0bf603e0d..6ddb04cba 100644
--- a/integration/BitNet/maint/create_bitblas_ckpt.py
+++ b/integration/BitNet/maint/create_bitblas_ckpt.py
@@ -68,7 +68,7 @@ def main():
     model = (
         BitnetForCausalLM.from_pretrained(
             model_name_or_path,
-            use_flash_attention_2=True,
+            use_flash_attention_2=False,
             torch_dtype=torch.float16,
         ).cuda().half())
     tokenizer = BitnetTokenizer.from_pretrained(model_name_or_path, use_fast=False)
diff --git a/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh b/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh
index 3ace58031..e265658ac 100755
--- a/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh
+++ b/integration/BitNet/maint/generate_bitnet_model_bitblas_format.sh
@@ -18,6 +18,9 @@ fi
 if [ -z "$SAVED_MODEL_DIR" ]; then
   python ./maint/create_bitblas_ckpt.py --model_name_or_path $MODEL_DIR
 else
+  if [ ! -d "$SAVED_MODEL_DIR" ]; then
+    mkdir -p "$SAVED_MODEL_DIR"  # quoted: a path with spaces must stay one directory
+  fi
   python ./maint/create_bitblas_ckpt.py --model_name_or_path $MODEL_DIR --saved_model_path $SAVED_MODEL_DIR
 fi
 
diff --git a/integration/BitNet/requirements.txt b/integration/BitNet/requirements.txt
index 7d4b14956..45952b615 100644
--- a/integration/BitNet/requirements.txt
+++ b/integration/BitNet/requirements.txt
@@ -1,2 +1,3 @@
 lm_eval==0.3.0
 flash_attn
+transformers==4.40
\ No newline at end of file