diff --git a/3rdparty/tvm b/3rdparty/tvm
index 192ed5484..3c6317a1e 160000
--- a/3rdparty/tvm
+++ b/3rdparty/tvm
@@ -1 +1 @@
-Subproject commit 192ed5484db254311c8169e0291ee5ab78eaf186
+Subproject commit 3c6317a1ea614b7277ffe0b4ede18b4652afad1c
diff --git a/README.md b/README.md
index 7eeda4ab0..43f1d92d7 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,14 @@ For more detailed information on benchmark sets with other formats (NF4/FP4) and
 | **A_dtype** | **W_dtype** | **Accum_dtype** | **Out_dtype** | **BitBLAS Support** | **Tested Platform** |
 |:-----------:|:-----------:|:---------------:|:--------------------:|:-------------------:|:----------------------------------------------------:|
+| BF16 | BF16 | FP32/FP16 | FP16 | **√** | A100(SM_80)/A6000(SM_86) |
+| BF16 | FP4_E2M1 | FP32/FP16 | FP16 | **√** | A100(SM_80)/A6000(SM_86) |
+| BF16 | FP8_E4M3 | FP32/FP16 | FP16 | **√** | A100(SM_80)/A6000(SM_86) |
+| BF16 | INT8 | FP32/FP16 | FP16 | **√** | A100(SM_80)/A6000(SM_86) |
+| BF16 | UINT4/INT4 | FP32/FP16 | FP16 | **√** | A100(SM_80)/A6000(SM_86) |
+| BF16 | UINT2/INT2 | FP32/FP16 | FP16 | **√** | A100(SM_80)/A6000(SM_86) |
+| BF16 | UINT1 | FP32/FP16 | FP16 | **√** | A100(SM_80)/A6000(SM_86) |
+| BF16 | NF4 | FP32/FP16 | FP16 | **√** | A100(SM_80)/A6000(SM_86) |
 | FP16 | FP16 | FP32/FP16 | FP16 | **√** | V100(SM_70)/A100(SM_80)/A6000(SM_86)/RTX 4090(SM_89) |
 | FP16 | FP4_E2M1 | FP32/FP16 | FP16 | **√** | V100(SM_70)/A100(SM_80)/A6000(SM_86)/RTX 4090(SM_89) |
 | FP16 | FP8_E4M3 | FP32/FP16 | FP16 | **√** | V100(SM_70)/A100(SM_80)/A6000(SM_86)/RTX 4090(SM_89) |
diff --git a/VERSION b/VERSION
index 419bd5f01..511aa8188 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.0.1.dev13
\ No newline at end of file
+0.0.1.dev14
\ No newline at end of file
diff --git a/bitblas/__init__.py b/bitblas/__init__.py
index a1bc95f39..58faec4ce 100644
--- a/bitblas/__init__.py
+++ b/bitblas/__init__.py
@@ -112,4 +112,4 @@ def new_func(*args, **kwargs):
 
     return decorator
 
-__version__ = "0.0.1.dev13"
+__version__ = "0.0.1.dev14"
diff --git a/bitblas/base/utils.py b/bitblas/base/utils.py
index 92c642b68..d2168c850 100644
--- a/bitblas/base/utils.py
+++ b/bitblas/base/utils.py
@@ -20,6 +20,8 @@ import itertools
 from tvm.ir.supply import GlobalVarSupply
 from bitblas.utils import tensor_replace_dp4a, tensor_remove_make_int4, tensor_remove_make_int2
+from bitblas.utils.tensor_adapter import (
+    np_float2np_bf16,)
 import logging
 
 logger = logging.getLogger(__name__)
@@ -149,17 +151,21 @@ def map_numpy_type(intype):
 
         numpy_dtype = map_numpy_type(arg.dtype)
         if distribution == "uniform":
-            profile_tensors.append(
-                tvm.nd.array(
-                    np.random.rand(*[var_wrapper(i) for i in arg.shape]).astype(numpy_dtype),
-                    device=device,
-                ))
+            data_np = np.random.rand(*[var_wrapper(i) for i in arg.shape])
+            if arg.dtype == "bfloat16":
+                profile_tensors.append(
+                    tvm.nd.empty(data_np.shape, device=device, dtype=arg.dtype).copyfrom(
+                        np_float2np_bf16(data_np.astype(np.float32))))
+            else:
+                profile_tensors.append(tvm.nd.array(data_np.astype(numpy_dtype), device=device))
         elif distribution == "onefill":
-            profile_tensors.append(
-                tvm.nd.array(
-                    np.ones([var_wrapper(i) for i in arg.shape]).astype(numpy_dtype),
-                    device=device,
-                ))
+            data_np = np.ones([var_wrapper(i) for i in arg.shape])
+            if arg.dtype == "bfloat16":
+                profile_tensors.append(
+                    tvm.nd.empty(data_np.shape, device=device,
+                                 dtype=arg.dtype).copyfrom(np_float2np_bf16(data_np.astype(np.float32))))
+            else:
+                profile_tensors.append(tvm.nd.array(data_np.astype(numpy_dtype), device=device))
         else:
             raise ValueError("Not supported distribution: ", distribution)
     return profile_tensors
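A note for reviewers (not part of the patch): the bfloat16 branch above exists because NumPy has no native bfloat16 dtype, so the profile tensor is materialized as float32, re-encoded as uint16 bit patterns, and only then copied into a `bfloat16` TVM NDArray via `tvm.nd.empty(...).copyfrom(...)`. Below is a minimal sketch of such a float32-to-bf16 re-encoding; the helper name and the round-to-nearest-even rounding are illustrative assumptions and not necessarily what `np_float2np_bf16` implements.

```python
import numpy as np


def float32_to_bf16_bits(arr: np.ndarray) -> np.ndarray:
    """Re-encode float32 values as bfloat16 bit patterns stored in uint16.

    bfloat16 keeps the upper 16 bits of an IEEE float32; the bias term below
    performs round-to-nearest-even on the truncated mantissa.
    """
    bits = np.ascontiguousarray(arr, dtype=np.float32).view(np.uint32)
    rounding_bias = ((bits >> 16) & 1) + 0x7FFF
    return ((bits + rounding_bias) >> 16).astype(np.uint16)


# Example: 1.0 is exactly representable in bf16, so its bit pattern is the
# upper half of the float32 encoding 0x3F800000.
ones_bf16 = float32_to_bf16_bits(np.ones((16, 16), dtype=np.float32))
assert ones_bf16[0, 0] == 0x3F80
```

On the TVM side, `tvm.nd.empty(shape, dtype="bfloat16", device=device).copyfrom(...)` then reinterprets those uint16 bit patterns as bfloat16 values, which is the pattern the tuner uses in the hunk above.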
diff --git a/bitblas/builder/wrapper/tir.py b/bitblas/builder/wrapper/tir.py
index 0bedf70ed..59d63298b 100644
--- a/bitblas/builder/wrapper/tir.py
+++ b/bitblas/builder/wrapper/tir.py
@@ -18,7 +18,7 @@ class TIRCUDASourceWrapper(object):
     _TYPE_MAP = {
         "float32": "float",
         "float16": "half",
-        "bfloat16": "__nv_bfloat162",
+        "bfloat16": "__nv_bfloat16",
         "e4m3_float8": "__nv_fp8_e4m3",
         "e5m2_float8": "__nv_fp8_e5m2",
         "float64": "double",
diff --git a/bitblas/gpu/gemv_dequantize.py b/bitblas/gpu/gemv_dequantize.py
index 32c8cfbd1..9d56e1233 100644
--- a/bitblas/gpu/gemv_dequantize.py
+++ b/bitblas/gpu/gemv_dequantize.py
@@ -55,7 +55,8 @@ def check_weight_decode_info(weight_decode_info):
             conditions.append(weight_decode_info["source_format"]["bits"] in [1, 2, 4, 8])
             # check target format in ["float16", "int8"]
             conditions.append("target_format" in weight_decode_info)
-            conditions.append(weight_decode_info["target_format"] in ["float16", "int8"])
+            conditions.append(
+                weight_decode_info["target_format"] in ["float16", "bfloat16", "int8"])
             return all(conditions)
 
         if not check_weight_decode_info(weight_decode_info):
@@ -223,7 +224,8 @@ def check_weight_decode_info(weight_decode_info):
             conditions.append(weight_decode_info["source_format"]["bits"] in [1, 2, 4, 8])
             # check target format in ["float16", "int8"]
             conditions.append("target_format" in weight_decode_info)
-            conditions.append(weight_decode_info["target_format"] in ["float16", "int8"])
+            conditions.append(
+                weight_decode_info["target_format"] in ["float16", "bfloat16", "int8"])
             return all(conditions)
 
         if not check_weight_decode_info(weight_decode_info):
diff --git a/bitblas/gpu/intrin/lop3.py b/bitblas/gpu/intrin/lop3.py
index f078e7f47..466466ed9 100644
--- a/bitblas/gpu/intrin/lop3.py
+++ b/bitblas/gpu/intrin/lop3.py
@@ -1626,7 +1626,8 @@ def get_lop3_intrin_group(
     Dict[str, str]
         A dictionary mapping the names of the intrinsics to their corresponding implementations.
     """
-    assert out_dtype in ["float16", "int8"]
+    assert out_dtype in ["float16",
+                         "int8"], (f"Invalid out_dtype: {out_dtype}. Expected 'float16' or 'int8'.")
 
     dtype_mapping = {"float16": "f16", "int8": "i8", "int32": "i32"}
     target_dtype = dtype_mapping[out_dtype]
diff --git a/bitblas/gpu/matmul_analysis.py b/bitblas/gpu/matmul_analysis.py
index 36cba1969..16f33664a 100644
--- a/bitblas/gpu/matmul_analysis.py
+++ b/bitblas/gpu/matmul_analysis.py
@@ -624,7 +624,7 @@ def check_last_trait(region: List[Range]):
         # When the func is a dequantize like ops, we should consider the M
         require_block_reduce = False
         # And we only support float16 for now
-        if hasattr(func.attrs, "dequantize_info") and in_dtype == "float16":
+        if (hasattr(func.attrs, "dequantize_info") and in_dtype in ["bfloat16", "float16"]):
             for arg in func.params:
                 inp_shape = func.buffer_map[arg].shape
                 M = inp_shape[0]
@@ -690,12 +690,14 @@ def get_propagate_map(trans: bool = True, dtype="float16", matrix_name="A", inde
     )
 
     assert dtype in [
+        "bfloat16",
        "float16",
         "int8",
         "e4m3_float8",
         "e5m2_float8",
-    ], "Only support float16, int8, e4m3_float8, e5m2_float8"
-    if dtype == "float16":
+    ], "Only support bfloat16, float16, int8, e4m3_float8, e5m2_float8"
+    # TODO(lei): actually should analyze based on bits instead of dtype
+    if dtype in ["bfloat16", "float16"]:
         ldmatrix_layout = ldmatrix_32x8_to_shared_16x16_layout
         ldmatrix_layout_trans = ldmatrix_trans_32x8_to_shared_16x16_layout
     elif dtype in ["int8", "e4m3_float8", "e5m2_float8"]:
@@ -723,7 +725,7 @@ def ldmatrix_permutation_16x32_32x16_32x16(kernel_i, kernel_j):
         local_id = kernel_j % 16
         return ldmatrix_layout(thread_id, local_id)
 
-    if dtype == "float16":
+    if dtype in ["bfloat16", "float16"]:
         ldmatrix_index_map = (
             ldmatrix_trans_permutation_16x16_32x8_16x16
             if trans else ldmatrix_permutation_16x16_32x8_16x16)
@@ -732,7 +734,7 @@ def ldmatrix_permutation_16x32_32x16_32x16(kernel_i, kernel_j):
 
     ldmatrix_index_map = IndexMap.from_func(ldmatrix_index_map, index_dtype=index_dtype)
     # TODO(lei): index_dtype should be analyzed from the schedule
-    row, col = [16, 16] if dtype == "float16" else [16, 32]
+    row, col = [16, 16] if dtype in ["bfloat16", "float16"] else [16, 32]
     inversed_index_map = ldmatrix_index_map.inverse([row, col])
 
     return ldmatrix_index_map, inversed_index_map
@@ -753,12 +755,13 @@ def shared_32x16_to_mma_32x16_layout(i, j):
         return thread_id, local_id
 
     assert dtype in [
+        "bfloat16",
         "float16",
         "int8",
         "e4m3_float8",
         "e5m2_float8",
     ], "Only support float16, int8, e4m3_float8, e5m2_float8"
-    if dtype == "float16":
+    if dtype in ["bfloat16", "float16"]:
         stage3_layout = shared_32x8_to_mma_32x8_layout
     elif dtype in ["int8", "e4m3_float8", "e5m2_float8"]:
         stage3_layout = shared_32x16_to_mma_32x16_layout
@@ -782,14 +785,14 @@ def ladder_stage3_permutation_16x32_32x16_32x16_16x32(kernel_i, kernel_j):
         new_kernel_j = (new_thread_id * 16 + new_local_id) % 32
         return new_kernel_i, new_kernel_j
 
-    if dtype == "float16":
+    if dtype in ["bfloat16", "float16"]:
         stage3_index_map = ladder_stage3_permutation_16x16_32x8_32x8_16x16
     else:
         stage3_index_map = ladder_stage3_permutation_16x32_32x16_32x16_16x32
 
     stage3_index_map = IndexMap.from_func(stage3_index_map, index_dtype=index_dtype)
     # TODO(lei): index_dtype should be analyzed from the schedule
-    row, col = [16, 16] if dtype == "float16" else [16, 32]
+    row, col = [16, 16] if dtype in ["bfloat16", "float16"] else [16, 32]
     inversed_index_map = stage3_index_map.inverse([row, col])
 
     return stage3_index_map, inversed_index_map
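A note for reviewers (not part of the patch): the reason bfloat16 can simply join the float16 branches in `get_propagate_map` is hinted at by the `TODO(lei)` above — the ldmatrix/MMA index maps really depend on element width, not on the dtype name. The `[16, 16]` and `[16, 32]` tiles in the code both span 32 bytes per row, so any 16-bit type shares the float16 maps while 8-bit types share the int8 maps. A tiny sketch of that relationship (the helper name is made up for illustration):

```python
def mma_operand_tile(dtype_bits: int) -> tuple:
    """Tile shape implied by a fixed 32-byte ldmatrix row."""
    rows = 16
    cols = (32 * 8) // dtype_bits  # 32 bytes per row
    return rows, cols


assert mma_operand_tile(16) == (16, 16)  # float16 / bfloat16
assert mma_operand_tile(8) == (16, 32)   # int8 / e4m3_float8 / e5m2_float8
```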
diff --git a/bitblas/gpu/matmul_mma_dequantize.py b/bitblas/gpu/matmul_mma_dequantize.py
index e3a4fef04..b5c327190 100644
--- a/bitblas/gpu/matmul_mma_dequantize.py
+++ b/bitblas/gpu/matmul_mma_dequantize.py
@@ -1237,7 +1237,8 @@ def check_weight_decode_info(weight_decode_info):
         conditions.append(weight_decode_info["source_format"]["bits"] in [1, 2, 4, 8])
         # check target format in ["float16", "int8"]
         conditions.append("target_format" in weight_decode_info)
-        conditions.append(weight_decode_info["target_format"] in ["float16", "int8"])
+        conditions.append(
+            weight_decode_info["target_format"] in ["bfloat16", "float16", "int8"])
         return all(conditions)
 
     assert check_weight_decode_info(weight_decode_info), "Invalid B_decode_info"
diff --git a/bitblas/module/__init__.py b/bitblas/module/__init__.py
index 428814e3c..242589c7b 100644
--- a/bitblas/module/__init__.py
+++ b/bitblas/module/__init__.py
@@ -39,6 +39,25 @@ def unpack_qzeros(qzeros, bits):
     return torch.bitwise_and(unpacked_zeros + 1, 2**bits - 1)
 
 
+# For gptqv2 from gptqmodel
+def unpack_qzeros_v2(qzeros, bits):
+    qzeros = qzeros.view(torch.int32)
+    elems_per_int32 = 32 // bits
+    unpacked_zeros = torch.zeros(
+        (qzeros.shape[0], qzeros.shape[1] * elems_per_int32),
+        dtype=torch.int8,
+        device=qzeros.device,
+        requires_grad=False,
+    )
+    for col in range(unpacked_zeros.shape[1]):
+        i = col % elems_per_int32
+        unpacked_zeros[:, col] = (qzeros[:, col // elems_per_int32] >> (bits * i))
+
+    # See AutoGPTQ qlinear_cuda_old.py line 303 for the original unpacking logic.
+    # NOTE: Unlike unpack_qzeros (v1), the gptqmodel v2 layout stores zeros without the +1 offset, so none is applied here.
+    return torch.bitwise_and(unpacked_zeros, 2**bits - 1)
+
+
 def unpack_qweight(qweight, bits):
     qweight = qweight.view(torch.int8)
     elems_per_int8 = 8 // bits
@@ -318,6 +337,31 @@ def repack_from_gptq(self, gptq_module):
         if self.bias is not None:
             self.bias = gptq_module.bias.data.to(torch.float16).contiguous()
 
+    def repack_from_gptq_v2(self, gptq_module):
+        # qweight in gptq old quant linear stored with (out_features, in_features), should be transposed.
+        qweight = gptq_module.qweight.T.contiguous().view(self.TORCH_STORAGE_DTYPE)
+        intweight = unpack_qweight(qweight, self.bits).contiguous()
+        if self.bitblas_matmul.weight_transform is not None:
+            qweight = self.bitblas_matmul.weight_transform(intweight.cpu()).cuda()
+        self.qweight = qweight
+        # scales in gptq old quant linear stored with (in_features // group_size, out_features), should be transposed.
+        scales = gptq_module.scales.T.contiguous().view(self.torch_dtype)
+        self.scales = scales
+        # qzeros should be dequantized to int zeros.
+        intzeros = unpack_qzeros_v2(gptq_module.qzeros, self.bits).T.contiguous()
+        if self.bitblas_matmul.config.zeros_mode == "original":
+            self.zeros = intzeros.to(torch.float16).contiguous()
+        elif self.bitblas_matmul.config.zeros_mode == "rescale":
+            self.zeros[:, :] = intzeros.to(torch.float16)[:, :] * self.scales[:, :]
+        elif self.bitblas_matmul.config.zeros_mode == "quantized":
+            self.zeros = (
+                torch.Tensor(general_compress(intzeros.T.contiguous().cpu().numpy(), self.bits)).to(
+                    self.qweight.device).to(self.zeros.dtype).contiguous())
+        else:
+            raise ValueError(f"Unsupported zeros type: {self.bitblas_matmul.config.zeros_mode}")
+        if self.bias is not None:
+            self.bias = gptq_module.bias.data.to(torch.float16).contiguous()
+
     @property
     def consistent(self):
         return self.is_consitent
diff --git a/bitblas/ops/general_matmul/__init__.py b/bitblas/ops/general_matmul/__init__.py
index dea4042e1..2945996df 100644
--- a/bitblas/ops/general_matmul/__init__.py
+++ b/bitblas/ops/general_matmul/__init__.py
@@ -30,6 +30,7 @@
     ("float64", "float64"),
     ("float32", "float32"),
     ("float16", "float16"),
+    ("bfloat16", "bfloat16"),
     ("int8", "int8"),
     ("e4m3_float8", "e4m3_float8"),
     ("e4m3_float8", "e5m2_float8"),
@@ -140,7 +141,7 @@ def __initialize_propagate(self, propagate_a: Optional[TransformKind],
 
         # TODO(lei): This is a limitation arose by pytorch and llvm
         # Should be removed in the future.
-        if self.A_dtype in ["e4m3_float8", "e5m2_float8"]:
+        if self.A_dtype in ["e4m3_float8", "e5m2_float8", "bfloat16"]:
             object.__setattr__(self, "propagate_a", TransformKind.NonTransform)
             object.__setattr__(self, "propagate_b", TransformKind.NonTransform)
 
@@ -159,6 +160,9 @@ def is_not_fast_decoding_supported():
             # if the w_dtype is int4/uint4 and the a_dtype is int8
             # we do not require fast decoding
             conditions.append(self.W_dtype in ["int4", "uint4"] and self.A_dtype in ["int8"])
+            # fast decoding is not supported for bfloat16 activations yet
+            # TODO(lei): implement a bfloat16 fast-decoding path to improve performance
+            conditions.append(self.A_dtype == "bfloat16")
             return any(conditions)
 
         if fast_decoding is not None:
@@ -214,6 +218,7 @@ def __post_init__(self):
 
         if self.A_dtype == self.W_dtype and self.W_dtype in [
                 "float16",
+                "bfloat16",
                 "int8",
                 "e4m3_float8",
                 "e5m2_float8",
@@ -228,6 +233,7 @@ class Matmul(Operator):
         "float64": ("fp", 64),
         "float32": ("fp", 32),
         "float16": ("fp", 16),
+        "bfloat16": ("bf", 16),
         "int32": ("int", 32),
         "uint32": ("uint", 32),
         "int16": ("int", 16),
@@ -260,8 +266,13 @@ def __init__(
         if target is None:
             target = auto_detect_nvidia_target()
             logger.info(f"Auto detected target: {target}")
+
+        assert (config.A_dtype in self.BITBLAS_TRICK_DTYPE_MAP), f"Unsupported input dtype {config.A_dtype}"
+
+        assert (config.W_dtype
+                in self.BITBLAS_TRICK_DTYPE_MAP), f"Unsupported weight dtype {config.W_dtype}"
+
         source_format, bit = self.BITBLAS_TRICK_DTYPE_MAP[config.W_dtype]
 
         self.source_format = source_format
diff --git a/bitblas/ops/ladder_permutate/ladder_permutate_impl.py b/bitblas/ops/ladder_permutate/ladder_permutate_impl.py
index 44d368a1a..5d6d8f981 100644
--- a/bitblas/ops/ladder_permutate/ladder_permutate_impl.py
+++ b/bitblas/ops/ladder_permutate/ladder_permutate_impl.py
@@ -12,7 +12,7 @@ def select_implementation(
     M: int,
     N: int,
-    datatype: Literal["float16", "int8", "e4m3_float8", "e5m2_float8"] = "float16",
+    datatype: Literal["float16", "bfloat16", "int8", "e4m3_float8", "e5m2_float8"] = "float16",
     dequantize_bits: int = -1,
     storage_dtype: Literal["float16", "int8", "uint8", "int32", "uint32"] = "float16",
     propagate_kind: Literal["A", "B"] = "B",
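A note for reviewers (not part of the patch): a usage sketch of what the `general_matmul` changes above enable. It mirrors the `MatmulConfig`/`Matmul` example from the README with the activation dtype swapped to bfloat16; the shapes, the uint4 weight choice, and the float32 accumulation are illustrative assumptions taken from the new README support rows, not a tested configuration.

```python
import bitblas
import torch

config = bitblas.MatmulConfig(
    M=1,
    N=1024,
    K=1024,
    A_dtype="bfloat16",   # newly accepted by this change set
    W_dtype="uint4",
    accum_dtype="float32",
    out_dtype="float16",
    layout="nt",
    with_bias=False,
)
matmul = bitblas.Matmul(config=config)

# Quantized weight in its integer form, repacked into the BitBLAS layout.
weight = torch.randint(0, 16, (1024, 1024), dtype=torch.int8).cuda()
weight_bitblas = matmul.transform_weight(weight)

activation = torch.rand(1, 1024, dtype=torch.bfloat16).cuda()
output = matmul(activation, weight_bitblas)  # expected shape (1, 1024), float16
```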
"B"] = "B", diff --git a/bitblas/utils/tensor_adapter.py b/bitblas/utils/tensor_adapter.py index d4d052dbb..5dbcb1663 100644 --- a/bitblas/utils/tensor_adapter.py +++ b/bitblas/utils/tensor_adapter.py @@ -91,11 +91,11 @@ def tvm_tensor_to_torch(tensor: Union[tvm.te.Tensor, tvm.nd.NDArray]): else: raise RuntimeError("Not supported type: ", type(tensor)) + def lazy_tvm_tensor_to_torch(tensor: Union[tvm.te.Tensor, tvm.nd.NDArray]): # It additionally needs the ctypes type as torch type def as_tensor(address, shape, elems_inbytes, torch_type): - arr = (ctypes.c_int8 * elems_inbytes).from_address( - address) + arr = (ctypes.c_int8 * elems_inbytes).from_address(address) return torch.frombuffer(arr, dtype=torch_type).view(*shape) if isinstance(tensor, tvm.nd.NDArray): @@ -110,11 +110,11 @@ def as_tensor(address, shape, elems_inbytes, torch_type): else: raise RuntimeError("Not supported type: ", type(tensor)) + def lazy_torch_to_tvm_tensor(tensor): # It additionally needs the ctypes type as torch type def as_tensor(address, shape, elems_inbytes, numpy_type): - arr = (ctypes.c_int8 * elems_inbytes).from_address( - address) + arr = (ctypes.c_int8 * elems_inbytes).from_address(address) return np.frombuffer(arr, dtype=numpy_type).reshape(shape) if isinstance(tensor, torch.Tensor): @@ -122,9 +122,24 @@ def as_tensor(address, shape, elems_inbytes, numpy_type): shape = tensor.shape torch_dtype = tensor.dtype numpy_dtype = str(torch_dtype).replace("torch.", "") - num_elems_inbytes = prod(shape) * tensor.itemsize + num_elems_inbytes = prod(shape) * tensor.itemsize np_tensor = as_tensor(data_ptr, shape, num_elems_inbytes, numpy_dtype) tvm_tensor = tvm.nd.array(np_tensor) return tvm_tensor else: raise RuntimeError("Not supported type: ", type(tensor)) + + +def np_float2np_bf16(arr): + """Convert a numpy array of float to a numpy array + of bf16 in uint16""" + orig = arr.view("