In [1]:
from pathlib import Path

# Directory layout. BASE is the fully resolved parent of the current
# working directory; everything else is derived from it.
THIS = Path(".")
BASE = (THIS / "..").absolute().resolve()

# Name of the checkpoint directory, reused for both the model and export trees.
MODEL_NAME = "Llama-2-7b-hf"

# <BASE>/model/<MODEL_NAME>
MODEL = BASE / "model"
MODEL_PATH = MODEL / MODEL_NAME

# <BASE>/exports/<MODEL_NAME>
EXPORTS = BASE / "exports" / MODEL_NAME


## Quantization

## OmniQuant

In [2]:
# Sub-path (relative to EXPORTS) that holds this method's artifacts.
METHOD = "quant/omniquant"
METHOD_EXPORTS = EXPORTS.joinpath(METHOD)


In [3]:
# Echo the resolved locations so a wrong working directory is easy to spot.
for label, location in (
    ("BASE", BASE),
    ("MODEL", MODEL),
    ("MODEL_PATH", MODEL_PATH),
    ("METHOD_EXPORTS", METHOD_EXPORTS),
):
    print(f"{label}:", location)

BASE: /home/shwu/LLM-Efficiency-Survey
MODEL: /home/shwu/LLM-Efficiency-Survey/model
MODEL_PATH: /home/shwu/LLM-Efficiency-Survey/model/Llama-2-7b-hf
METHOD_EXPORTS: /home/shwu/LLM-Efficiency-Survey/exports/Llama-2-7b-hf/quant/omniquant


In [None]:
# On-disk weight size of every export directory under METHOD_EXPORTS, followed
# by the original checkpoint at MODEL_PATH for comparison.
# Path.iterdir() yields entries in arbitrary order, so the export directories
# are sorted to keep the listing stable between runs; MODEL_PATH stays last.
omniquant_models = sorted(x for x in METHOD_EXPORTS.iterdir() if x.is_dir())
omniquant_models.append(MODEL_PATH)

# AWQ_EXPORTS=EXPORTS / "quant/awq"
# awq_models = [x for x in AWQ_EXPORTS.iterdir() if x.is_dir()]
# omniquant_models.extend(awq_models)

for model in omniquant_models:
    # model_size := total bytes of the *.safetensors files directly inside the
    # directory. Other weight formats and nested files are NOT counted, so a
    # directory without top-level safetensors files is reported as 0.00 GiB.
    model_size = sum(x.stat().st_size for x in model.glob("*.safetensors"))
    print(f"{model.name}: {model_size / (1024 ** 3):.2f} GiB")

In [None]:
# max mem usage
import pandas as pd
import matplotlib.pyplot as plt
# Every CSV sitting directly in METHOD_EXPORTS; only those whose file name
# contains "bench" are plotted below.
all_csvs = list(METHOD_EXPORTS.glob("*.csv"))

# Column names as they appear in the CSV header:
#   utilization.gpu [%], utilization.memory [%], memory.used [MiB], memory.free [MiB], memory.total [MiB]
# Every column after the first keeps the space that follows the comma.
utlization_gpu_key = "utilization.gpu [%]"
utilization_memory_key = " utilization.memory [%]"
memory_used_key = " memory.used [MiB]"
memory_free_key = " memory.free [MiB]"
memory_total_key = " memory.total [MiB]"

for csv_path in (p for p in all_csvs if "bench" in p.name):
    df = pd.read_csv(csv_path)

    # The row index is scaled by 0.1 to build the time axis
    # (presumably a 100 ms sampling interval -- TODO confirm against the logger).
    elapsed = df.index * 0.1
    gpu_util = df[utlization_gpu_key]
    mem_util = df[utilization_memory_key]
    mem_used = df[memory_used_key]
    mem_used_pct = (mem_used / df[memory_total_key]) * 100

    # One wide figure per trace with all three percentage curves on a shared y-axis.
    fig, ax = plt.subplots(figsize=(20, 4))
    # Title is whatever follows "usage" in the file name, minus one separator
    # character and the 4-character extension.
    ax.set_title(csv_path.name.split('usage')[-1][1:-4])
    ax.set_ylabel("Stats [%]")
    ax.set_xlabel("Time [s]")

    ax.plot(elapsed, gpu_util, label=f"utilization.compute [max = {gpu_util.max()} %]")
    ax.plot(elapsed, mem_util, label=f"utilization.memory [max = {mem_util.max()} %]")
    ax.plot(elapsed, mem_used_pct, label=f"memory [max = {mem_used.max() / 1024:.2f} GiB]")
    ax.legend(loc="upper left")

In [44]:
# Perplexity logs: every *.txt file below METHOD_EXPORTS/ppl/.
# The name of the directory that contains a log (e.g. "w4a16g128") is used as
# the configuration label, and the last whitespace-separated token of the
# log's final non-blank line is parsed as the perplexity.


def _read_final_ppl(log_path):
    """Return the perplexity stored at the end of ``log_path``.

    The value is the last whitespace-separated token on the last non-blank
    line, so a trailing newline or blank line at the end of the log does not
    break parsing.

    Raises:
        ValueError: if the file has no non-blank line, or the token is not a
            valid float.
    """
    with open(log_path) as f:
        non_blank = [line for line in f if line.strip()]
    if not non_blank:
        raise ValueError(f"{log_path}: perplexity log is empty")
    return float(non_blank[-1].split()[-1])


# glob() order is arbitrary; sort so the printed listing is reproducible.
ppl_logs = sorted(METHOD_EXPORTS.glob("ppl/**/*.txt"))
data = {}
for log in ppl_logs:
    # Path.parent.name instead of splitting the string on "/" keeps this
    # independent of the platform's path separator.
    method = log.parent.name
    ppl = _read_final_ppl(log)
    print(f"{method}: {ppl}")
    # NOTE: several logs in the same directory share a label; the last one wins.
    data[method] = ppl

w3a16g128: 6.656012771453326
w4a16g128: 5.977879409190858


In [45]:
import numpy as np
import pandas as pd
# Summary table: perplexity per configuration plus its increase over the
# FP16 baseline.
# NOTE(review): the origin of the 5.85 baseline figure is not shown in this
# notebook -- confirm it matches the evaluation setup used for the logs.
BASELINE_LABEL = "baseline FP16"
BASELINE_PPL = 5.85

# Kept as an in-place update so `data` still contains the baseline afterwards;
# re-running the cell is harmless because the same key is overwritten.
data[BASELINE_LABEL] = BASELINE_PPL

df = pd.DataFrame(sorted(data.items()), columns=["method", "ppl"])
# Subtract the baseline explicitly rather than min(ppl): if a quantized
# configuration ever scores below the baseline, min() would silently shift
# the reference point and misreport every delta.
df["+ppl"] = df["ppl"] - BASELINE_PPL
df


Unnamed: 0,method,ppl,+ppl
0,baseline FP16,5.85,0.0
1,w3a16g128,6.656013,0.806013
2,w4a16g128,5.977879,0.127879
