In [None]:
!pip install huggingface_hub transformers accelerate diffusers

In [None]:
!pip install tensorizer

In [None]:
import torch
from tensorizer import TensorSerializer
from transformers import AutoModelForCausalLM

# Serialize a Hugging Face causal-LM checkpoint to a local tensorizer file.
model_ref = "NousResearch/Llama-2-7b-hf"
# For less intensive requirements, swap above with the line below:
# model_ref = "EleutherAI/gpt-neo-125M"
model_name = model_ref.split("/")[-1]
# Optional S3 destination. It is NOT used below: this cell writes to the local
# file "model1.tensors". Uncomment and set your bucket if you adapt the cell.
#s3_bucket = "bucket"
#s3_uri = f"s3://{s3_bucket}/{model_name}.tensors"

model = AutoModelForCausalLM.from_pretrained(
    model_ref,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
    use_safetensors=True
)

serializer = TensorSerializer("model1.tensors")
try:
    serializer.write_module(model)
finally:
    # Always release the output handle, even if writing the module fails.
    serializer.close()

In [None]:
import time
import torch
from tensorizer import TensorDeserializer
from tensorizer.utils import no_init_or_tensor, convert_bytes, get_mem_usage

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# Load the tensors written by the serialization cell into an uninitialized
# model, report load throughput and memory usage, then run a short generation.
model_ref = "NousResearch/Llama-2-7b-hf"
# To run this at home, swap this with the line below for a smaller example:
# model_ref = "EleutherAI/gpt-neo-125M"
model_name = model_ref.split("/")[-1]
# Local tensorizer file to load (the one produced by the serialization cell).
load_loc = "model1.tensors"

config = AutoConfig.from_pretrained(model_ref)

# This ensures that the model is not initialized.
with no_init_or_tensor():
    model = AutoModelForCausalLM.from_config(config)

before_mem = get_mem_usage()

# Load the tensors from load_loc into the model. perf_counter() is used
# because it is monotonic and high-resolution, unlike the wall clock.
start = time.perf_counter()
deserializer = TensorDeserializer(load_loc, plaid_mode=True)
try:
    deserializer.load_into_module(model)
    end = time.perf_counter()

    # Brag about how fast we are.
    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
    duration = end - start
    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
    after_mem = get_mem_usage()
finally:
    # Always release the deserializer, even if loading fails part-way.
    deserializer.close()
print(f"Deserialized {total_bytes_str} in {duration:0.2f}s, {per_second}/s")
print(f"Memory usage before: {before_mem}")
print(f"Memory usage after: {after_mem}")

# Tokenize and generate
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_ref)
eos = tokenizer.eos_token_id
# Prompt typo fixed ("Presindent" -> "President") to match the profiling cell.
input_ids = tokenizer.encode(
    "Hello, Who is President of USA?", return_tensors="pt"
).to("cuda")

with torch.no_grad():
    output = model.generate(
        input_ids, max_new_tokens=50, do_sample=True, pad_token_id=eos
    )

print(f"Output: {tokenizer.decode(output[0], skip_special_tokens=True)}")

In [None]:
# MEMORY USAGE and CPU USAGE
import cProfile
import psutil
import time
import csv
import re
import torch
from tensorizer import TensorDeserializer
from tensorizer.utils import no_init_or_tensor, convert_bytes, get_mem_usage, get_gpu_name
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

import matplotlib.pyplot as plt
import seaborn as sns
import re

def load_and_run_model(csv_filename, model_ref="/runpod-volume/model-13b", load_loc="model2.tensors"):
    """
    Load serialized tensors into a model, record memory usage, and generate text.

    Builds the model from its config without initializing weights, loads the
    tensors from ``load_loc``, prints throughput and memory usage, hands the
    memory string to ``plot_memory_usage`` and finally runs one sampled
    generation on a fixed prompt.

    :param csv_filename: CSV file that ``plot_memory_usage`` writes the parsed
        memory figures to. (Previously ignored in favour of a hard-coded name.)
    :param model_ref: Path or hub id used for the config and the tokenizer.
    :param load_loc: Location of the tensorizer file to load.
    """
    model_name = model_ref.split("/")[-1]

    config = AutoConfig.from_pretrained(model_ref)

    with no_init_or_tensor():
        model = AutoModelForCausalLM.from_config(config)

    before_mem = get_mem_usage()

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments during a long load.
    start = time.perf_counter()
    deserializer = TensorDeserializer(load_loc, plaid_mode=True)
    try:
        deserializer.load_into_module(model)
        end = time.perf_counter()

        total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
        duration = end - start
        per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
        after_mem = f"Python Used RAM: {get_mem_usage()}"
    finally:
        # Always release the deserializer, even if loading fails part-way.
        deserializer.close()

    print(f"Deserialized {total_bytes_str} in {duration:0.2f}s, {per_second}/s")
    print(f"Memory usage before: {before_mem}")
    print(f"Memory usage after: {after_mem}")

    # Honour the caller-supplied CSV path instead of a hard-coded file name.
    plot_memory_usage(after_mem, duration, model_name, get_gpu_name(), csv_filename)

    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_ref)
    eos = tokenizer.eos_token_id
    input_ids = tokenizer.encode(
        "Hello, Who is President of USA?", return_tensors="pt"
    ).to("cuda")

    with torch.no_grad():
        output = model.generate(
            input_ids, max_new_tokens=50, do_sample=True, pad_token_id=eos
        )

    print(f"Output: {tokenizer.decode(output[0], skip_special_tokens=True)}")
    
def plot_memory_usage(data_str, total_loading_time, input_directory, gpu_name, csv_filename):
    """
    Plots memory usage and model details and saves them to a CSV file.

    Every ``<number>MiB`` figure found in ``data_str`` is extracted in order of
    appearance; the first seven are paired with fixed category labels. The
    pairs are written to ``csv_filename`` (overwriting it), drawn as a bar
    chart, saved to ``memory_usage.png`` and shown.

    :param data_str: String containing memory usage data.
    :param total_loading_time: Total time taken to load the model.
    :param input_directory: Label for where the model is loaded from; written
        verbatim to the CSV and the chart annotation.
    :param gpu_name: Name of the GPU used.
    :param csv_filename: Filename to save the CSV data.
    """

    # Accept any number of thousands separators ("1,234,567MiB"). The previous
    # single-comma pattern matched only the tail of such a number ("234,567").
    parsed_values = [
        int(number.replace(',', ''))
        for number in re.findall(r'(\d[\d,]*)MiB', data_str)
    ]

    # Labels are assigned purely by position in the string.
    # NOTE(review): this assumes data_str lists exactly these seven figures in
    # this order; if the memory string carries extra figures (e.g. "current/max"
    # pairs) the later labels will be attached to the wrong numbers — confirm
    # against the format produced by the caller.
    all_categories = ['CPU Maxrss', 'CPU Free', 'GPU Used', 'GPU Free', 'GPU Total', 'Torch Reserved', 'Torch Allocated']
    values = parsed_values[:len(all_categories)]
    # If fewer figures were found than labels, keep only the labels that have
    # a value so the bar chart does not fail on mismatched lengths.
    categories = all_categories[:len(values)]

    # Saving data to CSV
    with open(csv_filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Category', 'Memory (MiB)', 'Total Loading Time (s)', 'Input Directory', 'GPU Name'])
        for category, value in zip(categories, values):
            csvwriter.writerow([category, value, total_loading_time, input_directory, gpu_name])

    # Optional: Use Seaborn for better styling
    sns.set_theme()

    # Build the palette for the full label set and slice it, so each category
    # keeps the same colour no matter how many values were parsed.
    palette = sns.color_palette("viridis", len(all_categories))[:len(categories)]

    # Creating the bar chart
    plt.figure(figsize=(14, 8))
    plt.bar(categories, values, color=palette)

    # Adding titles, labels, and annotations
    plt.title('Memory Usage Metrics and Model Details')
    plt.ylabel('Memory (MiB)')
    plt.xlabel('Categories')

    # Annotations in the top right corner
    additional_info = f"Total Loading Time: {total_loading_time} s\nInput Directory: {input_directory}\nGPU: {gpu_name}"
    plt.annotate(additional_info, xy=(0.95, 0.95), xycoords='axes fraction',
                 ha='right', va='top', fontsize=12, bbox=dict(boxstyle="round,pad=0.3", edgecolor="#cccccc", facecolor="#ffffff"))

    # Save and show the plot
    plt.savefig("memory_usage.png")
    plt.show()
def profile_function(csv_filename):
    """
    Run ``load_and_run_model`` under cProfile and dump the collected stats.

    The stats are written to ``model_operations_profiling.prof``. The profiler
    is disabled and the stats are dumped even when the profiled call raises,
    so a failed run still leaves a profile to inspect; the exception then
    propagates to the caller.

    :param csv_filename: Forwarded unchanged to ``load_and_run_model``.
    """
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        load_and_run_model(csv_filename)
    finally:
        profiler.disable()
        profiler.dump_stats('model_operations_profiling.prof')

def monitor_resources(duration=20, interval=1):
    """
    Collect CPU usage samples for roughly ``duration`` seconds.

    :param duration: Length of the sampling window in seconds. A value of
        zero or less returns an empty list without sampling.
    :param interval: Passed to ``psutil.cpu_percent`` as its ``interval``
        argument for every sample.
    :return: List of the values returned by ``psutil.cpu_percent``, in
        sampling order.
    """
    cpu_usage = []
    # Use the monotonic clock so a wall-clock adjustment during sampling
    # cannot shorten or lengthen the window.
    start_time = time.monotonic()
    # The deadline is only checked between samples, so the last sample may
    # finish after the window has elapsed.
    while time.monotonic() - start_time < duration:
        cpu_usage.append(psutil.cpu_percent(interval=interval))
    return cpu_usage

def save_to_csv(data, filename, headers):
    """
    Write ``data`` to ``filename`` as a two-column CSV.

    The file is overwritten. The first row is ``headers``; each following row
    holds the zero-based position of an item in ``data`` and the item itself.

    :param data: Iterable of values to record, one per row.
    :param filename: Path of the CSV file to create.
    :param headers: Sequence written as the header row.
    """
    with open(filename, 'w', newline='') as out_file:
        rows = csv.writer(out_file)
        rows.writerow(headers)
        rows.writerows([position, sample] for position, sample in enumerate(data))

if __name__ == "__main__":
    # Local import: only this entry point needs a thread pool.
    from concurrent.futures import ThreadPoolExecutor

    csv_filename = 'tensorhhh1.csv'

    # Initialize CSV file with headers for memory usage logging
    with open(csv_filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Model Name', 'Memory Before (MiB)', 'Memory After (MiB)', 'Total Bytes', 'Loading Duration (s)'])

    # Sample CPU usage in a background thread WHILE the profiled workload
    # runs. Starting the monitor only after profiling had finished recorded
    # the CPU usage of an idle process instead of the model load.
    with ThreadPoolExecutor(max_workers=1) as sampler:
        cpu_future = sampler.submit(monitor_resources)
        profile_function(csv_filename)
        # Blocks until the sampling window (monitor_resources' default
        # duration) has elapsed, if it has not already.
        cpu_usage_data = cpu_future.result()

    save_to_csv(cpu_usage_data, 'cpu_usage_profile.csv', ['Time (s)', 'CPU Usage (%)'])

    print("Profiling and resource monitoring complete.")

