In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from time import time
import deepspeed


model = AutoModelForCausalLM.from_pretrained(model_path,
    device_map="auto",
  torch_dtype=torch.float16,
  offload_folder="offload",
  trust_remote_code=True,
  low_cpu_mem_usage=True)
    

def optimize_and_run_mistral(model_path, input_tokens=128, output_tokens=128, model_parallel_size=2):
    

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Load model in chunks
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    

    # Cache (stores last output for repeated prompts)
    cached_output = None

    while True:
        # Get user prompt
        prompt = input("Enter your prompt (or 'quit' to exit): ")

        if prompt.lower() == "quit":
            break

        # Tokenize input
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

        # Inference
        start_time = time()
        if cached_output is not None and prompt == cached_prompt:
            # Use cached output if prompt matches
            output_ids = cached_output
        else:
            with torch.inference_mode():
                # Generate model output
                output_ids = model.generate(input_ids, max_length=output_tokens, use_cache=True)

        # Decode and print output
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"\nModel Response: {output_text}")

        # Calculate and print performance metrics
        inference_time = time() - start_time
        throughput = (input_tokens + output_tokens) / inference_time
        print(f"\nInference Latency: {inference_time:.3f} seconds")
        print(f"Throughput: {throughput:.3f} tokens/second")

        cached_prompt = prompt
        cached_output = output_ids

#prefer a sharded version or shard before inputting as a technique for efficient cpu/gpu usage
if __name__ == "__main__":
    model_path = "alexsherstinsky/Mistral-7B-v0.1-sharded"  # Replace with your desired model from huggingface
    model_parallel_size = 2  # Set the model parallel size (number of GPUs) for efficient loading
    optimize_and_run_mistral(model_path, model_parallel_size=model_parallel_size)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Enter your prompt (or 'quit' to exit):  heights


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
2024-03-07 17:58:29.694859: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-07 17:58:29.694970: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-07 17:58:29.857435: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Model Response: heights = [160, 185, 170, 193, 176, 180, 178]

# 1. 평균 구하기
print(sum(heights) / len(heights))

# 2. 최대값 구하기
print(max(heights))

# 3. 최소값 구하기
print(min(heights))

# 4. 중앙값 구하기

Inference Latency: 20.859 seconds
Throughput: 12.273 tokens/second


Enter your prompt (or 'quit' to exit):  firefighter


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Model Response: firefighter
    Joined: Mon Jan 21, 2013 11:00 pm

### Re: The Official "I'm Going to Law School" Thread

> *Jay wrote:*
>
> > *firefighter wrote:*
> >
> > > *Jay wrote:*
> > >
> > > > *firefighter wrote:*
> > > >
> > > > > *Jay wrote:*
> > > > >
> > > > > > *firefighter wrote

Inference Latency: 7.675 seconds
Throughput: 33.355 tokens/second


Enter your prompt (or 'quit' to exit):  best


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Model Response: best-selling author, speaker, and coach

# About

## About

### About

##### I’m a best-selling author, speaker, and coach.

I’m a best-selling author, speaker, and coach. I’m also a wife, mother, and grandmother. I’m a former teacher, and I’ve been a student of personal development for over 30 years.

I’ve been a student of personal development for over 30 years. I’ve been a student of personal development for over 30 years. I’

Inference Latency: 7.851 seconds
Throughput: 32.607 tokens/second


Enter your prompt (or 'quit' to exit):  quit
