In [None]:
!vllm serve meta-llama/Llama-3.1-8B --max-model-len 8192

In [41]:
import requests
import json
from IPython.display import display, Markdown

def vLLM_llama3_stream(prompt, max_tokens=2048):
    API_URL = "http://localhost:8000/v1/completions"

    # Manual instruction since we're using /completions
    system_instruction = "You are an NLP expert and teacher."
    full_prompt = f"{system_instruction}\n\n{prompt}"

    payload = {
        "model": "meta-llama/Llama-3.1-8B",
        "prompt": full_prompt,
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "stream": True
    }

    response = requests.post(API_URL, json=payload, stream=True)

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None

    full_response = ""

    print("📝 Generating response...\n")

    for line in response.iter_lines():
        if not line:
            continue
        
        try:
            decoded_line = line.decode('utf-8')

            if decoded_line.startswith("data: "):
                decoded_line = decoded_line[6:]

            if decoded_line.strip() == "[DONE]":
                break

            data = json.loads(decoded_line)

            chunk_text = data.get("choices", [{}])[0].get("text", "")

            print(chunk_text, end='', flush=True)

            full_response += chunk_text
        
        except json.JSONDecodeError as e:
            print(f"\n⚠️ Skipping invalid line: {line}\nError: {e}")

    print("\n\n✅ Generation complete!")

    display(Markdown(f"**Final Response:**\n\n{full_response.strip()}"))

    return full_response

In [18]:
from IPython.display import Markdown, display

In [43]:
# Define your prompt here
prompt = """
What are some of the challenges faced by Transformer models in NLP, and how are they addressed? Explain in detail.
"""

# Run the function
response = vLLM_llama3_stream(prompt)

📝 Generating response...

Transformer models are

 very powerful and have demonstrated their ability to handle a wide range of NLP tasks. However, there are still some challenges that need to be addressed. One of the main challenges is the computation cost. The Transformer model is computationally expensive and requires a lot of memory and computational power. This can be a problem for large-scale applications. To address this issue, some researchers have proposed methods to reduce the computation cost of the Transformer model. Another challenge is the training time. The Transformer model can take a long time to train, especially for large-scale applications. To address this issue, some researchers have proposed methods to speed up the training process. Finally, there is the challenge of generalization. The Transformer model performs well on the tasks it is trained on, but it may not generalize well to other tasks. This is because the model may be overfitting to the training data. To address this issue, some researchers have proposed 

**Final Response:**

Transformer models are very powerful and have demonstrated their ability to handle a wide range of NLP tasks. However, there are still some challenges that need to be addressed. One of the main challenges is the computation cost. The Transformer model is computationally expensive and requires a lot of memory and computational power. This can be a problem for large-scale applications. To address this issue, some researchers have proposed methods to reduce the computation cost of the Transformer model. Another challenge is the training time. The Transformer model can take a long time to train, especially for large-scale applications. To address this issue, some researchers have proposed methods to speed up the training process. Finally, there is the challenge of generalization. The Transformer model performs well on the tasks it is trained on, but it may not generalize well to other tasks. This is because the model may be overfitting to the training data. To address this issue, some researchers have proposed methods to improve the generalization ability of the Transformer model.
## Here are some of the challenges faced by Transformer models in NLP, and how they are addressed:

### Computation cost
The Transformer model is computationally expensive and requires a lot of memory and computational power. This can be a problem for large-scale applications. To address this issue, some researchers have proposed methods to reduce the computation cost of the Transformer model. For example, they have proposed methods to reduce the number of parameters in the model, to use smaller networks, or to use less powerful hardware.

### Training time
The Transformer model can take a long time to train, especially for large-scale applications. To address this issue, some researchers have proposed methods to speed up the training process. For example, they have proposed methods to use gradient descent to optimize the model, to use data augmentation, or to use pre-trained models.

### Generalization
The Transformer model performs well on the tasks it is trained on, but it may not generalize well to other tasks. This is because the model may be overfitting to the training data. To address this issue, some researchers have proposed methods to improve the generalization ability of the Transformer model. For example, they have proposed methods to use regularization, to use dropout, or to use ensembles of models.

### Memory consumption
The Transformer model can consume a lot of memory, especially for large-scale applications. To address this issue, some researchers have proposed methods to reduce the memory consumption of the Transformer model. For example, they have proposed methods to use less powerful hardware, to use smaller networks, or to use data augmentation.

In [58]:
from datasets import load_dataset
import evaluate
import requests
import json
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter Notebook
import time

# Load test data
dataset = load_dataset("gigaword", split="test[:100]")  # Limit to 100 for fast eval

# Initialize ROUGE metric
rouge = evaluate.load('rouge')

In [None]:
def summarize_with_vllm(document, max_tokens=50):
    prompt_template = (
        "You are an AI assistant specialized in summarizing news articles. "
        "Summarize the following news sentence into a concise headline.\n\n"

        "Here is an example:\n"
        "News: Japan 's nec corp. and UNK computer corp. of the united states said wednesday they had agreed to join forces in supercomputer sales.\n"
        "Headline: Nec UNK in computer sales tie-up\n\n"

        "Now summarize the following news:\n\n"

        "News: {document}\n\n"
        "Headline:"
    )
    
    prompt = prompt_template.format(document=document)

    payload = {
        "model": "meta-llama/Llama-3.1-8B",
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "stream": False
    }

    response = requests.post("http://localhost:8000/v1/completions", json=payload)

    if response.status_code == 200:
        result = response.json()
        summary = result['choices'][0]['text'].strip()
        return summary
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None

# Generate summaries and evaluate
references = []
predictions = []

start = time.time()

# tqdm around dataset loop with a description and progress bar
for item in tqdm(dataset, desc="Summarizing", unit="example"):

    doc = item['document']
    ref_summary = item['summary']

    pred_summary = summarize_with_vllm(doc)

    if pred_summary:
        references.append(ref_summary)
        predictions.append(pred_summary)
    

end = time.time()

# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("vLLM (Llama-3.1-8B) Summarization Results:")

print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

Summarizing:   0%|          | 0/100 [00:00<?, ?example/s]


ROUGE Results:
rouge1: 0.1827
rouge2: 0.0696
rougeL: 0.1696
rougeLsum: 0.1715
