In [1]:
import threading
import time

# Define a semaphore with 2 slots
semaphore = threading.Semaphore(1)

def access_resource(thread_number):
    print(f"Thread {thread_number} is attempting to access the resource.")
    # Acquire a slot
    semaphore.acquire()
    print(f"Thread {thread_number} has accessed the resource.")
    # Simulate some resource-intensive work
    time.sleep(1)
    print(f"Thread {thread_number} is releasing the resource.")
    # Release the slot
    semaphore.release()

# Create and start 5 threads
threads = []
for i in range(5):
    thread = threading.Thread(target=access_resource, args=(i,))
    threads.append(thread)
    thread.start()

# Wait for all threads to complete
for thread in threads:
    thread.join()

print("All threads have finished.")


Thread 0 is attempting to access the resource.
Thread 0 has accessed the resource.
Thread 1 is attempting to access the resource.
Thread 2 is attempting to access the resource.
Thread 3 is attempting to access the resource.
Thread 4 is attempting to access the resource.
Thread 0 is releasing the resource.
Thread 1 has accessed the resource.
Thread 1 is releasing the resource.
Thread 2 has accessed the resource.
Thread 2 is releasing the resource.
Thread 3 has accessed the resource.
Thread 3 is releasing the resource.
Thread 4 has accessed the resource.
Thread 4 is releasing the resource.
All threads have finished.


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
inputs = tok(["An increasing sequence: one,"], return_tensors="pt")
streamer = TextStreamer(tok)

# Despite returning the usual output, the streamer will also print the generated text to stdout.
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 8/8 [03:24<00:00, 25.54s/it]
Loading checkpoint shards: 100%|██████████| 8/8 [00:14<00:00,  1.81s/it]

<s> An increasing sequence: 




one, three, five, seven, nine, etc.
A decreasing sequence: ten, nine,


In [2]:
_ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)

<s> An increasing sequence: one, three, five, seven, nine, etc.
A decreasing sequence: ten, nine,


In [4]:
import asyncio

# Shared resource and lock
shared_resource = "initial_value"
resource_lock = asyncio.Lock()

async def access_resource():
    """A coroutine that simulates accessing and modifying the shared resource."""
    async with resource_lock:
        # Simulate some work with the resource
        print("Accessing the shared resource.")
        await asyncio.sleep(1)  # Simulate an operation that takes time

async def attempt_access_with_timeout(timeout):
    """Attempts to access the shared resource with a specified timeout."""
    try:
        await asyncio.wait_for(access_resource(), timeout)
        print("Successfully accessed the shared resource within the timeout.")
    except asyncio.TimeoutError:
        print(f"Could not access the shared resource within {timeout} seconds.")
        # You might want to handle the exception here, depending on your application's needs

async def main():
    # Schedule attempt_access_with_timeout as a background task
    background_task = asyncio.create_task(attempt_access_with_timeout(5))

    # Continue with other steps immediately, without waiting for the above task to complete
    print("Continuing with other operations while the background task runs.")

    # Example of other operations
    #await asyncio.sleep(2)  # Simulate doing something else
    print("Finished other operations.")

    # Optionally, wait for the background task to complete at some point
    await background_task

# Run the main coroutine
await main()


Continuing with other operations while the background task runs.
Finished other operations.
Accessing the shared resource.
Successfully accessed the shared resource within the timeout.


In [1]:
import torch
from transformers import pipeline, AutoTokenizer

model_id = 'HuggingFaceH4/zephyr-7b-beta'

tokenizer = AutoTokenizer.from_pretrained(model_id) 

llm_pipeline = pipeline(
            "text-generation",
            model=model_id, 
            tokenizer=tokenizer,
            device='cuda',
            torch_dtype=torch.float16,            
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 8/8 [00:09<00:00,  1.24s/it]


In [3]:
from prompting.llm import CustomTextIteratorStreamer

      
streamer = CustomTextIteratorStreamer(tokenizer=tokenizer)

print('Before pipeline call')
_ = llm_pipeline('Write me a big long text about einstein', max_new_tokens=4096, streamer = streamer)
print('After pipeline call')

i = 0
s = ""
for t in streamer:
    s += t
    
    if 'Write me a big long text about einsteTroubleshoin' not in s:
        continue
    
    print(i, t)
    i+= 1

print('Finish streaming')

Before pipeline call
After pipeline call
0 einstein's 
1 theory 
2 of 
3 
4 
5 relativity. 
6 I 
7 want 
8 to 
9 know 
10 everything 
11 about 
12 
13 it, 
14 from 
15 the 
16 
17 basics 
18 to 
19 the 
20 most 
21 complex 
22 
23 concepts. 
24 Make 
25 it 
26 as 
27 detailed 
28 and 
29 technical 
30 as 
31 
32 possible, 
33 but 
34 also 
35 explain 
36 it 
37 in 
38 a 
39 way 
40 that 
41 a 
42 
43 layman 
44 can 
45 
46 understand. 
47 Use 
48 clear 
49 and 
50 
51 concise 
52 
53 language, 
54 and 
55 provide 
56 examples 
57 and 
58 
59 analogies 
60 where 
61 
62 necessary. 
63 I 
64 want 
65 to 
66 be 
67 able 
68 to 
69 impress 
70 my 
71 friends 
72 with 
73 my 
74 
75 newfound 
76 knowledge 
77 of 
78 
79 
80 
81 relativity!</s>
Finish streaming


In [1]:
from vllm import LLM, SamplingParams

model_id = 'HuggingFaceH4/zephyr-7b-beta'
llm = LLM(model=model_id, gpu_memory_utilization=0.6)

  from .autonotebook import tqdm as notebook_tqdm
2024-03-21 03:08:48,329	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 03-21 03:08:49 llm_engine.py:87] Initializing an LLM engine with config: model='HuggingFaceH4/zephyr-7b-beta', tokenizer='HuggingFaceH4/zephyr-7b-beta', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 03-21 03:08:53 weight_utils.py:163] Using model weights format ['*.safetensors']
INFO 03-21 03:09:03 llm_engine.py:357] # GPU blocks: 5314, # CPU blocks: 2048
INFO 03-21 03:09:05 model_runner.py:684] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-21 03:09:05 model_runner.py:688] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memo

In [2]:
model_kwargs = dict()
temperature = model_kwargs.get("temperature", 0.8)
top_p = model_kwargs.get("top_p", 0.95)
max_tokens = model_kwargs.get("max_tokens", 256)

sampling_params = SamplingParams(
    temperature=temperature, top_p=top_p, max_tokens=max_tokens
)

prompt = 'Tell me a fun history fact'

composed_prompt = f"""<|system|>You are a helpful AI assistant
<|user|>{prompt} 
<|assistant|>"""

output = llm.generate(composed_prompt, sampling_params, use_tqdm=True, stream=True)
output


TypeError: LLM.generate() got an unexpected keyword argument 'stream'

In [1]:
from prompting.llm import HuggingFaceLLM
import torch
from transformers import pipeline, AutoTokenizer

model_id = 'HuggingFaceH4/zephyr-7b-beta'

tokenizer = AutoTokenizer.from_pretrained(model_id) 

llm_pipeline = pipeline(
            "text-generation",
            model=model_id, 
            tokenizer=tokenizer,
            device='cuda',
            torch_dtype=torch.float16,            
)



  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 8/8 [00:09<00:00,  1.15s/it]


In [4]:
streamer = HuggingFaceLLM(llm_pipeline, 'You are a helpful assistant', max_new_tokens=128).stream('Tell me about albert einstein')

print('STREAMER')
for t in streamer:
    print(t)

STREAMER
<|system|>
You are a helpful assistant</s> 
<|user|>
Tell me about albert einstein</s> 
<|assistant|>



Albert 
Einstein 
was 
a 

renowned 


German-born 


physicist 
who 
made 
significant 
contributions 
to 
the 
field 
of 

science, 
most 
notably 
the 
development 
of 
the 
theory 
of 


relativity. 
Born 
in 


Ulm, 

Germany, 
in 





1879, 
Einstein 
showed 
an 
early 

aptitude 
for 

mathematics 
and 

physics. 
He 
went 
on 
to 
earn 
a 
degree 
in 
physics 
from 
the 


Polytechnic 
School 
in 

Zurich 
and 
later 
a 



Ph.D. 
From 
the 
University 
of 


Zurich.








Einstein's 

groundbreaking 
work 
in 
physics 
began 
in 





1905, 
when 
he 
published 
four 
papers 
that 
would 
come 
to 
be 
known 
as 
his 


"annus
