## Transformer inference

In [1]:
import os
import time
import json
from tqdm import tqdm
from string import Template

from essential_generators import DocumentGenerator

In [2]:
print(f"\nPATH variable is set to\n\t{os.environ['PATH']}")

print(f"\nPYTHONPATH variable is set to\n\t{os.environ['PYTHONPATH']}")

print(f"\nHUGGINGFACE_HUB_CACHE variable is set to\n\t{os.environ['HUGGINGFACE_HUB_CACHE']}")


PATH variable is set to
	/pbs/throng/training/universite-hiver/wschooloptim/vllm/bin:/pbs/software/redhat-9-x86_64/python/3.11.4/bin:/usr/share/Modules/bin:/usr/local/cuda/bin:/pbs/software/redhat-9-x86_64/latex/2025/bin/x86_64-linux:/pbs/software/redhat-9-x86_64/jnp/3.11/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

PYTHONPATH variable is set to
	/pbs/throng/training/universite-hiver/wschooloptim/vllm/lib/python3.11/site-packages/

HUGGINGFACE_HUB_CACHE variable is set to
	/pbs/throng/training/universite-hiver/cache/huggingface


### (1) Promts preparation

In [6]:
system_content = "You are an expert in politics."

In [4]:
text1 = """Gaullist/National and Popular Sovereignty.
Co-founder of the Citizens' Political Movement.
Join us, Become a Member ‚§µÔ∏è, Help us save France!"""

text2 = """The heart that beats on the left ‚Ä¢ With @egregoire

for a greater Paris
‚Ä¢ Leads @coeurgauche75
‚Ä¢ üó£Ô∏èüá™üá∫ in the @partisocialiste

and üá´üá∑ in @pes_pse
‚Ä¢ Activist @psparis10"""

### (2) Transformer model

### (2.1) Load the model

In [5]:
!nvidia-smi

Thu Dec 18 12:19:33 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40S                    Off |   00000000:82:00.0 Off |                    0 |
| N/A   47C    P0             87W /  350W |       0MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
import transformers
import torch


# zephyr
model_id = "HuggingFaceH4/zephyr-7b-beta"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"dtype": torch.bfloat16},
    device_map="auto",
     pad_token_id=128001
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Device set to use cuda:0


In [7]:
!nvidia-smi

Thu Dec 18 12:19:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40S                    Off |   00000000:82:00.0 Off |                    0 |
| N/A   48C    P0             95W /  350W |   14253MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### (3) Request the model answer

#### (3.1) A single answer

In [7]:
instructions = """
Please classify the following social media profile according to whether or not it expresses support
for or a positive attitude toward right-wing American parties. Here is the message: '${text}'"""

start = time.time()

outputs = pipeline(
    text_inputs=[
        {"role": "system", "content": system_content},
        {"role": "user", "content": Template(instructions).substitute(text=text1)},
    ],
    temperature=0.6,
    top_p=0.9, # The model will only consider the results of the tokens with top_p 90% probability mass.
    max_new_tokens=32 # ~ 0.75 * 32 = 24 words
)

elapsed = time.time() - start

# answer = outputs[0]["generated_text"][-1]["content"]
print(f"ANSWER:\n\t{outputs}")

print(f"Took {elapsed} seconds for one request,\nthis means {elapsed * 1e6 / (24 * 60 * 60):.1f} days per 1 millon of requests.")

ANSWER:
	[{'generated_text': [{'role': 'system', 'content': 'You are an expert in politics.'}, {'role': 'user', 'content': "\nPlease classify the following social media profile according to whether or not it expresses support\nfor or a positive attitude toward right-wing American parties. Here is the message: 'Gaullist/National and Popular Sovereignty.\nCo-founder of the Citizens' Political Movement.\nJoin us, Become a Member ‚§µÔ∏è, Help us save France!'"}, {'role': 'assistant', 'content': 'Based on the provided message, it appears that the social media profile expresses support for right-wing American parties, specifically Gaullist/National and Popular S'}]}]
Took 1.0987176895141602 seconds for one request,
this means 12.7 days per 1 millon of requests.


In [8]:
instructions_concise = """
Please classify the following social media profile according to whether or not it expresses support
for or a positive attitude toward right-wing American parties. Be concise and answer only YES or NO. Here is the message: '${text}'"""

def make_promts_concise(text):
    return 
    
start = time.time()
outputs = pipeline(
    text_inputs=[
        {"role": "system", "content": system_content},
        {"role": "user", "content": Template(instructions_concise).substitute(text=text2)},
    ],
    temperature=0.6,
    top_p=0.9, # The model will only consider the results of the tokens with top_p 90% probability mass.
    max_new_tokens=32 # ~ 0.75 * 32 = 24 words
)
elapsed = time.time() - start

answer = outputs[0]["generated_text"][-1]["content"]
print(f"ANSWER:\n\t{answer}")

print(f"Took {elapsed} seconds for one request,\nthis means {elapsed * 1e6 / (24 * 60 * 60):.1f} days per 1 millon of requests.")

ANSWER:
	NO, the social media profile expressed support for left-wing American and French parties. There is no indication of support for right-wing American parties.
Took 0.7370374202728271 seconds for one request,
this means 8.5 days per 1 millon of requests.


#### (3.2) Many answers

In [9]:
def make_promts(text):
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": Template(instructions_concise).substitute(text=text)},
    ]

In [10]:
from string import Template

nb_texts = 100
gen = DocumentGenerator()
list_of_texts = [gen.sentence() for _ in range(nb_texts)]
list_of_messages = [make_promts(text) for text in list_of_texts]

In [11]:
print(f"Mean text lenght is {sum([len(t) for t in list_of_texts]) / nb_texts} words.")

Mean text lenght is 62.06 words.


In [12]:
start = time.time()
outputs = pipeline(
        list_of_messages,
        temperature=0.6,
        top_p=0.9,
        max_new_tokens=32
    )
elapsed  = time.time() - start
per_request_time = elapsed / nb_texts

In [13]:
print(f"Took {per_request_time} seconds per request,\nthis means {per_request_time * 1e6 / (24 * 60 * 60):.1f} days per 1 millon of requests.")

Took 0.7435340237617493 seconds per request,
this means 8.6 days per 1 millon of requests.
