In [None]:
# Colab only: mount Google Drive so files under /content/drive (the test-set
# JSON read further down) are visible to this runtime.
from google.colab import drive
drive.mount('/content/drive')

import time
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader


model_path = "Qwen/Qwen2.5-7B-Instruct"

# Decide the device once and let the dtype follow it: half precision is used
# on GPU only, because float16 kernels on CPU are slow and not available for
# every op in all PyTorch versions.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_dtype = torch.float16 if device == "cuda" else torch.float32

# Left padding puts every prompt's last token in the final column of the
# batch, which is what batched generation with a decoder-only model needs.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left')
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=model_dtype,
    trust_remote_code=True
).to(device)

# Load the prompt-engineering test set (a JSON document) into a DataFrame.
with open("/content/drive/MyDrive/AdSeek/Preprocessing/p_engineering_testsets/p_engineering_testset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
# Fail fast: the inference loop reads df['prompt'], so catch a malformed
# test set here rather than after the model has been downloaded and loaded.
assert 'prompt' in df.columns, "test set must contain a 'prompt' column"

# Number of prompts sent through model.generate per call.
BATCH_SIZE = 8

def batch_generate(prompts, max_new_tokens=20):
    """Greedy-decode a continuation for every prompt in one padded batch.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompts: List of prompt strings. They are tokenized together with
            padding and truncation enabled and moved to the model's device.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        A tuple ``(decoded_outputs, elapsed)``:
        ``decoded_outputs`` is a list with one decoded continuation per
        prompt (the prompt columns are sliced off and special tokens are
        skipped); ``elapsed`` is the wall-clock time in seconds spent in
        ``model.generate`` for the whole batch. An empty list/tuple of
        prompts returns ``([], 0.0)`` without touching the model.
    """
    if isinstance(prompts, (list, tuple)) and len(prompts) == 0:
        # Nothing to generate; do not hand an empty batch to the tokenizer.
        return [], 0.0

    tokenized = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # CUDA kernels are launched asynchronously, so synchronize on both sides
    # of the timed region to keep unrelated queued work out of the number and
    # to make sure generation has really finished when the clock stops.
    on_cuda = model.device.type == "cuda"
    if on_cuda:
        torch.cuda.synchronize(model.device)
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    with torch.inference_mode():
        outputs = model.generate(
            **tokenized,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=1.2
        )
    if on_cuda:
        torch.cuda.synchronize(model.device)
    elapsed = time.perf_counter() - start

    # generate() returns prompt + continuation; drop the (padded) prompt
    # columns so only the newly generated tokens are decoded.
    prompt_len = tokenized["input_ids"].shape[1]
    decoded_outputs = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)
    return decoded_outputs, elapsed


# Push the whole test set through the model in slices of BATCH_SIZE prompts,
# collecting one prediction and one latency figure per prompt.
batched_preds, inference_times = [], []
prompt_column = df['prompt']
for offset in range(0, len(df), BATCH_SIZE):
    chunk = prompt_column.iloc[offset:offset + BATCH_SIZE].tolist()
    completions, batch_seconds = batch_generate(chunk)
    batched_preds += completions
    # A batch is timed as a whole, so each prompt in it is assigned an equal
    # share of the batch's elapsed time.
    inference_times += [batch_seconds / len(chunk)] * len(chunk)

df['y_pred'] = batched_preds
df['inference_time'] = inference_times

# Persist prompts, predictions and timings as a list of JSON records.
# NOTE(review): the file name says "qwen05" while model_path points at the
# 7B checkpoint — confirm the intended output name.
df.to_json("qwen05_pe_results.json", orient="records", force_ascii=False, indent=2)


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]



In [None]:
# Distribution of the per-prompt latency (seconds) recorded during inference.
per_prompt_seconds = df['inference_time']
per_prompt_seconds.describe()

Unnamed: 0,inference_time
count,1355.0
mean,0.155268
std,0.030821
min,0.116472
25%,0.140402
50%,0.147929
75%,0.163277
max,0.367064


In [None]:
# Total generation time over the test set, in seconds. Series.sum() performs
# the reduction vectorized instead of iterating the Series in Python; float()
# keeps the displayed value a plain Python number.
float(df['inference_time'].sum())

210.38796329498288