In [6]:
# Reasoning Instruction
SYSTEM_PROMPT = """
A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <thinking> </thinking> and
<answer> </answer> tags, respectively, i.e., <thinking> reasoning process here </thinking><answer> answer here </answer>.
Response Format rules:
- Always start your response with <thinking> tag and end with </answer>.
- Do not include any text or commentary before the opening <thinking> tag or after the closing </answer> tag.
- Do not include any text or commentary between the closing </thinking> tag and the opening <answer> tag.
For example, your response follow this format:
<thinking>
[Your detailed chain-of-thought goes here]
</thinking>
<answer>
[Your final answer goes here]
</answer>
"""

In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "How to add two numbers in Python?\n"},
]
input_text = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=256, temperature=0.2, top_p=0.9, do_sample=True, use_cache=False)
print(tokenizer.decode(outputs[0]))


model.safetensors:  61%|######    | 2.09G/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

<|im_start|>system

A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <thinking> </thinking> and
<answer> </answer> tags, respectively, i.e., <thinking> reasoning process here </thinking><answer> answer here </answer>.
Response Format rules:
- Always start your response with <thinking> tag and end with </answer>.
- Do not include any text or commentary before the opening <thinking> tag or after the closing </answer> tag.
- Do not include any text or commentary between the closing </thinking> tag and the opening <answer> tag.
For example, your response follow this format:
<thinking>
[Your detailed chain-of-thought goes here]
</thinking>
<answer>
[Your final answer goes here]
</answer>
<|im_end|>
<|im_start|>user
How to add two numbers in Python?
<|im_end|>
<|im_start|>assistant


In [14]:
import re
import os
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, get_cosine_schedule_with_warmup

# Reasoning Instruction
SYSTEM_PROMPT = """
A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <thinking> </thinking> and
<answer> </answer> tags, respectively, i.e., <thinking> reasoning process here </thinking><answer> answer here </answer>.
Response Format rules:
- Always start your response with <thinking> tag and end with </answer>.
- Do not include any text or commentary before the opening <thinking> tag or after the closing </answer> tag.
- Do not include any text or commentary between the closing </thinking> tag and the opening <answer> tag.
For example, your response follow this format:
<thinking>
[Your detailed chain-of-thought goes here]
</thinking>
<answer>
[Your final answer goes here]
</answer>
"""

# Helpers

def get_user_prompt(prompt: str) -> str:
    match = re.search(r"<\|im_start\|>user\s*(.*?)\s*<\|im_end\|>", prompt, re.DOTALL)
    if match:
        return match.group(1).strip()
    lines = prompt.splitlines()
    result = []
    for line in lines:
        if not line.strip().lower().startswith("system"):
            if line.strip().lower().startswith("user"):
                result.append(line.strip()[4:].strip())
            else:
                result.append(line)
    return "\n".join(result).strip()

def get_assistant_response(text: str) -> str:
    match = re.search(r"<\|im_start\|>assistant\s*(.*?)\s*<\|im_end\|>", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    lines = text.splitlines()
    result = []
    capture = False
    for line in lines:
        stripped = line.strip()
        if stripped.lower().startswith("assistant"):
            capture = True
            continue
        if capture:
            result.append(line)
    return "\n".join(result).strip()

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str:
    if "####" not in text:
        return text.strip()
    return text.split("####", 1)[1].strip()

def count_xml(text: str) -> float:
    count = 0.0
    if text.count("<thinking>\n") == 1:
        count += 0.225
    if text.count("\n</thinking>\n") == 1:
        count += 0.225
    if text.count("\n<answer>\n") == 1:
        count += 0.225
        count -= len(text.split("\n</answer>")[-1]) * 0.001
    if text.count("\n</answer>\n") == 1:
        count += 0.225
        count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001
    return count

def inference(prompt: str, model_path: str) -> str:
    device = "cuda"
    model_infer = AutoModelForCausalLM.from_pretrained(model_path).to(device)
    tokenizer_infer = AutoTokenizer.from_pretrained(model_path)
    inputs = tokenizer_infer(prompt, return_tensors="pt", max_length=256, truncation=False)
    outputs = model_infer.generate(
        inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=256,
        
        pad_token_id=tokenizer_infer.eos_token_id,
        temperature=0.2,
        num_return_sequences=1,
        top_p=0.9, do_sample=True, use_cache=False
    )
    full_text = tokenizer_infer.decode(outputs[0])
    user_question = get_user_prompt(prompt)
    assistant_response = get_assistant_response(full_text)
    extracted_answer = extract_xml_answer(assistant_response)
    return f"{'='*10} Inference {'='*10}\nQuestion:\n{user_question}\n\nModel Response:\n{assistant_response}\n\nExtracted:\n{extracted_answer}\n{'='*12} End {'='*12}\n"


inference("How to add two numbers in Python?\n", "HuggingFaceTB/SmolLM2-1.7B-Instruct")



In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
2025-02-15 17:54:57.311983: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-15 17:54:57.322513: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739631297.333230  351737 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739631297.336533  351737 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-15 17:54:57.349669: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [5]:
# 1. Model ve Tokenizer'ı Yükle
checkpoint = "HuggingFaceTB/SmolLM2-360M-Instruct"  # Mistral 7B modeli


In [12]:


device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

messages = [{"role": "system", "content": "Senin adın Mia"},{"role": "user", "content": "Adın ne?"}]
input_text=tokenizer.apply_chat_template(messages, tokenize=False)
print(input_text)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=512, temperature=0.2, top_p=0.9, do_sample=True)

outputs

<|im_start|>system
Senin adın Mia<|im_end|>
<|im_start|>user
Adın ne?<|im_end|>



tensor([[    1,  9690,   198, 27025,   254,   493, 20335,    94, 21230,     2,
           198,     1,  4093,   198,  4503, 20335,    94,   420,    47,     2,
           198,     1,   520,  9531,   198,  4503, 20335,    94,   420,    47,
             2]], device='cuda:0')

In [13]:
print(tokenizer.decode(outputs[0]))

<|im_start|>system
Senin adın Mia<|im_end|>
<|im_start|>user
Adın ne?<|im_end|>
<|im_start|>assistant
Adın ne?<|im_end|>
