In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
import torch
from tqdm import tqdm  # Import tqdm for progress bar

# Free MPS memory before running
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
    torch.mps.synchronize()

# Load tokenizer and model
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Check if MPS (Apple GPU) is available, otherwise use CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load model onto the selected device
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

# Set pad token ID (to avoid warnings)
tokenizer.pad_token = tokenizer.eos_token


# Custom stopping criteria (to limit token count manually)
class MaxTokenStoppingCriteria(StoppingCriteria):
    def __init__(self, max_new_tokens):
        self.max_new_tokens = max_new_tokens
        self.current_tokens = 0

    def __call__(self, input_ids, scores, **kwargs):
        self.current_tokens += 1
        return self.current_tokens >= self.max_new_tokens


while True:
    # 사용자 입력 받기
    user_input = input("\n문장을 입력하세요: ")

    # 입력 문장 토큰화 (Set attention_mask to avoid unexpected behavior)
    inputs = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Maximum length of generated text
    max_length = 100  # Reduce length for better performance

    # Create a progress bar
    print("\nGenerating text...")
    with tqdm(total=max_length, desc="Generating", unit="token") as pbar:
        stopping_criteria = StoppingCriteriaList([MaxTokenStoppingCriteria(max_length)])

        # Step-by-step token generation
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_length,  # Only generate `max_length` tokens
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,  # Enable sampling for more diverse output
            stopping_criteria=stopping_criteria
        )

        # Update tqdm progress bar dynamically
        generated_tokens = output.shape[1] - input_ids.shape[1]
        pbar.update(generated_tokens)

    # 생성된 문장 디코딩
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # 결과 출력
    print("\n=== 생성된 텍스트 ===")
    print(generated_text)


In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
import torch
from tqdm import tqdm  # Import tqdm for progress bar

# Free MPS memory before running
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
    torch.mps.synchronize()

# Load tokenizer and model
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Check if MPS (Apple GPU) is available, otherwise use CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load model onto the selected device
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

# Set pad token ID (to avoid warnings)
tokenizer.pad_token = tokenizer.eos_token


# Custom stopping criteria (to limit token count manually)
class MaxTokenStoppingCriteria(StoppingCriteria):
    def __init__(self, max_new_tokens):
        self.max_new_tokens = max_new_tokens
        self.current_tokens = 0

    def __call__(self, input_ids, scores, **kwargs):
        self.current_tokens += 1
        return self.current_tokens >= self.max_new_tokens


while True:
    # 사용자 입력 받기
    user_input = input("\n문장을 입력하세요: ")

    # 입력 문장 토큰화 (Set attention_mask to avoid unexpected behavior)
    inputs = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Maximum length of generated text
    max_length = 100  # Reduce length for better performance

    # Create a progress bar
    print("\nGenerating text...")
    with tqdm(total=max_length, desc="Generating", unit="token") as pbar:
        stopping_criteria = StoppingCriteriaList([MaxTokenStoppingCriteria(max_length)])

        # Step-by-step token generation
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_length,  # Only generate `max_length` tokens
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,  # Enable sampling for more diverse output
            stopping_criteria=stopping_criteria
        )

        # Update tqdm progress bar dynamically
        generated_tokens = output.shape[1] - input_ids.shape[1]
        pbar.update(generated_tokens)

    # 생성된 문장 디코딩
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # 결과 출력
    print("\n=== 생성된 텍스트 ===")
    print(generated_text)


Using device: mps



문장을 입력하세요:  안녕



Generating text...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating: 100%|██████████████████████████| 100/100 [00:18<00:00,  5.37token/s]



=== 생성된 텍스트 ===
안녕, machine learning beginner. Want to learn from scratch the deep learning part.

I have a question: what is the best way to start with neural networks?

My initial thought was that I should just use Keras or TensorFlow and write some code.

But when I tried it before, things didn't go well; my models were overfitting on training data.

So perhaps another approach would be better.

Wait, maybe using PyTorch? It's an alternative framework for building models quickly and having control



문장을 입력하세요:  한국말로 대답해줘



Generating text...


Generating: 100%|██████████████████████████| 100/100 [00:05<00:00, 17.48token/s]



=== 생성된 텍스트 ===
한국말로 대답해줘! 2017년 5월 3일 라는 기사의 Readers Response를 주세요. (가이mtplexon)
<pre><code>
</code></pre>

```
def get_message():
    return 'The answer is "This is the correct response to your question."'

message = get_message()
print(message)

Wait, but you need to make this function work in a way that when called with any arguments, it returns an empty string.

So


KeyboardInterrupt: Interrupted by user