In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub identifier of the checkpoint to load. The Qwen2.5 collection page is
# linked here for reference:
# https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load the causal LM first, then its matching tokenizer. Both the dtype and the
# device placement are left to the library's "auto" setting.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
# Temporary storage: folder in which the conversation JSON file is saved.
# With an empty string no folder is created and the file path is just the
# file name, i.e. relative to the current working directory.
conversation_storage = ""

# OPTIONAL: Store messages in Drive
# conversation_storage = "/content/drive/MyDrive/ColabGPT"

In [None]:
import os
import json
import time

# Interactive chat loop.
#
# Reads user turns from stdin, generates a reply with `model`/`tokenizer`, and
# rewrites the whole conversation to a timestamped JSON file after every
# assistant turn. Typing "exit" (any case, surrounding whitespace ignored)
# ends the loop.

# Create the storage folder when one is configured. `exist_ok=True` avoids the
# check-then-create race of a separate `os.path.exists` test. When
# `conversation_storage` is empty, no folder is created and the file is
# written relative to the current working directory.
if conversation_storage:
    os.makedirs(conversation_storage, exist_ok=True)
convesation_name = f"conversation_{time.strftime('%Y%m%d-%H%M%S')}.json"
conversation_path = os.path.join(conversation_storage, convesation_name)

# Running chat history in the role/content format passed to the chat template.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
]

while True:
    user_input = input("User: ")

    # Check for the exit command BEFORE recording the turn, so that "exit"
    # never ends up in the conversation history.
    if user_input.strip().lower() == "exit":
        print("Exiting...")
        break

    messages.append({"role": "user", "content": user_input})

    try:
        # Render the full history into a single prompt string that ends with
        # the generation prompt for the assistant turn.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # Sample a reply.
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=256, # Reduce max_new_tokens if memory is still an issue.
            do_sample=True,     # Enable sampling for more diverse outputs
            top_p=0.9,          # Nucleus sampling
            temperature=0.7,    # Control randomness of generation
        )

        # Slice the prompt tokens off the front of each output sequence so
        # that only the newly generated tokens are decoded.
        new_token_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
    except Exception as e:
        print(f"Error during generation: {e}")
        break

    # Print the reply in fixed-width chunks of 80 characters.
    print("Assistant:")
    for i in range(0, len(response), 80):
        print(response[i:i+80])

    messages.append({"role": "assistant", "content": response})

    # Persist the whole history after every assistant turn. Handled separately
    # from generation so that a disk problem is reported as such.
    try:
        with open(conversation_path, "w", encoding="utf-8") as f:
            json.dump(messages, f, indent=4)
    except OSError as e:
        print(f"Error saving conversation: {e}")
        break
