In [None]:
pip install gradio

Collecting gradio
  Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.1 (from gradio)
  Downloading gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
pip install unsloth

Collecting unsloth
  Downloading unsloth-2024.12.4-py3-none-any.whl.metadata (59 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 59.2/59.2 kB 4.7 MB/s eta 0:00:00
Collecting unsloth_zoo>=2024.11.8 (from unsloth)
  Downloading unsloth_zoo-2024.12.1-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.2-py3-none-any.whl.metadata (9.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloadi

In [25]:
import gradio as gr
from unsloth import FastLanguageModel
import torch

# Initialize the fine-tuned model
# The four settings below are passed unchanged to FastLanguageModel.from_pretrained.
model_name = "skapl/lora_model"  # Replace with your fine-tuned model name
max_seq_length = 512  # Adjust based on your model's settings
dtype = torch.float16  # or float32 depending on your model
load_in_4bit = True  # Set to True if you're using 4-bit precision

# Load the fine-tuned Dungeon Master model
# `model` and `tokenizer` are module-level names that respond_to_player() reads
# as globals. NOTE: a later cell rebinds both names by loading the same
# checkpoint again with different settings.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Enable inference optimizations for the model
FastLanguageModel.for_inference(model)

def respond_to_player(input_text: str, history: list, state: str):
    """
    Generate a Dungeon Master's response based on the player's action or input.

    Args:
        input_text: The player's latest message.
        history: Earlier turns as ``(player_message, dm_reply)`` pairs.
            ``None`` is treated as an empty history.
        state: Legacy "n"/"y" first-turn flag. It is still accepted and
            returned so the existing Gradio wiring keeps working, but it no
            longer decides whether history is replayed: the hidden input
            textbox that supplies it is never updated from the output
            textbox, so it stayed "n" and earlier turns were always dropped.

    Returns:
        A 3-tuple ``(reply, updated_history, "y")`` where ``reply`` is only
        the newly generated Dungeon Master text and ``updated_history`` is
        ``history`` plus the new ``(input_text, reply)`` pair.
    """
    # Copy so the caller's list is never mutated; also tolerates None.
    history = list(history or [])

    # Build the conversation history
    system_message = (
        "You are the Dungeon Master in a Dungeons & Dragons game. Guide the player through an adventure, "
        "describe environments, interact with characters, and generate the world-building narrative."
    )
    messages = [{"role": "system", "content": system_message}]

    # Replay every earlier turn. An empty history (first interaction) simply
    # contributes nothing, so no separate first-turn flag is needed.
    # NOTE(review): history is replayed in full; long sessions may outgrow the
    # context length configured when the model was loaded - consider trimming.
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})

    # Add the player's current input
    messages.append({"role": "user", "content": input_text})

    # Tokenize the chat into a tensor of prompt token ids on the GPU
    prompt_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Generate the Dungeon Master's response. `temperature` only takes effect
    # when sampling is enabled, so request it explicitly.
    generated_ids = model.generate(
        input_ids=prompt_ids,
        max_new_tokens=400,
        do_sample=True,
        temperature=0.8,
    )

    # The returned sequence starts with the prompt tokens; keep only the
    # continuation so the reply does not echo the system message and history.
    new_token_ids = generated_ids[0][prompt_ids.shape[-1]:]
    generated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    state = "y"  # Update state after the first interaction

    # Return the response and updated history
    return generated_text, history + [(input_text, generated_text)], state




# Create a Gradio Interface
def launch_dungeon_master_interface():
    """Build the Dungeon Master Gradio UI around respond_to_player and serve it.

    The interface takes three inputs (player text, conversation history held
    in a gr.State, and a hidden textbox carrying the "n"/"y" flag) and exposes
    the three matching outputs returned by respond_to_player. It is launched
    with debug=True.
    """
    start_flag = "n"

    # Input side: visible player prompt, per-session history, hidden flag.
    player_box = gr.Textbox(
        label="Player's Input",
        placeholder="Describe your action or speak to the Dungeon Master...",
    )
    history_in = gr.State(value=[])
    flag_in = gr.Textbox(value=start_flag, label="State", visible=False)

    # Output side: DM reply, refreshed history, hidden refreshed flag.
    reply_box = gr.Textbox(label="Dungeon Master's Response")
    history_out = gr.State()
    flag_out = gr.Textbox(label="Updated State", visible=False)

    dm_app = gr.Interface(
        fn=respond_to_player,
        inputs=[player_box, history_in, flag_in],
        outputs=[reply_box, history_out, flag_out],
        title="Dungeon Master AI",
        description="This is a Dungeon Master assistant for your Dungeons & Dragons adventures. Provide an input to interact with the DM!",
    )

    # Start serving the UI.
    dm_app.launch(debug=True)

# Run the interface
# Executing this cell launches the Dungeon Master UI. Because the launch uses
# debug=True, the cell keeps running until it is interrupted (see the
# "This cell will run indefinitely" notice in the output below).
if __name__ == "__main__":
    launch_dungeon_master_interface()

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://9892a28c09874deb15.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://a055af8e970b9b74fb.gradio.live
Killing tunnel 127.0.0.1:7862 <> https://ab94a41fec3d8e131e.gradio.live
Killing tunnel 127.0.0.1:7863 <> https://9026e4036132f00f8c.gradio.live
Killing tunnel 127.0.0.1:7864 <> https://9892a28c09874deb15.gradio.live


In [None]:
pip show bitsandbytes

Name: bitsandbytes
Version: 0.45.0
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/bitsandbytes-foundation/bitsandbytes
Author: Tim Dettmers
Author-email: dettmers@cs.washington.edu
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, torch, typing_extensions
Required-by: 


In [None]:
import gradio as gr
from unsloth import FastLanguageModel
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# === Configuration ===
# These four constants are passed unchanged to FastLanguageModel.from_pretrained below.
MODEL_NAME = "skapl/lora_model"  # Replace with the actual fine-tuned model name
MAX_SEQ_LENGTH = 512  # Adjust based on your model's requirements
DTYPE = torch.float16  # Use float32 or float16 depending on your setup
LOAD_IN_4BIT = False  # Set to True to load the model in 4-bit precision

# === Load Models ===
# Load the fine-tuned language model (the `lora_model` checkpoint).
# `model` and `tokenizer` are module-level names read by describe_image().
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
FastLanguageModel.for_inference(model)  # Optimize the model for inference

# Load BLIP image captioning model and processor
# Note: neither object is moved to a GPU here; describe_image() feeds the
# processor's output straight into image_model without a device transfer.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
image_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# === Define Functionality ===
def describe_image(image: Image.Image):
    """
    Generate a description for an uploaded image.

    A short caption is produced with BLIP and then expanded by the fine-tuned
    language model.

    Args:
        image: The uploaded picture as a PIL image, or ``None`` when the form
            was submitted without an upload.

    Returns:
        The language model's description as a string (newly generated text
        only), or a short prompt asking for an image when none was supplied.
    """
    # Submitting the form without an upload passes None; answer politely
    # instead of failing inside the BLIP processor.
    if image is None:
        return "Please upload an image to describe."

    # Step 1: Generate a basic caption using BLIP
    inputs = processor(images=image, return_tensors="pt")
    caption_ids = image_model.generate(**inputs)
    blip_caption = processor.decode(caption_ids[0], skip_special_tokens=True)

    # Step 2: Enhance the caption using the fine-tuned language model
    prompt = f"Describe this image: {blip_caption}"
    model_inputs = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are an image description assistant."},
            {"role": "user", "content": prompt},
        ],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Generate a detailed description. `temperature` only takes effect when
    # sampling is enabled, so request it explicitly.
    generated_ids = model.generate(
        input_ids=model_inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.8,
    )

    # The returned sequence starts with the prompt tokens; decode only the
    # continuation so the system/user prompt is not echoed back to the user.
    new_token_ids = generated_ids[0][model_inputs.shape[-1]:]
    detailed_description = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    return detailed_description

# === Gradio Interface ===
# One image upload in, one text description out; describe_image does the work.
image_description_app = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="Image Description"),
    title="Image Description Assistant",
    description="Upload an image, and the bot will describe it with rich details.",
)
# Serve the UI with default launch options.
image_description_app.launch()

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9026e4036132f00f8c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


