In [None]:
! pip install liquid-audio
# ! pip install "liquid-audio [demo]" # optional, to install demo dependencies
# ! pip install flash-attn --no-build-isolation



In [None]:
import torch
import torchaudio
from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState, LFMModality

# Repository id handed to both from_pretrained() calls below
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"

# Build the processor, then keep whatever .eval() returns
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
processor = processor.eval()

# Same two steps for the model itself
model = LFM2AudioModel.from_pretrained(HF_REPO)
model = model.eval()

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
from datasets import load_dataset, Audio
from pathlib import Path

# Location of the tab-separated manifest, and the prefix that is later joined
# in front of every audio path read from it.
data_path = "/content/drive/MyDrive/DLS Speech/local_datasets/weather_dataset_with_tts_v2.tsv"
audio_paths_prefix = "/content/drive/MyDrive/DLS Speech/"

# Options for the generic "csv" loader; the manifest is read as a single
# "train" split with explicitly supplied column names.
load_kwargs = dict(
    data_files=data_path,
    split="train",
    column_names=["user_text", "user_audio", "assistant_text"],  # change to your columns
    delimiter="\t",
)
ds = load_dataset("csv", **load_kwargs)


def add_audio_path_prefix(example):
    """Join ``audio_paths_prefix`` in front of the row's ``user_audio`` path.

    The row is updated in place and the very same mapping is returned, which
    is the shape expected by a per-example ``map`` callback.
    """
    prefixed = Path(audio_paths_prefix).joinpath(Path(example["user_audio"]))
    example["user_audio"] = str(prefixed)
    return example

# Prepare the dataset in one chained pass:
#   1. join the prefix in front of every audio path,
#   2. copy the resulting path string into a new "user_audio_path" column,
#   3. cast "user_audio" to a decoded audio feature at 16 kHz.
ds = (
    ds.map(add_audio_path_prefix)
    .map(lambda batch: {"user_audio_path": batch["user_audio"]}, batched=True)
    .cast_column("user_audio", Audio(sampling_rate=16000, decode=True))
)

# 90/10 split, shuffled with a fixed seed
splits = ds.train_test_split(test_size=0.1, seed=3407, shuffle=True)
train_ds, test_ds = splits["train"], splits["test"]

# Display a single training row
train_ds[100]

{'user_text': 'Weather update for Phoenix right now, please.',
 'user_audio': <datasets.features._torchcodec.AudioDecoder at 0x78a466f1c740>,
 'assistant_text': '{"tool_call": {"name": "weather.get_forecast", "arguments": {"city": "Phoenix"}}}',
 'user_audio_path': '/content/drive/MyDrive/DLS Speech/local_datasets/audio/000416.wav'}

In [None]:
# System prompt that asks the model for JSON tool calls.
# The embedded template must itself be valid JSON: a trailing comma after the
# last property is rejected by strict parsers and is easily copied by the model.
# NOTE(review): the tool signature lists a `date` argument that the output
# template does not include — confirm which of the two is intended.
SYSTEM_PROMPT_JSON = """
You are a tool-using assistant.
You can respond with JSON tool call if the tool usage is required based on the user's request.
You need to respond with a tool call if the user asks about weather.
If the user asks something else unrelated to weather, tool is not needed, respond to user's query in plain text as usual

Available tools (names are exact; use only these names):
- weather.get_forecast(city: string, date: YYYY-MM-DD)
  - Returns the weather in a given city.

Output rules for tool call:
- Output format:
  {
    "tool_call": {
      "name": "weather.get_forecast",
      "arguments": {
        "city": "CITY"
      }
    }
  }
- Do not add any extra properties, arrays, comments, or text before/after the JSON.
"""

In [None]:
""" Testing text-only inputs
"""

SYSTEM_PROMPT_V2 = """You are a tool-using assistant.
You MUST respond with exactly one XML tool call without any surrounding text if the tool usage is required (user is asking about the weather), otherwise respond to user's request in plain text.
Template: <tool_call><name>weather.get_forecast</name><arguments><city>CITY</city></arguments></tool_call>. If tool usage is not required answer user's question shortly with plain text
"""

# Text request used for this run; uncomment one of the alternatives to try
# a different query.
user_query = "What's the current temperature in Voronezh?"
# user_query = "Is it rainy in Astana?"
# user_query = "I am trying to understand how CTC training in ASR works. Can you briefly explain that?"
# user_query = "What is going to happen to me in 6 months if I run 5 km every single day?"

# Set up inputs for the model: one system turn, then one user turn
chat = ChatState(processor)
for role, text in (("system", SYSTEM_PROMPT_V2), ("user", user_query)):
    chat.new_turn(role)
    chat.add_text(text)
    chat.end_turn()

# start an assistant turn before feeding inputs to the model
chat.new_turn("assistant")

In [None]:
# Stream the reply: print each generated item that holds exactly one element,
# decoded as text, and skip everything else.
for token in model.generate_sequential(**chat, max_new_tokens=512):
    if token.numel() != 1:
        continue
    print(processor.text.decode(token), end="", flush=True)

Running 5 km every single day for 6 months could lead to several health impacts, both positive and negative. Here are some potential outcomes:

1. **Physical Health**: 
   - **Benefits**: Regular running can improve cardiovascular health, increase muscle strength, enhance flexibility, and aid in weight management.
   - **Risks**: Over time, excessive running can lead to injuries such as stress fractures, tendonitis, or muscle strains. It can also cause chronic fatigue and overtraining syndrome.

2. **Mental Health**: 
   - Running can be a great mood booster and help reduce symptoms of depression and anxiety. It can also improve sleep quality and increase energy levels.

3. **Lifestyle**: 
   - Consistent running can lead to a more active lifestyle, encouraging other healthy habits like better nutrition and more physical activity.

4. **Time Management**: 
   - Running 5 km daily requires a significant time commitment, which might impact other aspects of life, such as work, family, or 

In [None]:
""" Testing ASR performance
"""

for sample in test_ds.select(range(5)):
    # Read the waveform from the stored file path
    user_audio = sample["user_audio_path"]
    wav, sampling_rate = torchaudio.load(user_audio)

    chat = ChatState(processor)

    # System turn.
    # This model is super sensitive to prompt formulation, better follow example prompts
    # https://github.com/Liquid4All/liquid-audio
    chat.new_turn("system")
    chat.add_text("Perform ASR.")
    chat.end_turn()

    # User turn carries only the audio
    chat.new_turn("user")
    chat.add_audio(wav, sampling_rate)
    chat.end_turn()

    # Open the assistant turn that generation will fill in
    chat.new_turn("assistant")

    print(f"Ground truth transcript:\n{sample['user_text']}\n")
    print("Recognized speech:")

    # Stream generated items that hold exactly one element as decoded text
    for token in model.generate_sequential(**chat, max_new_tokens=512):
        if token.numel() != 1:
            continue
        print(processor.text.decode(token), end="", flush=True)
    print("\n==================\n")


Ground truth transcript:
Is it cold in Granada right now?

Recognized speech:
Is it cold in Granada right now?<|im_end|>

Ground truth transcript:
Give me the current weather conditions in Miami.

Recognized speech:
Give me the current weather conditions in Miami.<|im_end|>

Ground truth transcript:
Is it foggy in Melbourne right now?

Recognized speech:
Is it foggy in Melbourne right now?<|im_end|>

Ground truth transcript:
Is it foggy in Geneva right now?

Recognized speech:
Is it foggy in Geneva right now?<|im_end|>

Ground truth transcript:
Is it stormy in San Diego at the moment?

Recognized speech:
Is it stormy in San Diego at the moment?<|im_end|>



In [None]:
"""Maybe the model can do both ASR and tool-call in a single go
"""
for sample in test_ds.select(range(5)):
    user_audio = sample["user_audio_path"]

    chat = ChatState(processor)

    chat.new_turn("system")
    # SYSTEM_PROMPT_V2 ends with a newline; strip trailing whitespace so the
    # appended sentence continues the prompt instead of leaving a stray ". "
    # at the start of a new line.
    chat.add_text(f"{SYSTEM_PROMPT_V2.rstrip()}. Respond with text.")
    chat.end_turn()

    # User turn carries only the audio loaded from the stored file path
    chat.new_turn("user")
    wav, sampling_rate = torchaudio.load(user_audio)
    chat.add_audio(wav, sampling_rate)
    chat.end_turn()

    # Open the assistant turn that generation will fill in
    chat.new_turn("assistant")

    print(f"Ground truth transcript:\n{sample['user_text']}\n")
    print("Model response:")

    # Generate text: print only generated items that hold exactly one element
    for t in model.generate_sequential(**chat, max_new_tokens=512):
        if t.numel() == 1:
            print(processor.text.decode(t), end="", flush=True)
    print("\n==================\n")


Ground truth transcript:
Is it cold in Granada right now?

Model reponse:
I'm sorry, I don't have access to a tool to answer this question. However, I can help you with other types of information. For example, I can provide you with the weather forecast for Granada. Would you like me to check the current weather in Granada?<|im_end|>

Ground truth transcript:
Give me the current weather conditions in Miami.

Model reponse:
<tool_call>weather.get_forecast</tool_call><arguments><city>Miami</city></arguments></tool_call><|im_end|>

Ground truth transcript:
Is it foggy in Melbourne right now?

Model reponse:
I don't have real-time data, but you can check the current weather in Melbourne using a weather app or website. Would you like a weather app recommendation?<|im_end|>

Ground truth transcript:
Is it foggy in Geneva right now?

Model reponse:
I don't have real-time data to check current weather conditions, including fog in Geneva. You might want to check a weather app or website for the