<a target="_blank" href="https://colab.research.google.com/github/sudarshan-koirala/langchain-falcon-chainlit/blob/main/langchain_falcon.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [4]:
# USE CASE:  https://huggingface.co/learn/audio-course/chapter7/voice-assistant

%%capture
%pip install langchain huggingface_hub watermark
%pip install transformers datasets soundfile speechbrain accelerate
%pip install sentencepiece
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

#!pip show transformers
#!pip show datasets
#!pip show sentencepiece
#!pip show torch

# In Codespaces - you have to install it from TERMINAL with following
# sudo apt update
# sudo apt install ffmpeg
# confirm installed successfully :  ffmpeg -version

# https://ffmpeg.org/download.html
# https://ffmpeg.org/download.html#build-windows

# For anyone encountering this issue, I found a solution. 
# ERROR:  ValueError: ffmpeg was not found but is required to stream audio files from filename
# Apparently python ffmpeg-python package doesn’t install ffmpeg itself,
# so you have to install it manually on --> your machine Download FFmpeg 
# https://phoenixnap.com/kb/ffmpeg-windows

In [5]:
#%load_ext watermark
%reload_ext watermark
%watermark -a "Falcon FutureTech Maverick Team" -vmp langchain,huggingface_hub

Author: Falcon FutureTech Maverick Team

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.16.1

langchain      : 0.0.324
huggingface_hub: 0.17.3

Compiler    : GCC 9.4.0
OS          : Linux
Release     : 6.2.0-1015-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [6]:
# get your Huggingface access token from https://huggingface.co/settings/tokens 🔑
from getpass import getpass
import os

HUGGINGFACE_API_TOKEN = getpass()
os.environ["HUGGINGFACE_API_TOKEN"] = HUGGINGFACE_API_TOKEN   

#### Let's use falcon-7b-instruct model from [Huggingface website](https://huggingface.co/tiiuae/falcon-7b-instruct)

In [7]:
from langchain import HuggingFaceHub

repo_id = "tiiuae/falcon-7b-instruct"
llm = HuggingFaceHub(huggingfacehub_api_token=HUGGINGFACE_API_TOKEN, 
                     repo_id=repo_id, 
                     model_kwargs={"temperature":0.7, "max_new_tokens":700})

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from transformers import pipeline
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"

classifier = pipeline(
    "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
)

In [9]:
print(classifier.model.config.id2label)

{0: 'backward', 1: 'follow', 2: 'five', 3: 'bed', 4: 'zero', 5: 'on', 6: 'learn', 7: 'two', 8: 'house', 9: 'tree', 10: 'dog', 11: 'stop', 12: 'seven', 13: 'eight', 14: 'down', 15: 'six', 16: 'forward', 17: 'cat', 18: 'right', 19: 'visual', 20: 'four', 21: 'wow', 22: 'no', 23: 'nine', 24: 'off', 25: 'three', 26: 'left', 27: 'marvin', 28: 'yes', 29: 'up', 30: 'sheila', 31: 'happy', 32: 'bird', 33: 'go', 34: 'one'}


In [10]:
#Confirm the WAKE WORD = 'marvin' is present within this dataset
print(classifier.model.config.id2label[27])

marvin


In [11]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

In [12]:
def launch_fn(
    wake_word="marvin",
    prob_threshold=0.5,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
):
    if wake_word not in classifier.model.config.label2id.keys():
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
        )

    sampling_rate = classifier.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for wake word...")
    for prediction in classifier(mic):
        prediction = prediction[0]
        if debug:
            print(prediction)
        if prediction["label"] == wake_word:
            if prediction["score"] > prob_threshold:
                return True

In [14]:
launch_fn(debug=True)


# Listening for wake word...
# {'score': 0.055326107889413834, 'label': 'one'}
# {'score': 0.05999856814742088, 'label': 'off'}
# {'score': 0.1282748430967331,  'label': 'five'}
# {'score': 0.07310110330581665, 'label': 'follow'}
# {'score': 0.06634809821844101, 'label': 'follow'}
# {'score': 0.05992642417550087, 'label': 'tree'}
# {'score': 0.05992642417550087, 'label': 'tree'}
# {'score': 0.999913215637207, 'label': 'marvin'}  ---> WAKE WORD


# As soon as we say the wake word, the model predicts "marvin" with probability close to 1 and terminates the loop, 
# signalling that the wake word has been detected and that the ASR system should be activated

Listening for wake word...


ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default


In [15]:
transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# As we’re transcribing the speech, we also need to have an idea of when the user stops speaking
# so that we can terminate the recording

# For simplicity, we’ll terminate our microphone recording after 5 secs

import sys


def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
    sampling_rate = transcriber.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Start speaking...")
    for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
        sys.stdout.write("\033[K")
        print(item["text"], end="\r")
        if not item["partial"][0]:
            break

    return item["text"]

In [17]:
transcribe()

Start speaking...


ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default


UnboundLocalError: local variable 'item' referenced before assignment

In [18]:
# Now that we have our spoken query transcribed,  we want to generate a meaningful response.
# We’ll use an LLM hosted on the Cloud and use the Inference API to easily query the model.

# We’ll search by “instruct” to filter by models that have been instruction fine-tuned

# The Inference API allows us to send a HTTP request from our local machine to the LLM hosted on the Hub
# and returns the response as a json file

from huggingface_hub import HfFolder
import requests


def query(text, model_id="tiiuae/falcon-7b-instruct"):
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
    payload = {"inputs": text}

    print(f"Querying...: {text}")
    response = requests.post(api_url, headers=headers, json=payload)
    return response.json()[0]["generated_text"][len(text) + 1 :]

In [19]:
query("What does Hugging Face do?")

Querying...: What does Hugging Face do?


KeyError: 0

In [20]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

checkpoint="microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

In [21]:
from datasets import load_dataset

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

In [22]:
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()

In [23]:
from IPython.display import Audio

audio = synthesise(
    "Hugging Face is a company that provides natural language processing and machine learning tools for developers."
)

Audio(audio, rate=16000)