<a target="_blank" href="https://colab.research.google.com/github/sudarshan-koirala/langchain-falcon-chainlit/blob/main/langchain_falcon.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
# USE CASE (the two parts below are NOT LINKED to each other yet):
#   TEXT  (YouTube example: https://www.youtube.com/watch?v=gnyUUY8X-G4)
# + AUDIO (https://huggingface.co/learn/audio-course/chapter6/pre-trained_models)

In [2]:
%%capture
# Install everything the notebook needs; %%capture hides pip's output.
# NOTE(review): none of these packages are version-pinned. The %watermark output
# further down records langchain 0.0.324 and huggingface_hub 0.17.3 for the last
# run -- consider pinning to those versions for reproducibility.

# LLM part: LangChain + Hugging Face Hub client, plus the watermark extension.
%pip install langchain huggingface_hub watermark
# Text-to-speech part: model, dataset and audio libraries.
%pip install transformers datasets soundfile speechbrain accelerate
%pip install sentencepiece
# PyTorch wheels are fetched from the PyTorch wheel index under the cu118 path
# (presumably the CUDA 11.8 builds -- confirm this matches the target runtime).
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [3]:

%load_ext watermark
%watermark -a "b oldright" -vmp langchain,huggingface_hub

Author: b oldright

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.16.1

langchain      : 0.0.324
huggingface_hub: 0.17.3

Compiler    : GCC 9.4.0
OS          : Linux
Release     : 6.2.0-1015-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [4]:
# Get your Hugging Face access token from https://huggingface.co/settings/tokens 🔑
from getpass import getpass
import os

# Prompt without echoing, so the token never ends up in the notebook source or
# in a saved cell output. Surrounding whitespace from copy/paste is dropped.
HUGGINGFACE_API_TOKEN = getpass("Hugging Face API token: ").strip()

# Fail here, with a clear message, instead of later inside the first model call.
if not HUGGINGFACE_API_TOKEN:
    raise ValueError("No Hugging Face API token was entered.")

# Keep the variable name this notebook originally exported, and additionally
# export HUGGINGFACEHUB_API_TOKEN, which is the name LangChain's Hugging Face Hub
# integration looks up when no token is passed explicitly.
os.environ["HUGGINGFACE_API_TOKEN"] = HUGGINGFACE_API_TOKEN
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACE_API_TOKEN

#### Let's use the falcon-7b-instruct model from the [Hugging Face website](https://huggingface.co/tiiuae/falcon-7b-instruct)

In [5]:
from langchain import HuggingFaceHub

# LangChain LLM wrapper for the Hub model repository named by `repo_id`,
# authenticated with the token collected earlier in the notebook.
repo_id = "tiiuae/falcon-7b-instruct"
llm = HuggingFaceHub(
    repo_id=repo_id,
    huggingfacehub_api_token=HUGGINGFACE_API_TOKEN,
    # Generation settings forwarded to the model.
    model_kwargs=dict(temperature=0.7, max_new_tokens=700),
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from langchain import LLMChain, PromptTemplate

question = "How to cook Pizza ?"

# Prompt text; `{question}` is the only placeholder and is filled in at run time.
template = """
You are a helpful AI assistant and provide the answer for the question asked politely.

{question}
Answer: Let's think step by step.
"""

# Bind the template to the LLM so that a single call formats the prompt
# and sends it to the model.
prompt = PromptTemplate(input_variables=["question"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

print(llm_chain.run(question))


1. Preheat your oven at the desired temperature for the specific type of pizza you want to make.
2. Roll out your dough on a baking sheet or pizza stone.
3. Spread your favorite pizza sauce on the dough.
4. Add your desired amount of cheese.
5. Place your toppings on the pizza and bake for the amount of time recommended on the recipe.
6. Remove the pizza from the oven and let it cool before slicing. 
7. Enjoy your delicious pizza!

I hope this helps!


In [7]:
# VOICE ASSISTANT  https://huggingface.co/learn/audio-course/chapter7/voice-assistant

# Voice assistants are constantly listening to the audio inputs coming through your device’s microphone, 
# however they only boot into action when a particular ‘wake word’ or ‘trigger word’ is spoken.

# WAKE WORD DETECTION
# Wake word detection can run continuously on your device without draining your battery.
# Only when the wake word is detected is the larger speech recognition model launched; afterwards it is shut down again.

# Speech transcription
# goal -> transcribe the spoken query to text

# Transferring audio files is slow (they are large by nature),
# so we transcribe them directly on-device using an automatic speech recognition (ASR) model.

In [8]:
# From text to audio (text-to-speech) -> https://huggingface.co/learn/audio-course/chapter6/pre-trained_models

# Requires sentencepiece: https://github.com/google/sentencepiece#installation

# Requires PyTorch: https://pytorch.org/get-started/locally/


In [9]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

In [10]:
# The text-to-speech model and its processor are loaded from the same checkpoint.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

In [11]:
# Turn the sentence to synthesize into model inputs, returned as PyTorch tensors.
inputs = processor(
    text="Don't count the days, make the days count.",
    return_tensors="pt",
)

print(inputs)

{'input_ids': tensor([[ 4, 51,  8,  9, 31,  6,  4, 17,  8, 16,  9,  6,  4,  6, 11,  5,  4, 14,
          7, 22, 12, 23,  4, 18,  7, 28,  5,  4,  6, 11,  5,  4, 14,  7, 22, 12,
          4, 17,  8, 16,  9,  6, 26,  2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [12]:
# Speech generation below also takes a speaker embedding, which describes the voice to use.
# Let's load one from a dataset on the Hub.

# the embeddings were obtained from the CMU ARCTIC dataset: 
# http://www.festvox.org/cmu_arctic/

# using this script: 
# https://huggingface.co/mechanicalsea/speecht5-vc/blob/main/manifest/utils/prep_cmu_arctic_spkemb.py

# but any X-Vector embedding should work.

In [13]:
from datasets import load_dataset

In [14]:
# Speaker x-vectors published on the Hub; only the "validation" split is loaded.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)

In [15]:
import torch

In [16]:
# Take the x-vector stored at index 7306 and add a leading batch axis.
# The resulting speaker embedding is a tensor of shape (1, 512);
# this particular one describes a female voice.
speaker_embeddings = (
    torch.tensor(embeddings_dataset[7306]["xvector"])
    .unsqueeze(dim=0)
)

In [17]:
# Show the token ids that are passed to the text-to-speech model in the next step.
print(inputs["input_ids"])

tensor([[ 4, 51,  8,  9, 31,  6,  4, 17,  8, 16,  9,  6,  4,  6, 11,  5,  4, 14,
          7, 22, 12, 23,  4, 18,  7, 28,  5,  4,  6, 11,  5,  4, 14,  7, 22, 12,
          4, 17,  8, 16,  9,  6, 26,  2]])


In [18]:
# The token ids plus the speaker embedding are already enough to generate
# a log mel spectrogram as output (no vocoder is passed at this stage):
spectrogram = model.generate_speech(
    input_ids=inputs["input_ids"],
    speaker_embeddings=speaker_embeddings,
)

In [19]:
print(spectrogram)

# This prints a tensor of shape (140, 80) containing a log mel spectrogram.

# The values may vary between runs, because the speech decoder pre-net always applies dropout to the input sequence.

# This adds a bit of random variability to the generated speech.

# To generate a speech waveform instead, we need to specify a vocoder for the spectrogram-to-waveform conversion.

# Conveniently, 🤗 Transformers offers a vocoder based on HiFi-GAN.

# HiFi-GAN is a state-of-the-art generative adversarial network (GAN) designed for high-fidelity speech synthesis.
# It is capable of generating high-quality and realistic audio waveforms from spectrogram inputs.

tensor([[-2.8329, -2.9587, -3.0249,  ..., -3.7916, -3.8605, -4.0504],
        [-2.5067, -2.5776, -2.5873,  ..., -3.1942, -3.2412, -3.5121],
        [-1.8268, -1.8407, -1.7862,  ..., -2.1257, -2.2515, -2.4722],
        ...,
        [-2.6046, -2.7507, -2.8092,  ..., -3.7533, -3.8078, -4.0627],
        [-2.6471, -2.7802, -2.8423,  ..., -3.7545, -3.8094, -4.0590],
        [-2.6569, -2.7983, -2.8528,  ..., -3.7586, -3.8062, -4.0525]])


In [20]:
from transformers import SpeechT5HifiGan

# Load the HiFi-GAN vocoder and pass it to generate_speech together with the
# same token ids and speaker embedding used for the spectrogram above.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
speech = model.generate_speech(
    input_ids=inputs["input_ids"],
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [21]:
from IPython.display import Audio

# Render an inline audio player for the generated speech, played back at a rate of 16000.
Audio(data=speech, rate=16000)