In [1]:
try:
    from pydub import AudioSegment
except ImportError:
    %pip install pydub
    %pip install pydub[extras]
    from pydub import AudioSegment
    from pydub.playback import play


In [2]:
from IPython.display import Audio
# Location of the recorded question clip, relative to the notebook's working directory.
audio_path = '../datasets/audio/Prime-minister.m4a'
# Read the clip with pydub, telling it explicitly that the container is m4a.
audio = AudioSegment.from_file(audio_path, format="m4a")
# Bare last expression: the loaded segment becomes this cell's output.
audio

In [3]:
try:    
    import whisper
except ImportError:
    %pip install openai-whisper
    import whisper

In [4]:
# Load the "base.en" Whisper checkpoint once; both transcription cells below reuse this object.
whisper_model = whisper.load_model("base.en")


In [5]:
# Transcribe the clip at audio_path; with verbose=False only the progress bar below is shown.
# NOTE(review): fp16=True asks for half-precision inference - confirm the runtime supports it.
transcription = whisper_model.transcribe(audio_path, fp16=True, verbose=False)

100%|██████████| 347/347 [00:00<00:00, 367.83frames/s]


In [6]:
transcription

{'text': ' Who is the Prime Minister of India?',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 3.0,
   'text': ' Who is the Prime Minister of India?',
   'tokens': [50363, 5338, 318, 262, 5537, 4139, 286, 3794, 30, 50513],
   'temperature': 0.0,
   'avg_logprob': -0.34697675704956055,
   'compression_ratio': 0.813953488372093,
   'no_speech_prob': 0.005249415524303913}],
 'language': 'en'}

In [7]:
from IPython.display import Audio

In [8]:
try:
    from gtts import gTTS
except ImportError:
    %pip install gtts
    from gtts import gTTS

In [9]:
def speak(text, file):
    """Synthesize English speech for ``text``, store it at ``file``, and return a player.

    Args:
        text: The text to be spoken.
        file: Destination path of the audio file; it is opened in binary
            write mode, so an existing file is overwritten.

    Returns:
        An ``IPython.display.Audio`` object created from ``file``.
    """
    synthesizer = gTTS(text, lang='en')
    with open(file, 'wb') as audio_out:
        # Stream the synthesized audio straight into the open file handle.
        synthesizer.write_to_fp(audio_out)
    player = Audio(file)
    return player

In [10]:
# Read the transcribed question back as speech and store it as pm-2.mp3.
speak(transcription['text'], '../datasets/audio/pm-2.mp3')

In [12]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 

In [13]:
# llama2 served through Ollama. The stdout streaming handler is what makes the reply
# appear in the cell output (the print of the reply inside answers() is commented out).
llm = Ollama(model="llama2", 
             callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))

In [16]:
def answers(llm, prompt_qs, prompts, text):
    """Run every prompt through ``llm`` and collect all of the responses.

    Args:
        llm: Callable invoked as ``llm(prompt, temperature=0.5)`` that returns
            the model's response.
        prompt_qs: Instruction strings; each one is printed as a heading
            before its prompt is sent.
        prompts: Full prompt strings, paired positionally with ``prompt_qs``.
        text: Unused. Kept in the signature so existing call sites keep working.

    Returns:
        list: One response per (instruction, prompt) pair, in input order.
        An empty list when there is nothing to ask.
    """
    outputs = []
    # Use distinct loop names so the ``prompt_qs`` argument is not shadowed.
    # zip() stops at the shorter of the two sequences.
    for question, prompt in zip(prompt_qs, prompts):
        print(question, end="\n")
        output = llm(prompt, temperature=0.5)
        print("\n" + "=="*50, end="\n\n")
        # Append inside the loop: every response is kept, not only the last one.
        outputs.append(output)
    return outputs

In [17]:
# Instructions to prepend; each one is joined to the transcribed question with ":".
prompt_qs = ["Please be concise."] 
prompts = [q + ":"+ transcription["text"] for q in prompt_qs]

# Send the prompts to the LLM and keep whatever answers() returns.
outputs = answers(llm, prompt_qs, prompts, transcription["text"])

Please be concise.

The Prime Minister of India is Narendra Modi.



In [18]:
# Drop newlines from the first reply, then convert it to speech and save it as pm-answer.mp3.
speak(outputs[0].replace("\n", ""), '../datasets/audio/pm-answer.mp3')

References

1. [Whisper](https://blog.devgenius.io/transcribing-youtube-videos-using-openais-whisper-%EF%B8%8F-%EF%B8%8F-a29d264d6fb1)
2. [LangChain and Llama](https://www.youtube.com/watch?v=k_1pOF1mj8k)
3. [English to Hindi using Transformers](https://prateekjoshi.medium.com/english-to-hindi-translation-made-simple-with-transformers-library-33f64f745552)

In [1]:
from IPython.display import YouTubeVideo
# YouTubeVideo takes the bare video id and builds the embed URL itself;
# passing a full watch URL produces a broken embed.
YouTubeVideo('CuBzyh4Xmvk', width=500, height=300)

In [54]:
try:
    import yt_dlp
except ImportError:
    %pip install yt_dlp
    import yt_dlp

In [55]:
def download(video_id: str, save_path: str, filename: str = "lecture.m4a") -> str:
    """Download the audio track of a YouTube video as an m4a file.

    Args:
        video_id: The YouTube video id (the ``v=`` value of a watch URL).
        save_path: Directory the file is written into.
        filename: Name of the output file inside ``save_path``. Defaults to
            ``"lecture.m4a"``, the name that was previously hard-coded.

    Returns:
        ``save_path`` unchanged, i.e. the target directory rather than the
        path of the downloaded file.

    Raises:
        RuntimeError: If yt-dlp reports a non-zero error code.
    """
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    ydl_opts = {
        # Prefer a native m4a stream, otherwise fall back to the best audio available.
        'format': 'm4a/bestaudio/best',
        'paths': {'home': save_path},
        'outtmpl': {'default': filename},
        # Post-process with FFmpeg, asking for m4a as the audio codec.
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }]
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([video_url])
        if error_code != 0:
            # RuntimeError subclasses Exception, so existing handlers still catch it.
            raise RuntimeError('Failed to download video')

    return save_path

In [56]:
# Fetch the audio for video id 'CuBzyh4Xmvk' into the shared audio directory.
download('CuBzyh4Xmvk', '../datasets/audio/')

[youtube] Extracting URL: https://www.youtube.com/watch?v=CuBzyh4Xmvk
[youtube] CuBzyh4Xmvk: Downloading webpage
[youtube] CuBzyh4Xmvk: Downloading ios player API JSON
[youtube] CuBzyh4Xmvk: Downloading android player API JSON
[youtube] CuBzyh4Xmvk: Downloading m3u8 information
[info] CuBzyh4Xmvk: Downloading 1 format(s): 140
[download] ../datasets/audio/lecture.m4a has already been downloaded
[download] 100% of   72.26MiB
[ExtractAudio] Not converting audio ../datasets/audio/lecture.m4a; file is already in target format m4a


'../datasets/audio/'

In [19]:
# Point at the downloaded lecture; this rebinds audio_path and audio from the earlier question clip.
audio_path = '../datasets/audio/lecture.m4a'
audio = AudioSegment.from_file(audio_path, format="m4a")

In [20]:
audio[:13000]

In [21]:
# Transcribe the whole lecture; this rebinds `transcription`, replacing the short-clip result.
# NOTE(review): the path literal duplicates the value already assigned to audio_path above.
transcription = whisper_model.transcribe("../datasets/audio/lecture.m4a", fp16=True, verbose=False)

 99%|█████████▉| 465481/468481 [02:07<00:00, 3643.86frames/s]


In [22]:
# Preview the first 500 characters, replacing each ". " with a newline so sentences land on separate lines.
print(transcription["text"][:500].replace(". ", "\n"))

 Please look at the code mentioned above and please sign up on the Google Cloud
We've already started making some announcements
You will likely end up missing the announcements and you'll have no one else to play with
The second quick logistical announcement is that we'll have an extra lecture on Saturday, 11th Jan at 11am in 1.101
So a lot of ones over there
And I think one or two people still have conflict, but in the larger, in the larger phone we'll have almost everyone available, so we


In [23]:
transcription.keys()

dict_keys(['text', 'segments', 'language'])

In [24]:
def create_srt_from_transcription(transcription_objects, srt_file_path):
    """Write the segments of a transcription to ``srt_file_path`` in SRT format.

    Each segment becomes one cue: a 1-based index line, a
    ``start --> end`` timestamp line, the segment text, and a blank line.

    Args:
        transcription_objects: Mapping with a ``'segments'`` entry; each
            segment provides ``'start'``, ``'end'`` and ``'text'``.
        srt_file_path: Path of the subtitle file to create. An existing file
            is overwritten.
    """
    # Explicit UTF-8 so non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(srt_file_path, 'w', encoding='utf-8') as srt_file:
        # SRT cue numbers start at 1.
        for index, entry in enumerate(transcription_objects['segments'], start=1):
            # Convert time to SRT format
            start_time_str = format_time(entry['start'])
            end_time_str = format_time(entry['end'])
            # Drop surrounding whitespace so cue text does not begin with a stray space.
            text = entry['text'].strip()

            # Write entry to SRT file
            srt_file.write(f"{index}\n")
            srt_file.write(f"{start_time_str} --> {end_time_str}\n")
            srt_file.write(f"{text}\n\n")

def format_time(time_seconds):
    """Convert a duration in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        time_seconds: Offset from the start of the media, in seconds
            (int or float).

    Returns:
        str: The timestamp with zero-padded hours, minutes, seconds and
        milliseconds, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Work in whole milliseconds so the fractional part is kept, and so that
    # rounding up (e.g. 59.9996 s) carries into the seconds and minutes fields.
    total_ms = int(round(time_seconds * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


In [25]:
# Write the lecture's segments out as an SRT subtitle file next to the audio.
create_srt_from_transcription(transcription, "../datasets/audio/lecture.srt")

In [26]:
!head ../datasets/audio/lecture.srt

1
00:00:00,000 --> 00:00:05,000
 Please look at the code mentioned above and please sign up on the Google Cloud.

2
00:00:05,000 --> 00:00:08,000
 We've already started making some announcements.

3
00:00:08,000 --> 00:00:14,000


In [27]:
# Synthesize only the first 1300 characters of the lecture transcript and save them as hello.mp3.
speak(transcription['text'][:1300], '../datasets/audio/hello.mp3')

In [28]:
try:
    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
except:
    %pip install transformers -U -q
    %pip install sentencepiece
    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
    

In [67]:

# Load the pretrained mBART-50 one-to-many translation checkpoint by its hub name.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")

# Load the matching tokenizer, with English (en_XX) set as the source language.
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")


In [68]:
# Split the first 500 characters on ". " to get sentence-sized pieces for translation.
# The 500-character slice can cut the last piece mid-sentence, and the separator is dropped.
text_to_translate = transcription["text"][:500].split(". ")
text_to_translate

[' Please look at the code mentioned above and please sign up on the Google Cloud',
 "We've already started making some announcements",
 "You will likely end up missing the announcements and you'll have no one else to play with",
 "The second quick logistical announcement is that we'll have an extra lecture on Saturday, 11th Jan at 11am in 1.101",
 'So a lot of ones over there',
 "And I think one or two people still have conflict, but in the larger, in the larger phone we'll have almost everyone available, so we"]

In [69]:
# Tokenize all pieces as one batch of PyTorch tensors, with padding and truncation enabled.
model_inputs = tokenizer(text_to_translate, return_tensors="pt", padding=True, truncation=True)

In [70]:
# Force generation to begin with the Hindi (hi_IN) language token so the output is Hindi.
generated_tokens = model.generate(
    **model_inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
)

# Turn the generated token ids back into strings, leaving out special tokens.
translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)



In [71]:
translation

['कृपया उपर्युक्त कोड को देखें और कृपया Google क्लाउड पर साइन अप करें',
 'हम पहले से ही कुछ घोषणाएं करने शुरू कर दी हैं',
 'आप शायद अंत में घोषणाओं को खो देंगे और आप के साथ खेलने के लिए कोई अन्य नहीं होगा',
 'दूसरा त्वरित लॉजिस्टिक घोषणा यह है कि हम एक अतिरिक्त व्याख्यान Saturday, 11th Jan 11am में 1.101 में होगा',
 'तो वहाँ के बहुत से',
 'और मुझे लगता है कि एक या दो लोग अभी भी संघर्ष है, लेकिन बड़ी, बड़ी फोन में हम लगभग सभी उपलब्ध हो जाएगा, तो हम']

In [31]:
# Switch to the mistral model (rebinds llm from the earlier llama2 instance); replies still stream to stdout.
llm = Ollama(model="mistral", 
             callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))
# Instructions to run against the lecture transcript, one LLM call per entry.
prompt_qs = ["Please provide a bullet-point summary for the given text:",
             "Highlight the important topics and subtopics in the given lecture:",
             "Give us some question for a quiz based on the following text:",
             "Summarize the following text in Hindi in 10 lines or less:",
            ]

# Each prompt is the instruction, a blank line, then the entire transcript text.
# NOTE(review): the full transcript is sent each time - confirm it fits the model's context window.
prompts = [q + "\n\n" + transcription["text"] for q in prompt_qs]

outputs = answers(llm, prompt_qs, prompts, transcription["text"])

Please provide a bullet-point summary for the given text:
 * The text discusses a machine learning course and announces several logistical matters, including signing up for Google Cloud, an extra lecture on Saturday, and providing access to Google Docs for FAQ and project questions.
* The definition of machine learning is discussed, with the ability to learn without explicit programming being highlighted.
* A task to recognize digits from a dataset is introduced as an example, and rules are suggested for recognizing the digit "4".
* It is explained that traditional programming involves explicitly programming rules, while machine learning involves using data and experience to learn patterns and make predictions.
* An example of predicting tomato quality based on visual features is given, with the goal being to scale up this process in a business setting.
* The concept of precision and recall in machine learning evaluation metrics is touched upon, as well as the idea of a decision tree a