### Build

```
!pip install -q pytube
!pip install -q git+https://github.com/openai/whisper.git
!pip install -r reqs.txt

```


### Setup

In [None]:
import torch
from pathlib import Path
from pytube import YouTube

import whisper
from whisper.utils import get_writer

In [None]:
# User-tunable options for the transcription run.
# NOTE(review): `input_format` and `download` are not read by any cell shown
# here -- confirm where (or whether) they are consumed.
input_format: str = 'local'
file: str = "sample_audio.mp4"

# Output selection: one flag per file format to produce.
plain: bool = True  # save the transcribed file
srt: bool = True
tsv: bool = True
vtt: bool = True

download: bool = True  # download the transcribed files


### Configuration

In [None]:
# Toggle for the extra progress messages printed by this notebook.
DEBUG: bool = False

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if DEBUG:
    print(f'Executing on : {device}')

# Instantiate the Whisper model (English version, 'medium.en') and move it
# onto the selected device.
model = whisper.load_model('medium.en').to(device)

### Transcribe the audio file

In [None]:
from pathlib import Path


def transcribe_audio_file(model, file, plain, srt, vtt, tsv):
    """Transcribe an audio file with Whisper and save the requested outputs.

    Every output file is written next to the source file and shares its base
    name (``talk.mp4`` -> ``talk.txt``, ``talk.srt``, ...).

    Args:
        model: Loaded Whisper model; only its ``transcribe`` method is used.
        file: Path of the audio file to transcribe.
        plain: If true, write the transcript text to a ``.txt`` file.
        srt: If true, write an ``.srt`` file through Whisper's writer.
        vtt: If true, write a ``.vtt`` file through Whisper's writer.
        tsv: If true, write a ``.tsv`` file through Whisper's writer.

    Returns:
        The result returned by ``model.transcribe``; its ``'text'`` entry
        holds the plain transcript.
    """
    file_path = Path(file)
    if DEBUG:
        print(f'Start of transcription of : {file_path}')

    # Save the transcribed files into the source directory.
    output_dir = file_path.parent

    # Run Whisper.
    transcript = model.transcribe(file, verbose=False, language='en')

    # Check the remaining arguments to customize what gets saved.
    if plain:
        txt_path = file_path.with_suffix('.txt')
        print(f'Creating a plain text file : {txt_path}')
        with open(txt_path, 'w', encoding='utf-8') as txt:
            txt.write(transcript['text'])

    # (enabled flag, Whisper writer format, progress message)
    writer_outputs = (
        (srt, 'srt', 'Creating srt file '),
        (vtt, 'vtt', 'Creating VTT file'),
        (tsv, 'tsv', 'Creating a tsv file'),
    )
    for enabled, output_format, message in writer_outputs:
        if not enabled:
            continue
        print(message)
        writer = get_writer(output_format, str(output_dir))
        # Hand the writer the full file name rather than the stem: Whisper's
        # writer strips the extension itself, so passing an already-stripped
        # stem would cut a dotted name twice ("a.b.mp4" -> "a.srt") and no
        # longer match the ".txt" file written above.
        writer(transcript, file_path.name)

    return transcript
        
    

In [None]:
# Run the transcription; the output flags are passed by keyword so each one
# is visibly bound to the parameter of the same name.
transcript = transcribe_audio_file(
    model,
    file,
    plain=plain,
    srt=srt,
    vtt=vtt,
    tsv=tsv,
)

In [None]:
# Evaluate the plain transcript text as the last expression of the cell.
transcript['text']