# SPEECH2TEXT

## LIBRARIES

In [1]:
#!pip install -U openai-whisper
#!pip install transformers
#!pip install torchaudio
#!pip install pytube
#!pip uninstall moviepy
#!pip install librosa

In [13]:
import torch
import whisper
import librosa
import pytube
import moviepy.editor as mp
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from IPython.display import YouTubeVideo

import warnings
warnings.filterwarnings('ignore')

## FUNCTIONS

In [3]:
def download_video(youtube_link, filename='video.mp4'):
    yt = pytube.YouTube(youtube_link);
    duration = yt.length;
    yt = yt.streams.get_highest_resolution();
    yt.download(filename = filename);
    return duration

In [4]:
def create_audio_clip(video_file_name_path, clip_start, clip_end, file_name = 'audio.mp3'):
    clip = mp.VideoFileClip(video_file_name_path).subclip(clip_start,clip_end);
    clip.audio.write_audiofile(file_name, logger = None);

In [5]:
def transcribe_audio_clip(audio_path, model, processor, skip_special_tokens, device, samplerate):
    # load audio files
    speech, _ = librosa.load(audio_path);
    # get the input features
    input_features = processor(speech, return_tensors="pt", sampling_rate = samplerate).input_features.to(device);
    # generate token ids
    predicted_ids = model.generate(input_features);
    # decode token ids to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=skip_special_tokens);
    return transcription

## TRANSCRIPTION

- load model - whisper pretrained model

In [6]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pretained_model = "openai/whisper-large"

In [7]:
# load model and processor
processor = WhisperProcessor.from_pretrained(pretained_model);
model = WhisperForConditionalGeneration.from_pretrained(pretained_model).to(device);
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language = "spanish", task = "transcribe");

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


- load input - download video

In [23]:
youtube_video_id = "rAAgMDU7ftY"
youtube_link = "https://www.youtube.com/watch?v="+youtube_video_id
maxima = download_video(youtube_link);

# Youtube
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/'+youtube_video_id+'?" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>')

- get output - extract audio clip and transcribe

In [9]:
clip_length = 30
clips = [(n, min(n+clip_length, maxima)) for n in range(0, maxima, clip_length)]
skip_special_tokens = True

In [10]:
for clip in clips:
    #extract audio clip
    create_audio_clip('video.mp4', clip[0], clip[1]);
    #transcribe audio clip
    transcription = transcribe_audio_clip('audio.mp3', model, processor, skip_special_tokens, device, 16000);
    print(transcription)

[' Los cojones me los está calentando tú. Yo a ti no te disona, ni te estoy hablando de mala manera, ni te estoy vacilando, simplemente que vas a exponer tú más que yo. Y eso no lo veo justo, porque el trabajo gran parte también lo he hecho yo. La intervención la he hecho yo, la tabla la he hecho yo, todo ha interpretado yo. Y le he mandado el correo al profesor porque no lo todos querían y manda vosotros. Todo he hecho yo. Todo que he hecho es la diapositiva.']
