### Speech Recognition (Speed of Videos)

In [7]:
# Most imports are for testing purposes only.
import os
import shutil
import cv2
import re
import speech_recognition as sr
import moviepy.editor as mp
from pathlib import Path
from pydub import AudioSegment
from pydub.utils import make_chunks


#### If you have the video script and length of video...
Take total amount of words and divide by video time. \
This is really obvious, but I'm just writing this in case someone from the YouTube scraping team have these values already

In [8]:
# input: string_script: the subtitles of a video as a string
# input: video length: a string in HH:MM:SS format (hours, minutes, seconds)
# not sure if this is the output from the YouTube API. If not, should be a quick fix.

# output: number of words per second

def string_speed(string_script, video_length):
    # 3600 seconds in an hour, 60 seconds in a minute, 1 second in a second
    second_converter = [3600, 60, 1]
    time_in_sec = 0
    for i in range(3):
        time_in_sec += (second_converter[i] * int(video_length.split(":")[i]))

    num_words = len(string_script.split())

    print("The number of words per second is", num_words / time_in_sec)

    return num_words / time_in_sec


#### If you only have the video file (and perhaps not even the subtitles)...
This program assumes that you have the video in question locally.

In [9]:
# getting video path
current = %pwd
from pathlib import Path
path = Path(current)
filename = str(path.parent.absolute())
video_name = "Binging with Babish_ Patrick's Briefcase from SpongeBob SquarePants.mp4"
filename += '\\' + video_name
filename

"c:\\Users\\brian\\Desktop\\PYUM\\P-YUM\\Binging with Babish_ Patrick's Briefcase from SpongeBob SquarePants.mp4"

In [10]:
# input: the video file
# output: length of video file in seconds

def find_video_length(filename):

    video = cv2.VideoCapture(filename)

    fps = video.get(cv2.CAP_PROP_FPS)
    frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)

    return frame_count / fps


In [11]:
# input: video file (string)
# output: audio file (audio from video)
# audio file named "converted.wav"
# audio file MUST BE LOSSLESS
def vid_to_audio(filename):
    clip = mp.VideoFileClip(filename)
    mp3_clip = clip.audio.write_audiofile(r"converted.wav")


vid_to_audio(filename)

MoviePy - Writing audio in converted.wav


                                                                      

MoviePy - Done.




In [12]:
# we must split the videos into smaller chunks because the voice
# recognizer feature doesn't work for files that are larger than
# 10 megabytes.
# input: name of audio file
# output: a new folder called "chunked" contains 30 second clips of video
def process_sudio(file_name):
    myaudio = AudioSegment.from_file(file_name, "wav")
    chunk_length_ms = 30000  # in milliseconds
    chunks = make_chunks(myaudio, chunk_length_ms)  # Make chunks of one sec
    for i, chunk in enumerate(chunks):
        chunk_name = './chunked/' + file_name + "_{0}.wav".format(i)
        chunk.export(chunk_name, format="wav")


all_file_names = os.listdir()
try:
    os.makedirs('chunked')
except:
    pass
for each_file in all_file_names:
    if ('.wav' in each_file):
        process_sudio(each_file)
os.remove("converted.wav")  # delete the original wav file.


In [13]:
# define recognizer
r = sr.Recognizer()


In [14]:
# actually converts the audio file into text
# input: wav audio file
# output: the speech in string format
def speech_converter(wav_file):
    wav_name = './chunked/' + wav_file
    audio = sr.AudioFile(wav_name)
    
    with audio as source:
        audio_file = r.record(source)
    result = r.recognize_google(audio_file)
    return result



In [15]:
list_of_chunks_words = []

path = "./"
AllFiles = list(os.walk(path))
counter = 0
for item in AllFiles:
    foldername, LoDirs, LoFiles = item   # cool unpacking!

    for filename_thing in LoFiles:
        if filename_thing[-3:] == "wav":
            counter += 1
            try:
                thirty_sec = speech_converter(filename_thing)
            except:
                print("""Either speech isn't being recognized (for this chunk) or the speech recognition doesn't like your request.""")
            list_of_chunks_words.append(thirty_sec)



Either speech isn't being recognized (for this chunk) or the speech recognition doesn't like your request.


In [16]:
entire_script = ''
for chunks in list_of_chunks_words:
    entire_script += " " + chunks

with open('final_output.txt', mode='w') as script_file:
    script_file.write(entire_script)


In [17]:
entire_script

" this episode is brought to you by cash app when personal finance connects you to both your funds and stuff that matters that's money and that's cash app you know what else has money cracking two eggs at once perfectly on the first try using a torch to overcome your fear of bananas and of course packing a healthy lunch for work that's money that's cash app download cash app from the app store or Google Play Store today to add your cash tag to the 80 million and Counting real bad thing by the way you forgot your briefcase real bad thing by the way you forgot your briefcase traps to accommodate our overtime lunch for which going to need a whole lot of donuts and since we want lots of variety I think cake Donuts our best bet for basic vanilla donut with starting up by combining 240 G of all-purpose flour with 200 g of granulated sugar 100 G of brown sugar whisk together the dry and so to the wet underneath middle of whole milk 75 of vegetable oil 1 tbsp of vanilla paste and two large egg

In [18]:
num_of_words = len(entire_script.split())
length_of_vid = find_video_length(filename)
print(f'Your video had around {length_of_vid / num_of_words} words per second')

Your video had around 0.342894578313253 words per second


In [19]:
# deletes the wav files

def clear_wav():
    try:
        shutil.rmtree("chunked")
    except:
        print("No chunked directory found")

clear_wav()