In [1]:
# YoutubeCaption class
class YoutubeCaption:
    """Caption built from an SRT-formatted string.

    ``__init__`` stores the raw SRT text in ``caption``, the parsed cues in
    ``sectionlist`` and all subtitles joined by single spaces in ``text``.
    """

    def __init__(self, caption):
        """Parse ``caption`` (an SRT string) into sections and full text."""
        self.caption = caption
        self.sectionlist = self.format_sections()
        self.text = self.compile_sections()

    def __str__(self):
        """Return a preview showing the first 100 characters of the text."""
        return f'<YoutubeCaption text="{self.text[:100]}... ">'

    def format_sections(self):
        """Convert the SRT text into a list of section dicts.

        Cues are delimited by blank lines rather than assumed to be exactly
        four lines long, so a final cue without a trailing blank line is kept
        and subtitles spanning several lines stay attached to their own cue
        (their lines are joined with a single space).

        Returns:
            list of dicts with the keys ``section`` (cue number as text),
            ``time_original`` (the raw timestamp line), ``time``
            (``'start:end'`` in seconds, see ``format_time``) and ``subtitle``.

        Raises:
            ValueError: if a cue's second line is not a valid
                ``'HH:MM:SS,mmm --> HH:MM:SS,mmm'`` timestamp line.
        """
        sections = []
        cue = []  # non-blank lines of the cue currently being collected
        # The extra '' acts as a sentinel that flushes the last cue even when
        # the SRT text does not end with a blank line.
        for raw_line in self.caption.split('\n') + ['']:
            line = raw_line.rstrip('\r')  # tolerate Windows line endings
            if line.strip():
                cue.append(line)
                continue
            # Blank line: the current cue (if any) is complete. A cue needs at
            # least a number line and a timestamp line to be usable.
            if len(cue) >= 2:
                sections.append({
                    "section": cue[0],
                    "time_original": cue[1],
                    "time": self.format_time(cue[1]),
                    "subtitle": " ".join(cue[2:]),
                })
            cue = []
        return sections

    def compile_sections(self):
        """Join the subtitles of all sections into one space-separated string."""
        return " ".join(section["subtitle"] for section in self.sectionlist)

    def list_sections(self):
        """Return the list of parsed section dicts."""
        return self.sectionlist

    def full_text(self):
        """Return the complete caption text."""
        return self.text

    @classmethod
    def format_time(cls, time):
        """Return 'starttime:endtime' in seconds from the original format.

        Example input: '00:00:00,000 --> 00:00:07,000' (gives '0.0:7.0').
        """
        start_time, end_time = time.split(' --> ')
        start_seconds = cls.timestamp_to_seconds(start_time)
        end_seconds = cls.timestamp_to_seconds(end_time)
        return f'{start_seconds}:{end_seconds}'

    @classmethod
    def timestamp_to_seconds(cls, timestamp):
        """Convert an 'HH:MM:SS,mmm' timestamp into seconds as a float."""
        hh, mm, ss = timestamp.split(':')
        ss, ms = ss.split(',')
        return int(hh) * 60 * 60 + int(mm) * 60 + int(ss) + int(ms) / 1000
        

In [59]:
# WatsonCaption class
class WatsonCaption:
    """Caption built from a speech recognition result dict.

    The result is expected to carry a ``"results"`` list whose items each hold
    an ``"alternatives"`` list; only the first alternative of each is used.
    """

    def __init__(self, result):
        """Keep the raw result and derive the section list and full text."""
        self.result = result
        self.sectionlist = self.format_sections()
        self.text = self.compile_sections()

    def __str__(self):
        """Return a preview showing the first 100 characters of the text."""
        preview = self.text[:100]
        return f'<WatsonCaption text="{preview}... ">'

    @staticmethod
    def format_section(section):
        """Reduce one alternative to an srt-like ``{'time', 'subtitle'}`` dict.

        ``time`` is ``'start:end'``, taken from index 1 of the first timestamp
        entry and index 2 of the last one.
        """
        transcript = section['transcript']
        word_timings = section['timestamps']
        begins_at = word_timings[0][1]
        ends_at = word_timings[-1][2]
        return {'time': f"{begins_at}:{ends_at}", 'subtitle': transcript}

    def format_sections(self):
        """Build the list of ``{'section', 'time', 'subtitle'}`` dicts.

        Sections are numbered from 1 in the order they appear in the result.
        """
        return [
            {'section': number, **self.format_section(entry['alternatives'][0])}
            for number, entry in enumerate(self.result["results"], start=1)
        ]

    def compile_sections(self):
        """Concatenate every section's subtitle, separated by single spaces."""
        subtitles = (item["subtitle"] for item in self.sectionlist)
        return " ".join(subtitles)

    def list_sections(self):
        """Return the list of section dicts."""
        return self.sectionlist

    def full_text(self):
        """Return the complete caption text."""
        return self.text

### Extract captions from YouTube

In [7]:
# imports
import sys
from pytube import YouTube, Caption


## Helper functions

In [5]:
# Original method a.en: English (auto-generated), en: English (US), en-GB: English (UK)
# Failed case: <Caption lang="English - jamake" code="en.FmoQciUtYSc"> (https://en.jamake.io/)

def find_en_caption(yt):
    """
    Return the first English caption track of a pytube YouTube object.

    A track counts as English when "en" is one of the components of its
    ``code`` once the code is split on ".", "-" and "_". This accepts codes
    such as "en", "a.en", "en-GB" and "en.FmoQciUtYSc" while rejecting codes
    that merely contain the letters "en" inside a longer component.

    Args:
        yt: object exposing ``caption_tracks``, an iterable of tracks that
            each have a string ``code`` attribute.

    Returns:
        The first matching track, or None when there is no English track.
    """
    for track in yt.caption_tracks:
        # Normalise the separators so a single split yields every component.
        components = track.code.replace('-', '.').replace('_', '.').split('.')
        if 'en' in components:
            return track
    return None

In [6]:
def yt_find_id(yt):
    """
    Look up the video ID of a pytube YouTube object.

    The ID is read from ``yt.initial_data`` under
    ``currentVideoEndpoint -> watchEndpoint -> videoId``.
    """
    current_endpoint = yt.initial_data["currentVideoEndpoint"]
    watch_endpoint = current_endpoint["watchEndpoint"]
    return watch_endpoint["videoId"]

## Download & extract audio

In [10]:
# Download and extract audio
def download_audio(yt):
    """
    Download the first stream of a video and extract its audio as MP3.

    Files are written under ``download/<videoId>/``.

    Args:
        yt: pytube YouTube object.

    Returns:
        Path of the written MP3 file, ``download/<videoId>/<videoId>.mp3``.
    """
    # Local import keeps moviepy optional until audio extraction is requested.
    # It is unconditional: dir() inside a function lists only that function's
    # locals, so a "not in dir()" guard here would always be true anyway.
    from moviepy.editor import AudioFileClip

    videoId = yt_find_id(yt)
    path = yt.streams.first().download(output_path=f"download/{videoId}", filename=f"{videoId}")

    audio_filename = f"download/{videoId}/{videoId}.mp3"
    audioclip = AudioFileClip(path)
    try:
        audioclip.write_audiofile(audio_filename)
    finally:
        # Release the clip's reader resources even if the export fails.
        audioclip.close()
    return audio_filename

## Transcription using watson speech to text

In [26]:
def setup_watson():
    """
    Return an authenticated Watson Speech to Text client.

    Credentials come from the environment so that no secret lives in the
    notebook:

    * ``WATSON_STT_APIKEY`` (required): IAM API key.
    * ``WATSON_STT_URL`` (optional): service URL; defaults to the instance URL
      this notebook has been using.

    The client is created once and reused on later calls.

    Returns:
        A ``SpeechToTextV1`` instance with its service URL set.

    Raises:
        RuntimeError: if ``WATSON_STT_APIKEY`` is unset or empty.
    """
    # Cache on the function object. A "'speech_to_text' in dir()" check inside
    # the function would only look at the function's own locals and never hit.
    cached_client = getattr(setup_watson, "_client", None)
    if cached_client is not None:
        return cached_client

    import os

    apikey = os.environ.get("WATSON_STT_APIKEY")
    if not apikey:
        raise RuntimeError(
            "WATSON_STT_APIKEY is not set; export the Watson Speech to Text "
            "API key before calling setup_watson()"
        )
    url = os.environ.get(
        "WATSON_STT_URL",
        'https://api.eu-gb.speech-to-text.watson.cloud.ibm.com/instances/33ecdf1d-e4a5-4521-83e6-d3c3c574d9b0',
    )

    # Imported only after the credential check so a missing key is reported
    # before the SDK is loaded.
    from ibm_watson import SpeechToTextV1
    from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

    # authentication
    authenticator = IAMAuthenticator(apikey)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(url)

    setup_watson._client = speech_to_text
    return speech_to_text

In [44]:
def stt(file, sttmodel, model='en-US_NarrowbandModel', content_type="audio/mp3"):
    """
    Transcribe an audio file with word timestamps.

    Args:
        file: path of the audio file to send.
        sttmodel: client exposing ``recognize(...)`` whose response has
            ``get_result()`` (e.g. the object returned by ``setup_watson``).
        model: recognition model name passed to the service. Defaults to the
            value this notebook has always used.
            NOTE(review): IBM describes narrowband models as intended for
            8 kHz audio; a broadband model may suit audio extracted from
            video better. Confirm before changing the default.
        content_type: MIME type of ``file``.

    Returns:
        The result returned by ``get_result()`` for the recognition request.
    """
    # The file stays open for the whole request because the client reads from
    # the handle while sending.
    with open(file, 'rb') as audio_file:
        response = sttmodel.recognize(
            audio=audio_file,
            content_type=content_type,
            model=model,
            timestamps=True
        )
        result = response.get_result()
    return result

In [45]:
def generate_watson_caption(yt):
    """
    Produce a speech recognition result for a pytube YouTube object.

    Sets up the Watson client, downloads the video's audio, then returns
    whatever ``stt`` returns for that audio file.
    """
    watson_client = setup_watson()
    mp3_path = download_audio(yt)
    return stt(mp3_path, watson_client)

In [46]:
# generate_watson_caption(yt)
# Displays `speech_recognition_result`. Nothing above this cell assigns that
# name (the call above is commented out), hence the NameError recorded below.
speech_recognition_result

NameError: name 'speech_recognition_result' is not defined

In [65]:
# Transcription pipeline
# Create Youtube object with a link

url = "https://www.youtube.com/watch?v=DjdECYIfGgY"
yt = YouTube(url)

# Reset first so a failed run cannot leave a stale `caption` from an earlier
# execution of this cell in the kernel.
caption = None

try:
    # Prefer the English caption track published with the video.
    c = find_en_caption(yt)
    if c:
        caption = YoutubeCaption(c.generate_srt_captions())
        print("YoutubeCaption created")
    else:
        # No English track: transcribe the audio with the Watson API instead.
        speech_recognition_result = generate_watson_caption(yt)

        caption = WatsonCaption(speech_recognition_result)
        print("WatsonCaption created")
except Exception as error:
    # Keep the notebook running, but say what went wrong. Catching Exception
    # (not a bare except) lets KeyboardInterrupt stop a long download.
    print(f"no caption created: {type(error).__name__}: {error}")

chunk:   2%|▏         | 173/9072 [00:00<00:05, 1729.81it/s, now=None]

MoviePy - Writing audio in download/DjdECYIfGgY/DjdECYIfGgY.mp3


                                                                      

MoviePy - Done.
WatsonCaption created


In [48]:
# Build both caption types side by side for comparison.
# `c` is None when find_en_caption() found no English track, which makes the
# first line raise the AttributeError recorded below.
# NOTE(review): `speech_recognition_results_US` is not assigned anywhere in the
# visible cells; it presumably came from an earlier kernel session. Confirm.
youtube = YoutubeCaption(c.generate_srt_captions())
watson = WatsonCaption(speech_recognition_results_US)

AttributeError: 'NoneType' object has no attribute 'generate_srt_captions'

In [66]:
# Only the last expression of a cell is displayed, so the full_text() result
# is computed and discarded; the output below is `caption.sectionlist`.
caption.full_text()
caption.sectionlist

[{'section': 1,
  'time': '0.0:120.0',
 {'section': 2,
  'time': '120.0:239.71',
  'subtitle': "live in U. K. and you have an account with by now the only two methods available for withdrawing pound starting back to your current account for whatever else %HESITATION both unavailable they have both mysteriously disappeared and being placed on the maintenance now over the weekend by has been lined the way any issues at all they said the best movie just in no way affect anything on the ramp or a minor dot com on Twitter by police to the chain that said we are aware of recent reports about next year you could notice in relation to finance markets limited B. M. L. B. a male is a separate legal entity and does not offer any products or services by %HESITATION by Nance dot the by nine dot com website the Bynum group acquired B. M. L. may twenty twenty and has not get lost if you can %HESITATION use this S. yeah regulation commission for questions please be email please contact compliance a bi

In [None]:
# Testing code

# for c in yt.caption_tracks:
#     print(c)

# Display the caption text with every apostrophe removed; the result is not
# stored, so `caption.text` itself is unchanged.
caption.text.replace("\'","")

In [70]:
# import json 

# with open('example_watsoncaption.txt', 'w') as f:
#     f.writelines(json.dumps(caption.sectionlist))

# example output Youtube SRT caption.generate_srt_captions()
"""'1\n00:00:00,000 --> 00:00:07,000\nTranscriber: Victor Borges Reviewer: David DeRuwe\n\n2\n00:00:11,791 --> 00:00:12,799\nWow,\n\n3\n00:00:13,030 --> 00:00:14,220\nwhat an audience.\n\n4\n00:00:14,515 --> 00:00:17,531\nBut if I\'m being honest, I don\'t care what you think of my talk.\n\n5\n00:00:18,097 --> 00:00:19,101\nI don\'t.\n\n6\n00:00:19,101 --> 00:00:21,100\nI care what the internet thinks of my talk.\n\n7\n00:00:21,100 --> 00:00:22,101\n(Laughter)\n\n8\n00:00:22,101 --> 00:00:24,579\nBecause they are the ones who get it seen and shared.\n\n9\n00:00:24,579 --> 00:00:26,907\nAnd I think that\'s where most people get it wrong.\n\n10\n00:00:26,907 --> 00:00:28,574\nThey\'re talking to you, here,\n\n11\n00:00:28,660 --> 00:00:33,290\ninstead of talking to you, random person scrolling Facebook.\n\n12\n00:00:34,323 --> 00:00:35,503\nThanks for the click.\n\n13\n00:00:36,141 --> 00:00:37,825\nYou see, back in 2009,\n\n14\n00:00:37,855 --> 00:00:41,400\nwe all had these weird little things called attention spans.\n\n15\n00:00:41,400 --> 00:00:42,400\n(Laughter)\n\n16\n00:00:42,400 --> 00:00:45,596\nYeah, they\'re gone. They\'re gone. We killed them. They\'re dead.\n\n17\n00:00:46,287 --> 00:00:50,060\nI\'m trying to think of the last time I watched an 18-minute TED talk.\n\n18\n00:00:50,140 --> 00:00:52,640\nIt\'s been years, literally years.\n\n19\n00:00:52,865 --> 00:00:55,560\nSo if you\'re giving a TED talk, keep it quick.\n\n20\n00:00:55,700 --> 00:00:57,500\nI\'m doing mine in under a minute.\n\n21\n00:00:57,560 --> 00:00:59,155\nI\'m at 44 seconds right now;\n\n22\n00:00:59,155 --> 00:01:01,283\nthat means we\'ve got time for one final joke.\n\n23\n00:01:01,450 --> 00:01:04,500\nWhy are balloons so expensive?\n\n24\n00:01:04,580 --> 00:01:05,705\n(Audience) "Why?"\n\n25\n00:01:05,705 --> 00:01:06,941\nWoody Roseland: Inflation.\n\n26\n00:01:06,941 --> 00:01:08,000\n(Laughter)\n\n27\n00:01:08,000 --> 00:01:10,666\n(Applause)'"""

# example output: speech_recognition_results_US
"""
{'result_index': 0, 'results': [{'final': True, 'alternatives': [{'transcript': '%HESITATION ', 'confidence': 0.79, 'timestamps': [['%HESITATION', 0.0, 0.64]]}]}, {'final': True, 'alternatives': [{'transcript': 'wow ', 'confidence': 0.96, 'timestamps': [['wow', 11.79, 12.17]]}]}, {'final': True, 'alternatives': [{'transcript': "what an audience but if I'm being honest I don't care what you think of my talk ", 'confidence': 0.84, 'timestamps': [['what', 13.04, 13.25], ['an', 13.25, 13.34], ['audience', 13.34, 13.9], ['but', 14.5, 14.64], ['if', 14.64, 14.71], ["I'm", 14.71, 14.79], ['being', 14.79, 15.0], ['honest', 15.0, 15.47], ['I', 15.81, 15.9], ["don't", 15.9, 16.07], ['care', 16.07, 16.23], ['what', 16.23, 16.32], ['you', 16.32, 16.41], ['think', 16.41, 16.57], ['of', 16.57, 16.63], ['my', 16.63, 16.72], ['talk', 16.72, 17.13]]}]}, {'final': True, 'alternatives': [{'transcript': "I don't like you're the internet thanks my talk ", 'confidence': 0.77, 'timestamps': [['I', 18.15, 18.26], ["don't", 18.26, 18.52], ['like', 19.02, 19.16], ["you're", 19.16, 19.3], ['the', 19.3, 19.45], ['internet', 19.45, 19.77], ['thanks', 19.77, 19.98], ['my', 19.98, 20.16], ['talk', 20.16, 20.54]]}]}, {'final': True, 'alternatives': [{'transcript': "because they're the ones who get it seen and get a Cher and I think that's where most people get it wrong you're talking to you here sort of talking to you random person scrolling Facebook ", 'confidence': 0.83, 'timestamps': [['because', 21.84, 21.95], ["they're", 21.95, 22.11], ['the', 22.11, 22.21], ['ones', 22.21, 22.47], ['who', 22.47, 22.56], ['get', 22.56, 22.68], ['it', 22.68, 22.79], ['seen', 22.79, 23.25], ['and', 23.29, 23.45], ['get', 23.45, 23.56], ['a', 23.56, 23.61], ['Cher', 23.61, 24.02], ['and', 24.44, 24.58], ['I', 24.58, 24.63], ['think', 24.63, 24.84], ["that's", 24.84, 24.99], ['where', 24.99, 25.11], ['most', 25.11, 25.38], ['people', 25.38, 25.59], ['get', 25.59, 25.72], ['it', 25.72, 25.8], ['wrong', 25.8, 
26.2], ["you're", 26.37, 26.5], ['talking', 26.5, 26.78], ['to', 26.78, 26.86], ['you', 26.86, 27.28], ['here', 27.59, 28.22], ['sort', 28.68, 28.93], ['of', 28.93, 29.0], ['talking', 29.0, 29.28], ['to', 29.28, 29.35], ['you', 29.35, 29.77], ['random', 30.08, 30.71], ['person', 30.75, 31.41], ['scrolling', 31.58, 32.36], ['Facebook', 32.43, 33.22]]}]}, {'final': True, 'alternatives': [{'transcript': 'thanks for the quick ', 'confidence': 0.71, 'timestamps': [['thanks', 34.36, 34.61], ['for', 34.61, 34.68], ['the', 34.68, 34.77], ['quick', 34.77, 35.17]]}]}, {'final': True, 'alternatives': [{'transcript': 'you see back in two thousand nine we all had these weird little things called attention spans ', 'confidence': 0.93, 'timestamps': [['you', 36.17, 36.33], ['see', 36.33, 36.54], ['back', 36.54, 36.85], ['in', 36.85, 36.96], ['two', 36.96, 37.15], ['thousand', 37.15, 37.58], ['nine', 37.58, 37.87], ['we', 37.87, 38.0], ['all', 38.0, 38.17], ['had', 38.17, 38.39], ['these', 38.39, 38.61], ['weird', 38.61, 38.98], ['little', 38.98, 39.18], ['things', 39.18, 39.57], ['called', 39.57, 40.09], ['attention', 40.35, 40.83], ['spans', 40.83, 41.58]]}]}, {'final': True, 'alternatives': [{'transcript': "yeah they're gone they're gone we killed in the day ", 'confidence': 0.61, 'timestamps': [['yeah', 42.39, 42.64], ["they're", 42.76, 42.92], ['gone', 42.92, 43.21], ["they're", 43.27, 43.4], ['gone', 43.4, 43.61], ['we', 43.61, 43.72], ['killed', 43.72, 43.98], ['in', 43.98, 44.14], ['the', 44.8, 44.94], ['day', 44.94, 45.26]]}]}, {'final': True, 'alternatives': [{'transcript': "I'm trying to think of the last time I watched and eighteen minutes had talked it's been years literally years so did you have any pets talk keep it quick I'm doing mine in under a minute ", 'confidence': 0.89, 'timestamps': [["I'm", 46.28, 46.45], ['trying', 46.45, 46.65], ['to', 46.65, 46.72], ['think', 46.72, 46.91], ['of', 46.91, 46.99], ['the', 46.99, 47.08], ['last', 47.08, 47.4], ['time', 
47.4, 47.66], ['I', 47.66, 47.88], ['watched', 47.88, 48.3], ['and', 48.3, 48.45], ['eighteen', 48.48, 48.93], ['minutes', 48.93, 49.24], ['had', 49.24, 49.44], ['talked', 49.44, 49.82], ["it's", 50.16, 50.25], ['been', 50.25, 50.44], ['years', 50.44, 50.81], ['literally', 51.1, 51.7], ['years', 51.76, 52.21], ['so', 52.86, 53.03], ['did', 53.03, 53.21], ['you', 53.21, 53.27], ['have', 53.27, 53.37], ['any', 53.37, 53.52], ['pets', 53.52, 53.82], ['talk', 53.82, 54.17], ['keep', 54.6, 54.83], ['it', 54.83, 54.92], ['quick', 54.92, 55.45], ["I'm", 55.71, 55.88], ['doing', 55.88, 56.1], ['mine', 56.1, 56.34], ['in', 56.34, 56.43], ['under', 56.43, 56.64], ['a', 56.64, 56.71], ['minute', 56.71, 57.04]]}]}, {'final': True, 'alternatives': [{'transcript': 'forty four seconds right now that means you got time for one final joke why are balloons so expensive ', 'confidence': 0.95, 'timestamps': [['forty', 57.93, 58.11], ['four', 58.11, 58.33], ['seconds', 58.33, 58.71], ['right', 58.71, 58.93], ['now', 58.93, 59.1], ['that', 59.1, 59.26], ['means', 59.26, 59.42], ['you', 59.42, 59.5], ['got', 59.5, 59.68], ['time', 59.68, 59.92], ['for', 59.92, 60.04], ['one', 60.04, 60.26], ['final', 60.26, 60.58], ['joke', 60.58, 61.04], ['why', 61.43, 62.01], ['are', 62.12, 62.3], ['balloons', 62.3, 63.04], ['so', 63.09, 63.34], ['expensive', 63.34, 64.2]]}]}, {'final': True, 'alternatives': [{'transcript': 'inflation ', 'confidence': 0.88, 'timestamps': [['inflation', 65.76, 66.45]]}]}]}
"""
