### Data Ingestion from video (Youtube)

In [1]:
import pandas as pd
import youtube_transcript_api
from datetime import timedelta
import srt

#https://www.youtube.com/watch?v=Mot91oWrDX0
- one sample video for data ingestion

In [2]:
videos = {
    "id": ["Mot91oWrDX0"],
    "title": ["BUILD23 - Gary Brecka"]
}

In [3]:
# first expectation test
assert len(videos["title"]) == len(videos["id"])

In [18]:
videos_df = pd.DataFrame.from_dict(videos)
videos_df.index = videos_df["id"]
videos_df = videos_df.drop("id", axis="columns")
videos_df

Unnamed: 0_level_0,title
id,Unnamed: 1_level_1
Mot91oWrDX0,BUILD23 - Gary Brecka


We use the youtube_transcript_api package to pull down the transcripts in a single line of Python.

In [19]:
from youtube_transcript_api import YouTubeTranscriptApi


%time transcripts = [YouTubeTranscriptApi.get_transcript(video_id) for video_id in videos_df.index]

CPU times: user 71.8 ms, sys: 11.7 ms, total: 83.5 ms
Wall time: 942 ms


Conveniently enough, every second of a YouTube video is individually linkable and the transcripts come with timestamps.

But a second of speech is not a useful source.

And by default, the subtitles come "chunked" in time at too fine a grain as well: more like five seconds than the thirty to sixty seconds that it takes to make a reasonable point.

So now, we combine the five-second subtitle timestamps into longer chunks based on character count -- 750 seems to generate nicely sized chunks on our corpus.

In [26]:
transcripts[0][:5]

[{'text': 'this speaker', 'start': 0.12, 'duration': 4.32},
 {'text': 'outperformed Tony Robbins last year he',
  'start': 1.7,
  'duration': 4.78},
 {'text': 'was more popular and people love Tony',
  'start': 4.44,
  'duration': 4.44},
 {'text': "Robbins it's no put in not sailing",
  'start': 6.48,
  'duration': 4.26},
 {'text': "against Tony he's amazing I love the guy",
  'start': 8.88,
  'duration': 4.5}]

In [28]:
TRIGGER_LENGTH = 750  # 30-60 seconds

def merge(subtitles, idx):
    new_content = combine_content(subtitles)

    # preserve start as timedelta
    new_start = seconds_float_to_timedelta(subtitles[0]["start"])
    # merge durations as timedelta
    new_duration = seconds_float_to_timedelta(sum(sub["duration"] for sub in subtitles))

    # combine
    new_end = new_start + new_duration

    return srt.Subtitle(index=idx, start=new_start, end=new_end, content=new_content)


def combine_content(subtitles):
    contents = [subtitle["text"].strip() for subtitle in subtitles]
    return " ".join(contents) + "\n\n"


def get_charcount(subtitle):
    return len(subtitle["text"])


def seconds_float_to_timedelta(x_seconds):
    return timedelta(seconds=x_seconds)


def merge_subtitles(subtitles):
    merged_subtitles = []
    current_chunk, current_length, chunk_idx = [], 0, 1

    for subtitle in subtitles:
        current_chunk.append(subtitle)
        added_length = get_charcount(subtitle)
        new_length = current_length + added_length

        if new_length >= TRIGGER_LENGTH:
            merged_subtitle = merge(current_chunk, chunk_idx)
            merged_subtitles.append(merged_subtitle)
            current_chunk, current_length = [], 0
            chunk_idx += 1
        else:
            current_length = new_length

    if current_chunk:
        merged_subtitle = merge(current_chunk, chunk_idx)
        merged_subtitles.append(merged_subtitle)

    return merged_subtitles


%time subtitle_collections = [merge_subtitles(transcript) for transcript in transcripts]

# get strings as well for quick checks (and easier to write to files)
%time subtitle_strings = [srt.compose(merged_subtitles) for merged_subtitles in subtitle_collections]

CPU times: user 2.26 ms, sys: 20 µs, total: 2.28 ms
Wall time: 2.3 ms
CPU times: user 1.48 ms, sys: 87 µs, total: 1.56 ms
Wall time: 1.66 ms


In [33]:
subtitle_collections[0][:2]

[Subtitle(index=1, start=datetime.timedelta(microseconds=120000), end=datetime.timedelta(seconds=102, microseconds=119000), content="this speaker outperformed Tony Robbins last year he was more popular and people love Tony Robbins it's no put in not sailing against Tony he's amazing I love the guy but people said he was better so let me ask you a question imagine you're living in your dream home you got the cabin for the family the beach house your bills are paid you've built something big but you don't have your health I remember Curtis crawling through his house for months literally sliding on a piece of cardboard his wife would drag him through the house because he couldn't walk how important is our health well our final speaker today is about to rock your world about elite health not just yeah I'm healthy I mean Elite levels of health help me welcome the stage one of my favorite well no no no no no no no\n\n", proprietary=''),
 Subtitle(index=2, start=datetime.timedelta(seconds=50,

In [39]:
subtitle_strings[0][:750]

"1\n00:00:00,120 --> 00:01:42,119\nthis speaker outperformed Tony Robbins last year he was more popular and people love Tony Robbins it's no put in not sailing against Tony he's amazing I love the guy but people said he was better so let me ask you a question imagine you're living in your dream home you got the cabin for the family the beach house your bills are paid you've built something big but you don't have your health I remember Curtis crawling through his house for months literally sliding on a piece of cardboard his wife would drag him through the house because he couldn't walk how important is our health well our final speaker today is about to rock your world about elite health not just yeah I'm healthy I mean Elite levels of health "

We then add YouTube URLs for those longer subtitles as sources and combine them into a single DataFrame.

In [None]:
base_url_format = "https://www.youtube.com/watch?v={id}"
query_params_format = "&t={start}s"


def create_split_video_df(subtitles, base_url):
    rows = []
    for subtitle in subtitles:
        raw_text = subtitle.content
        text = raw_text.strip()
        start = timestamp_from_timedelta(subtitle.start)
        url = base_url + query_params_format.format(start=start)

        rows.append({"text": text, "source": url})

    video_df = pd.DataFrame.from_records(rows)
    return video_df


def timestamp_from_timedelta(td):
    return int(td.total_seconds())


split_video_dfs = [
    create_split_video_df(subtitles, base_url_format.format(id=video_id))
    for subtitles, video_id in zip(subtitle_collections, videos_df.index)
]

%time split_video_df = pd.concat(split_video_dfs, ignore_index=True)

In [None]:
%time split_video_df