In [1]:
import datetime

import pandas as pd

## Grab our topics and transcripts

In [2]:
# Load both worksheets in one call; read_excel returns a dict keyed by sheet name.
SHEET_NAMES = ["lesson_topics", "lesson_transcripts"]
sheets_d = pd.read_excel("../../data/raw/fsdl_2022_project_transcripts.xlsx", sheet_name=SHEET_NAMES)

# Bind each frame by its sheet name rather than by position in the dict.
topics_df = sheets_d["lesson_topics"]
transcripts_df = sheets_d["lesson_transcripts"]

for frame in (topics_df, transcripts_df):
    # The video URL is not used below; timestamps are cast to str so they can be
    # split on ":" when converted to seconds.
    frame.drop(columns="video_url", inplace=True)
    frame["timestamp"] = frame["timestamp"].astype(str)

In [3]:
# Row count first, then a preview of the topics sheet.
print(len(topics_df))

topics_df.head()

600


Unnamed: 0,course_title,lesson_num,timestamp,topic
0,fast.ai 2022 - Part 1,2,00:00:00,Introduction
1,fast.ai 2022 - Part 1,2,00:00:55,Reminder to use the fastai book as a companion...
2,fast.ai 2022 - Part 1,2,00:02:06,aiquizzes.com for quizzes on the book
3,fast.ai 2022 - Part 1,2,00:02:36,"Reminder to use fastai forums for links, noteb..."
4,fast.ai 2022 - Part 1,2,00:03:42,How to efficiently read the forum with summari...


In [4]:
# Row count first, then a preview of the transcripts sheet.
print(len(transcripts_df))

transcripts_df.head()

25283


Unnamed: 0,course_title,lesson_num,timestamp,transcript
0,fast.ai 2022 - Part 1,2,00:00:00,Hi everybody. Welcome to lesson two. Thanks fo...
1,fast.ai 2022 - Part 1,2,00:00:08,we had a bit of an “administrative issue” at o...
2,fast.ai 2022 - Part 1,2,00:00:14,doing this from the study at home. so sorry ab...
3,fast.ai 2022 - Part 1,2,00:00:25,I'm actually really really pumped about this l...
4,fast.ai 2022 - Part 1,2,00:00:32,"were like in the very early days, because we'r..."


## Define a utility function for converting durations to total_seconds

In [5]:
def convert_duration_to_seconds(v):
    """Convert a clock-style duration string into a whole number of seconds.

    Accepts ``"HH:MM:SS"`` as well as the shorter ``"MM:SS"`` and ``"SS"`` forms.
    A fractional part on the seconds field (e.g. ``"00:00:55.500000"``) is
    truncated rather than rejected.

    Args:
        v: Duration string made of one to three ``:``-separated numeric fields.

    Returns:
        int: Total number of seconds represented by ``v``.

    Raises:
        ValueError: If ``v`` has more than three fields or a field is not numeric.
    """
    fields = v.strip().split(":")
    if len(fields) > 3:
        raise ValueError(f"expected at most 3 ':'-separated fields, got {v!r}")

    *leading, secs = fields

    # Fold hours/minutes left to right: each step promotes the running total to
    # the next smaller unit, so "H:M:S" becomes (H * 60 + M) * 60 + S.
    total = 0
    for field in leading:
        total = total * 60 + int(field)

    # Drop any fractional seconds before converting.
    whole_secs, _, _ = secs.partition(".")
    return total * 60 + int(whole_secs)

## Define the start/end boundaries (in seconds) for each topic in each lesson

In [6]:
# Each topic starts at its own timestamp ...
topics_df["start_seconds"] = topics_df["timestamp"].apply(convert_duration_to_seconds)

# ... and ends where the next row of the same lesson starts. The last row of a
# lesson has no successor, so it receives the fill value 100000 as an open-ended
# upper bound.
# NOTE(review): this assumes rows are in chronological order within each
# (course_title, lesson_num) group -- confirm against the sheet.
lesson_starts = topics_df.groupby(by=["course_title", "lesson_num"])["start_seconds"]
topics_df["end_seconds"] = lesson_starts.shift(-1, fill_value=100000)

In [7]:
# Inspect the final rows: the last row should show the fill value as its end_seconds.
topics_df.tail()

Unnamed: 0,course_title,lesson_num,timestamp,topic,start_seconds,end_seconds
595,parker - learn photography,1,03:10:17,Four Characteristics of Light,11417,12021
596,parker - learn photography,1,03:20:21,What Is Composition,12021,12245
597,parker - learn photography,1,03:24:05,Composition Techniques,12245,12496
598,parker - learn photography,1,03:28:16,50+ Composition Examples,12496,14131
599,parker - learn photography,1,03:55:31,Editing Tips + Software Choices,14131,100000


## Define the total number of elapsed seconds at each timestamp in the transcripts dataset

In [8]:
# Same conversion as for the topics, so transcript lines and topic boundaries share a unit (seconds).
transcripts_df["elapsed_seconds"] = transcripts_df["timestamp"].apply(convert_duration_to_seconds)

In [9]:
# Confirm the new elapsed_seconds column against the raw timestamps.
transcripts_df.head()

Unnamed: 0,course_title,lesson_num,timestamp,transcript,elapsed_seconds
0,fast.ai 2022 - Part 1,2,00:00:00,Hi everybody. Welcome to lesson two. Thanks fo...,0
1,fast.ai 2022 - Part 1,2,00:00:08,we had a bit of an “administrative issue” at o...,8
2,fast.ai 2022 - Part 1,2,00:00:14,doing this from the study at home. so sorry ab...,14
3,fast.ai 2022 - Part 1,2,00:00:25,I'm actually really really pumped about this l...,25
4,fast.ai 2022 - Part 1,2,00:00:32,"were like in the very early days, because we'r...",32


## Build our training data.  

This should be usable for both segmentation and summarization tasks

In [10]:
# Pair every topic window with every transcript line of the same lesson. This is
# a many-to-many join, so the result is much larger than either input; it is cut
# down to the matching pairs in the filtering step that follows.
topic_windows = topics_df[["course_title", "lesson_num", "topic", "start_seconds", "end_seconds"]]
merged_df = pd.merge(topic_windows, transcripts_df, on=["course_title", "lesson_num"])
len(merged_df)

467129

Keep only the merged records where the transcript line falls within the topic's window (start inclusive, end exclusive)

In [11]:
# A transcript line belongs to a topic when start_seconds <= elapsed_seconds < end_seconds.
# NOTE(review): the segmentation set built from these rows reports 25383 examples
# while the transcripts sheet has 25283 rows, so some lines appear to match more
# than one topic window -- check for overlapping or duplicated topic rows.
at_or_after_start = merged_df["elapsed_seconds"] >= merged_df["start_seconds"]
before_end = merged_df["elapsed_seconds"] < merged_df["end_seconds"]
merged_df = merged_df.loc[at_or_after_start & before_end]

In [12]:
# Each remaining row is one transcript line tagged with the topic whose window contains it.
merged_df.head()

Unnamed: 0,course_title,lesson_num,topic,start_seconds,end_seconds,timestamp,transcript,elapsed_seconds
0,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:00,Hi everybody. Welcome to lesson two. Thanks fo...,0
1,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:08,we had a bit of an “administrative issue” at o...,8
2,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:14,doing this from the study at home. so sorry ab...,14
3,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:25,I'm actually really really pumped about this l...,25
4,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:32,"were like in the very early days, because we'r...",32


For both segmentation and summarization tasks, we'll need to group the transcripts by course + lesson + topic

In [13]:
# One row per (course, lesson, topic start, topic); the transcript lines that fall
# in that topic are collected into a list, in the order they appear in merged_df.
topic_keys = ["course_title", "lesson_num", "start_seconds", "topic"]

transcripts_by_topic = merged_df[topic_keys + ["transcript"]].groupby(by=topic_keys)
train_df = transcripts_by_topic.agg(list).reset_index()

# Keep topics in playback order within each lesson.
train_df = train_df.sort_values(by=["course_title", "lesson_num", "start_seconds"])

In [14]:
# The transcript column now holds a list of lines per topic.
train_df.head()

Unnamed: 0,course_title,lesson_num,start_seconds,topic,transcript
0,C-Squared Podcast,1,0,Intro,[[Music] welcome everybody to episode one of a...
1,C-Squared Podcast,1,137,Candidates 2018,[camps look like in general yeah well you ment...
2,C-Squared Podcast,1,464,Candidates training,[going in the candidates like how was the expe...
3,C-Squared Podcast,1,610,Playing for 2nd place,[were you just like focused on grabbing first ...
4,C-Squared Podcast,1,916,Magnus' WC decision,[know you can't uh you can't tell him you have...


QA the training set

In [15]:
# Spot-check: transcript lines grouped under the first topic row of one course.
train_df[train_df["course_title"] == "fast.ai 2022 - Part 1"].iloc[0].transcript

["Welcome to Practical Deep Learning for coders, lesson one. This is version five of this course, and it's the first new one we've done in two years.",
 "So, we've got a lot of cool things to cover! It's amazing how much has changed.",
 'Here is an xkcd from the end of 2015.']

## Build segmentation training set

In [16]:
# Work on a copy so train_df stays available for the summarization set built later.
seg_train_df = train_df.copy()

In [17]:
# Build one segmentation example per transcript line: the line itself, the line
# that follows it within the same topic (None for the topic's last line), and
# every remaining line of that topic.
seg_examples = []

# Iterate over `train_df` rather than `seg_train_df`: the latter name is rebound
# further down to the exploded examples frame, which has no "transcript" column,
# so reading from it would make this cell fail with a KeyError when re-run.
# `seg_train_df` is an unmodified copy of `train_df` at this point, so the
# examples produced are the same.
for _, example in train_df.iterrows():
    seqs = example["transcript"]
    last_idx = len(seqs) - 1
    for seq_idx, seq in enumerate(seqs):
        is_last_seq = seq_idx == last_idx
        seg_examples.append(
            {
                "course_title": example["course_title"],
                "lesson_num": example["lesson_num"],
                "topic": example["topic"],
                "seq": str(seq),
                "next_seq": None if is_last_seq else str(seqs[seq_idx + 1]),
                # Everything in the topic except the current line and its successor.
                "other_topic_seqs": [str(txt) for i, txt in enumerate(seqs) if i not in (seq_idx, seq_idx + 1)],
            }
        )

In [18]:
# Number of examples built, then the first one in full.
print(len(seg_examples))
print(seg_examples[0])

25383
{'course_title': 'C-Squared Podcast', 'lesson_num': 1, 'topic': 'Intro', 'seq': '[Music] welcome everybody to episode one of a', 'next_seq': "chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up", 'other_topic_seqs': ["christian well not so much fabi uh it's first of all great um to finally start a", "podcast the chess podcast i know that um there's a lot of podcasts out there but", "i wanted to bring our own tune to the mix and i think uh yeah i'm", "excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's", "been a while at your home it's good to be here it's my first time in uh visiting here and uh", "yeah it's been an interesting few months played a lot of chess which is pretty cool but", 'also a bit difficult at times my home uh here we are not going to mention the location because', "those uh crazy fans who knows maybe they will uh track me down so you're back in the states you've

In [19]:
# Rebinds seg_train_df: from here on it holds one row per example, not one row per topic.
seg_train_df = pd.DataFrame(seg_examples)

In [20]:
# The frame should have exactly one row per entry in seg_examples.
print(len(seg_train_df))
seg_train_df.head(30)

25383


Unnamed: 0,course_title,lesson_num,topic,seq,next_seq,other_topic_seqs
0,C-Squared Podcast,1,Intro,[Music] welcome everybody to episode one of a,chess themed podcast with myself christian kir...,[christian well not so much fabi uh it's first...
1,C-Squared Podcast,1,Intro,chess themed podcast with myself christian kir...,christian well not so much fabi uh it's first ...,[[Music] welcome everybody to episode one of a...
2,C-Squared Podcast,1,Intro,christian well not so much fabi uh it's first ...,podcast the chess podcast i know that um there...,[[Music] welcome everybody to episode one of a...
3,C-Squared Podcast,1,Intro,podcast the chess podcast i know that um there...,i wanted to bring our own tune to the mix and ...,[[Music] welcome everybody to episode one of a...
4,C-Squared Podcast,1,Intro,i wanted to bring our own tune to the mix and ...,excited about that so that's uh the first thin...,[[Music] welcome everybody to episode one of a...
5,C-Squared Podcast,1,Intro,excited about that so that's uh the first thin...,been a while at your home it's good to be here...,[[Music] welcome everybody to episode one of a...
6,C-Squared Podcast,1,Intro,been a while at your home it's good to be here...,yeah it's been an interesting few months playe...,[[Music] welcome everybody to episode one of a...
7,C-Squared Podcast,1,Intro,yeah it's been an interesting few months playe...,also a bit difficult at times my home uh here ...,[[Music] welcome everybody to episode one of a...
8,C-Squared Podcast,1,Intro,also a bit difficult at times my home uh here ...,those uh crazy fans who knows maybe they will ...,[[Music] welcome everybody to episode one of a...
9,C-Squared Podcast,1,Intro,those uh crazy fans who knows maybe they will ...,a few months actually you were in europe for a...,[[Music] welcome everybody to episode one of a...


## Build summarization training set

In [21]:
# Start from a fresh copy of the per-topic frame so train_df keeps its list-valued transcripts.
summarization_train_df = train_df.copy()

In [22]:
# Collapse each topic's list of transcript lines into one space-separated string.
# The source is `train_df["transcript"]` (same index, still list-valued) rather
# than the column being overwritten, so re-running this cell cannot re-join an
# already-joined string character by character.
summarization_train_df["transcript"] = train_df["transcript"].apply(
    lambda seqs: " ".join(str(seq) for seq in seqs)
)

In [23]:
# The transcript column should now be a single string per topic.
summarization_train_df.head()

Unnamed: 0,course_title,lesson_num,start_seconds,topic,transcript
0,C-Squared Podcast,1,0,Intro,[Music] welcome everybody to episode one of a ...
1,C-Squared Podcast,1,137,Candidates 2018,camps look like in general yeah well you menti...
2,C-Squared Podcast,1,464,Candidates training,going in the candidates like how was the exper...
3,C-Squared Podcast,1,610,Playing for 2nd place,were you just like focused on grabbing first w...
4,C-Squared Podcast,1,916,Magnus' WC decision,know you can't uh you can't tell him you have ...


In [24]:
# Spot-check the joined text of the first topic.
summarization_train_df.iloc[0].transcript

"[Music] welcome everybody to episode one of a chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up christian well not so much fabi uh it's first of all great um to finally start a podcast the chess podcast i know that um there's a lot of podcasts out there but i wanted to bring our own tune to the mix and i think uh yeah i'm excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's been a while at your home it's good to be here it's my first time in uh visiting here and uh yeah it's been an interesting few months played a lot of chess which is pretty cool but also a bit difficult at times my home uh here we are not going to mention the location because those uh crazy fans who knows maybe they will uh track me down so you're back in the states you've been away for uh quite a few months actually you were in europe for a very long time when's the last time you were in the states well i played th

In [25]:
# Same row before joining, to compare line order against the joined string.
train_df.iloc[0].transcript

['[Music] welcome everybody to episode one of a',
 "chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up",
 "christian well not so much fabi uh it's first of all great um to finally start a",
 "podcast the chess podcast i know that um there's a lot of podcasts out there but",
 "i wanted to bring our own tune to the mix and i think uh yeah i'm",
 "excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's",
 "been a while at your home it's good to be here it's my first time in uh visiting here and uh",
 "yeah it's been an interesting few months played a lot of chess which is pretty cool but",
 'also a bit difficult at times my home uh here we are not going to mention the location because',
 "those uh crazy fans who knows maybe they will uh track me down so you're back in the states you've been away for uh quite",
 "a few months actually you were in europe for a very long time when's the last tim