In [1]:
import datetime

import pandas as pd

In [2]:
# Read both worksheets in a single pass. Passing a list to `sheet_name` makes
# `read_excel` return a dict keyed by sheet name.
sheets_d = pd.read_excel(
    "../../data/raw/fsdl_2022_project_transcripts.xlsx",
    sheet_name=["lesson_topics", "lesson_transcripts"],
)

# Look each sheet up by name instead of relying on dict iteration order, and
# build new frames (no `inplace`) so the raw sheets in `sheets_d` stay untouched
# and this cell can be re-run safely.
# `video_url` is dropped; `timestamp` is cast to str so it can be split on ":" later.
topics_df = (
    sheets_d["lesson_topics"]
    .drop(columns="video_url")
    .astype({"timestamp": str})
)
transcripts_df = (
    sheets_d["lesson_transcripts"]
    .drop(columns="video_url")
    .astype({"timestamp": str})
)

In [3]:
# Row count first, then a preview of the leading topic rows.
print(topics_df.shape[0])

topics_df.head()

397


Unnamed: 0,course_title,lesson_num,timestamp,topic
0,fast.ai 2022 - Part 1,2,00:00:00,Introduction
1,fast.ai 2022 - Part 1,2,00:00:55,Reminder to use the fastai book as a companion...
2,fast.ai 2022 - Part 1,2,00:02:06,aiquizzes.com for quizzes on the book
3,fast.ai 2022 - Part 1,2,00:02:36,"Reminder to use fastai forums for links, noteb..."
4,fast.ai 2022 - Part 1,2,00:03:42,How to efficiently read the forum with summari...


In [4]:
# Row count first, then a preview of the leading transcript rows.
print(transcripts_df.shape[0])

transcripts_df.head()

14791


Unnamed: 0,course_title,lesson_num,timestamp,transcript
0,fast.ai 2022 - Part 1,2,00:00:00,Hi everybody. Welcome to lesson two. Thanks fo...
1,fast.ai 2022 - Part 1,2,00:00:08,we had a bit of an “administrative issue” at o...
2,fast.ai 2022 - Part 1,2,00:00:14,doing this from the study at home. so sorry ab...
3,fast.ai 2022 - Part 1,2,00:00:25,I'm actually really really pumped about this l...
4,fast.ai 2022 - Part 1,2,00:00:32,"were like in the very early days, because we'r..."


Define a utility function for converting `HH:MM:SS` timestamp strings to a total number of seconds

In [5]:
def convert_duration_to_seconds(v):
    """Convert a colon-separated duration into a total number of seconds.

    Accepts "HH:MM:SS", and also the shorter "MM:SS" and "SS" forms (missing
    leading fields are treated as zero). The value is passed through ``str()``
    first, so any object whose string form is such a duration also works.

    Args:
        v: Duration value, e.g. ``"00:41:17"`` or ``"02:06"``.

    Returns:
        int: Total seconds, e.g. ``2477`` for ``"00:41:17"``.

    Raises:
        ValueError: If there are more than three fields or a field is not an
            integer (this includes empty strings and ``"nan"``).
    """
    parts = str(v).strip().split(":")
    if len(parts) > 3:
        raise ValueError(f"Expected a duration like 'HH:MM:SS', got {v!r}")

    # Left-pad with zeros so "MM:SS" and "SS" line up with (hrs, mins, secs).
    hrs, mins, secs = [0] * (3 - len(parts)) + [int(p) for p in parts]
    return (60 * 60 * hrs) + (60 * mins) + secs

Define the start/end boundaries (in seconds) for each topic in each lesson

In [6]:
# Sentinel end boundary for the last topic of each lesson: it has no following
# topic, so it stays "open" up to this many seconds (~27.8 hours).
LAST_TOPIC_END_SECONDS = 100_000

topics_df["start_seconds"] = topics_df["timestamp"].apply(convert_duration_to_seconds)

# A topic ends where the next topic in the same lesson starts. Sort by start
# time before shifting so the result does not depend on the sheet's row order;
# the assignment aligns on the index, so topics_df keeps its original order.
topics_df["end_seconds"] = (
    topics_df.sort_values(by=["course_title", "lesson_num", "start_seconds"])
    .groupby(by=["course_title", "lesson_num"])["start_seconds"]
    .shift(-1, fill_value=LAST_TOPIC_END_SECONDS)
)

In [7]:
# Inspect the last rows of topics_df, including the start_seconds/end_seconds columns.
topics_df.tail()

Unnamed: 0,course_title,lesson_num,timestamp,topic,start_seconds,end_seconds
392,Full Stack Deep Learning - Spring 2021,13,00:02:44,ML Roles,164,675
393,Full Stack Deep Learning - Spring 2021,13,00:11:15,ML Organizations,675,1730
394,Full Stack Deep Learning - Spring 2021,13,00:28:50,Managing ML Teams,1730,2477
395,Full Stack Deep Learning - Spring 2021,13,00:41:17,Hiring ML Engineers (Or Getting Hired),2477,3414
396,Full Stack Deep Learning - Spring 2021,13,00:56:54,Conclusion,3414,100000


Define the total number of elapsed seconds at each timestamp in the transcripts dataset

In [8]:
# Convert every transcript timestamp to seconds from the start of its lesson.
transcript_timestamps = transcripts_df["timestamp"]
transcripts_df["elapsed_seconds"] = transcript_timestamps.map(convert_duration_to_seconds)

In [9]:
# Preview transcripts_df together with the derived elapsed_seconds column.
transcripts_df.head()

Unnamed: 0,course_title,lesson_num,timestamp,transcript,elapsed_seconds
0,fast.ai 2022 - Part 1,2,00:00:00,Hi everybody. Welcome to lesson two. Thanks fo...,0
1,fast.ai 2022 - Part 1,2,00:00:08,we had a bit of an “administrative issue” at o...,8
2,fast.ai 2022 - Part 1,2,00:00:14,doing this from the study at home. so sorry ab...,14
3,fast.ai 2022 - Part 1,2,00:00:25,I'm actually really really pumped about this l...,25
4,fast.ai 2022 - Part 1,2,00:00:32,"were like in the very early days, because we'r...",32


Build our training data.  

This should be usable for both segmentation and summarization tasks

In [18]:
# Join on the lesson keys only, which pairs every topic of a lesson with every
# transcript line of that same lesson. The result is therefore much larger than
# transcripts_df until it is filtered by time range.
merged_df = topics_df.loc[
    :, ["course_title", "lesson_num", "topic", "start_seconds", "end_seconds"]
].merge(
    transcripts_df,
    on=["course_title", "lesson_num"],
)
len(merged_df)

264993

Keep only the merged records where the transcript's timestamp falls between the start (inclusive) and end (exclusive) of the topic

In [19]:
# Keep a (topic, transcript line) pair only when the line falls inside the
# topic's half-open window [start_seconds, end_seconds).
merged_df = merged_df.loc[
    lambda df: (df["elapsed_seconds"] >= df["start_seconds"])
    & (df["elapsed_seconds"] < df["end_seconds"])
]

In [20]:
# Spot-check the filtered join: elapsed_seconds should lie within
# [start_seconds, end_seconds) on every remaining row.
merged_df.head()

Unnamed: 0,course_title,lesson_num,topic,start_seconds,end_seconds,timestamp,transcript,elapsed_seconds
0,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:00,Hi everybody. Welcome to lesson two. Thanks fo...,0
1,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:08,we had a bit of an “administrative issue” at o...,8
2,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:14,doing this from the study at home. so sorry ab...,14
3,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:25,I'm actually really really pumped about this l...,25
4,fast.ai 2022 - Part 1,2,Introduction,0,55,00:00:32,"were like in the very early days, because we'r...",32


For both segmentation and summarization tasks, we'll need to group the transcripts by course + lesson + topic

In [24]:
# One row per (course, lesson, topic start, topic). The `transcript` column
# becomes the list of transcript lines that fell inside that topic.
train_df = (
    merged_df[["course_title", "lesson_num", "start_seconds", "topic", "transcript"]]
    .groupby(by=["course_title", "lesson_num", "start_seconds", "topic"])
    .agg(list)
    .reset_index()
    .sort_values(by=["course_title", "lesson_num", "start_seconds"])
)

In [26]:
# Preview the grouped training frame (one row per topic, `transcript` holds a list).
train_df.head()

Unnamed: 0,course_title,lesson_num,start_seconds,topic,transcript
0,Full Stack Deep Learning - Spring 2021,1,0,Intro,"[so josh talked about why we're doing, this co..."
1,Full Stack Deep Learning - Spring 2021,1,85,Neural Networks,"[so let's kick it off with neural, networks, u..."
2,Full Stack Deep Learning - Spring 2021,1,408,Universality,"[is universality which is, you know this neura..."
3,Full Stack Deep Learning - Spring 2021,1,528,Learning Problems,"[networks for, well we do for machine learning..."
4,Full Stack Deep Learning - Spring 2021,1,977,Empirical Risk Minimization / Loss Functions,"[um what's known as risk minimization and, the..."


QA the training set

In [28]:
# Pull the transcript lines of the first fast.ai topic as a quick sanity check.
train_df.loc[train_df["course_title"] == "fast.ai 2022 - Part 1", "transcript"].iloc[0]

["Welcome to Practical Deep Learning for coders, lesson one. This is version five of this course, and it's the first new one we've done in two years.",
 "So, we've got a lot of cool things to cover! It's amazing how much has changed.",
 'Here is an xkcd from the end of 2015.']