In [1]:
# (Re)load the IPython autoreload extension and use mode 2, which reloads
# imported modules before each cell executes so edits to library code are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import gc
from pathlib import Path

from fastai.vision.all import *
import torch

from course_copilot import transcription, topic_segmentation, summarization

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [3]:
# Video to process (YouTube id).
youtube_id = "Jsz4E2iNXUA"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Locations handed to the transcription helpers below.
audio_files_fpath = Path("../../transcription/audio_files")
transcription_fpath = Path("../../transcription/transcriptions")
whisper_models_fpath = Path("../../transcription/models")

# Whisper checkpoint name and the exported learners used by each stage.
whisper_model = "base"
topic_segmentation_learner_fpath = "../../models/topic_segmentation_deberta_v3_small.pkl"
headline_summarization_learner_fpath = "../../models/headline_summarization.pkl"
content_summarization_learner_fpath = "../../models/content_summarization.pkl"

# Smaller values give finer-grained segmentation (more topics predicted);
# larger values give coarser segmentation (fewer topics predicted).
topic_segmentation_get_preds_kwargs = {"threshold_std_coeff": 1.9}

## Transcription

In [4]:
# Fetch the audio for the configured video.
audio_fpath = transcription.fetch_youtube_audio(youtube_id, audio_files_fpath=audio_files_fpath)

# Store the result under its own name. Rebinding `transcription_fpath` (the configured
# transcriptions location) would make this cell feed its previous result back in as
# that location when it is re-run.
# NOTE(review): the keyword is spelled `model_checkptoint`; it is left as-is because it must
# match the signature in `course_copilot.transcription` — confirm before renaming.
transcript_file_fpath = transcription.fetch_transcription(
    audio_fpath=audio_fpath,
    transcription_fpath=transcription_fpath,
    model_fpath=whisper_models_fpath,
    model_checkptoint=whisper_model,
    device=device,
)

# Load the transcription into a DataFrame and preview the first rows.
transcription_df = transcription.transcription_to_df(transcript_file_fpath)
transcription_df.head()

Unnamed: 0,elapsed_seconds,timestamp,transcript
0,0.0,00:00:00.000,Let me make sure everything is as it should be.
1,3.56,00:00:03.560,I'm always fascinated by the fact that people are waiting.
2,6.72,00:00:06.720,It's like it's it's so surprising that people are like on here
3,12.7,00:00:12.700,sometimes early for you.
4,14.12,00:00:14.120,It makes sense for my session.


In [5]:
# Size of the transcript frame as (rows, columns).
transcription_df.shape

(633, 3)

## Topic Segmentation

In [6]:
topic_segmentation_trainer = topic_segmentation.TopicSegmentationModelTrainer(
    experiment_name="deberta_v3_small", train_config=topic_segmentation.TopicSegmentationConfig
)
# No training happens in this notebook: the learner is loaded from
# `topic_segmentation_learner_fpath` and used for prediction only.
inf_learn = topic_segmentation_trainer.load_learner_or_model(
    model_learner_fpath=topic_segmentation_learner_fpath, device=device
)

# A copy of the transcript frame is passed in, so `transcription_df` itself is left untouched.
topic_seg_preds_df, pred_topic_idxs = topic_segmentation_trainer.get_preds(
    inf_learn, transcription_df.copy(), **topic_segmentation_get_preds_kwargs
)

# Release the learner. Garbage is collected *before* the CUDA cache is emptied: objects that are
# only kept alive by reference cycles still own their tensors until the collector runs, and
# `empty_cache()` can only return blocks that are no longer referenced.
del inf_learn
gc.collect()
torch.cuda.empty_cache()

# Show how many topic starts were predicted, the first few start indices, and a preview of the frame.
print("# predicted topics: ", len(pred_topic_idxs))
print(pred_topic_idxs[:10])
topic_seg_preds_df.head()

# predicted topics:  41
[0, 44, 133, 139, 151, 156, 161, 166, 169, 177]


Unnamed: 0,elapsed_seconds,timestamp,transcript,depth_score,threshold,pred_start
0,0.0,00:00:00.000,Let me make sure everything is as it should be.,0.006972,0.022045,True
1,3.56,00:00:03.560,I'm always fascinated by the fact that people are waiting.,0.004309,0.022045,False
2,6.72,00:00:06.720,It's like it's it's so surprising that people are like on here,0.0,0.022045,False
3,12.7,00:00:12.700,sometimes early for you.,0.003844,0.022045,False
4,14.12,00:00:14.120,It makes sense for my session.,0.005018,0.022045,False


## Summarization

In [7]:
# Collapse the sentence-level predictions into one row per predicted topic.
#
# A row whose index label appears in `pred_topic_idxs` opens a new topic, so the running count of
# such rows (minus one) is the zero-based topic number of every sentence. Sentences that precede
# the first predicted start get -1. This replaces a per-topic `.loc[start:end]` loop whose
# end-inclusive label slice wrote every boundary row twice.
# Assumes `pred_topic_idxs` holds unique, ascending index labels of `topic_seg_preds_df`.
topic_start_mask = topic_seg_preds_df.index.isin(pred_topic_idxs)

summarization_inf_df = (
    # `assign` returns a new frame, so `topic_seg_preds_df` is not modified.
    topic_seg_preds_df.assign(topic_num=topic_start_mask.cumsum() - 1)
    .groupby("topic_num", as_index=False)
    .agg(
        # every sentence offset that belongs to the topic
        elapsed_seconds=("elapsed_seconds", list),
        # timestamp of the topic's first sentence
        timestamp=("timestamp", lambda col: col.iloc[0]),
        # the topic's sentences joined into a single passage
        transcript=("transcript", lambda col: " ".join(map(str, col))),
    )
)

summarization_inf_df.head()

Unnamed: 0,topic_num,elapsed_seconds,timestamp,transcript
0,0,"[0.0, 3.56, 6.72, 12.7, 14.12, 15.24, 22.72, 23.2, 25.08, 25.68, 26.96, 27.8, 28.8, 30.28, 32.04, 36.6, 39.12, 42.4, 44.96, 47.44, 51.92, 54.68, 60.64, 63.28, 66.76, 70.56, 71.24, 72.8, 73.56, 74.0, 75.16, 78.2, 80.16, 85.56, 93.75999999999999, 94.75999999999999, 106.16, 128.04, 129.24, 133.0, 140.68, 141.68, 149.88, 152.84]",00:00:00.000,"Let me make sure everything is as it should be. I'm always fascinated by the fact that people are waiting. It's like it's it's so surprising that people are like on here sometimes early for you. It makes sense for my session. I'm like why are people joining? Awesome. I believe we live. Welcome back. Everyone wait is back. He wasn't well. Last week. Now he's in awesome health. So that's I was happy to hear that. And I hope you are as well because he'll be teaching us how to translate stuff and how to summarize models. If I can learn this from him, I'll take his model and deploy it on my pod..."
1,1,"[153.92000000000002, 155.52, 162.44, 163.88, 169.44, 175.52, 176.6, 181.56, 184.76, 189.92, 192.4, 193.84, 196.68, 200.56, 206.68, 208.56, 211.76, 214.28, 220.44, 221.88, 224.92000000000002, 225.88, 229.52, 231.8, 235.28, 239.04, 242.64, 246.0, 249.12, 253.32, 255.96, 257.72, 259.56, 264.6, 270.12, 273.84000000000003, 279.08, 285.32, 287.0, 292.64, 293.08, 294.88, 295.88, 299.28, 301.52, 312.64, 314.68, 316.08, 318.0, 320.4, 324.4, 330.36, 333.68, 336.92, 337.76, 341.72, 345.92, 346.56, 350.76, 352.72, 355.88, 358.56, 359.48, 364.36, 364.88, 366.04, 366.52, 367.04, 368.44, 369.92, 373.48, ...",00:02:33.920,"In the meantime, I can talk about tricks. I've been trying to learn different tricks shared across NLP, computer vision world. I found out the augmentee library by the space group. Let's see if I can find that up. Here it is. So let's see link to it. I found that to be quite interesting. And a few things I've been trying to learn in the meantime is NLP augmentations. As you can see, I learned a cool trick called pack translation. I'm sure everyone knows of it. But when you're trying to perform augmentation, you can translate your original text to foreign language, translate it back. And ju..."
2,2,"[430.4, 434.04, 438.68, 442.0, 444.52, 447.88]",00:07:10.400,"And there's minor differences between building a summarization versus a translation model, especially in terms of the metrics that you're going to want to use. So we're going to kind of go through our seven steps that we've used for looking at question answering task, and also looking at the token classification task."
3,3,"[454.2, 457.08, 462.92, 467.6, 471.68, 475.24, 476.48, 479.64, 480.8, 484.44, 486.96, 489.08]",00:07:34.200,"And along the way, we'll talk about a little bit deeper into model selection and also metrics selection. So as you know, when you're actually building a model, the loss is kind of the models way, the models metric, that it understands in terms of improving the quality of your weights. But from a human perspective, what we really care about is metrics. And so regardless of what you're doing, and in particular, when we look at summarization and translation, the metrics that we're going to want to use are going to be different for these tasks."
4,4,"[492.88, 495.56, 498.68, 502.6, 506.68]",00:08:12.880,"And we want to make sure that we're using metrics that are in line with whatever our objectives are, and also with the data set that we're using. So we'll take a look at that in a little bit more detail. And also, I have a note here on this particular slide"


In [8]:
# Size of the per-topic frame as (rows, columns).
summarization_inf_df.shape

(41, 4)

In [7]:
# TODO: Kurian to add his magic here after he adds his models to /models ...

In [11]:
%%time
headline_trainer = summarization.SummarizationModelTrainer(
    task="inference", experiment_name="headline_summarization", train_config=summarization.HeadlineSummarizationConfig
)
inf_learn = headline_trainer.load_learner_or_model(headline_summarization_learner_fpath, device="cpu")
headling_preds_df = headline_trainer.get_preds(inf_learn, summarization_inf_df)

CPU times: user 30min 4s, sys: 4min 43s, total: 34min 47s
Wall time: 29.4 s


In [13]:
# Preview the first rows of the headline predictions.
headling_preds_df.head()

Unnamed: 0,topic_num,elapsed_seconds,timestamp,transcript,topic_prediction
0,0,"[0.0, 3.56, 6.72, 12.7, 14.12, 15.24, 22.72, 23.2, 25.08, 25.68, 26.96, 27.8, 28.8, 30.28, 32.04, 36.6, 39.12, 42.4, 44.96, 47.44, 51.92, 54.68, 60.64, 63.28, 66.76, 70.56, 71.24, 72.8, 73.56, 74.0, 75.16, 78.2, 80.16, 85.56, 93.75999999999999, 94.75999999999999, 106.16, 128.04, 129.24, 133.0, 140.68, 141.68, 149.88, 152.84]",00:00:00.000,"Let me make sure everything is as it should be. I'm always fascinated by the fact that people are waiting. It's like it's it's so surprising that people are like on here sometimes early for you. It makes sense for my session. I'm like why are people joining? Awesome. I believe we live. Welcome back. Everyone wait is back. He wasn't well. Last week. Now he's in awesome health. So that's I was happy to hear that. And I hope you are as well because he'll be teaching us how to translate stuff and how to summarize models. If I can learn this from him, I'll take his model and deploy it on my pod...",This week'
1,1,"[153.92000000000002, 155.52, 162.44, 163.88, 169.44, 175.52, 176.6, 181.56, 184.76, 189.92, 192.4, 193.84, 196.68, 200.56, 206.68, 208.56, 211.76, 214.28, 220.44, 221.88, 224.92000000000002, 225.88, 229.52, 231.8, 235.28, 239.04, 242.64, 246.0, 249.12, 253.32, 255.96, 257.72, 259.56, 264.6, 270.12, 273.84000000000003, 279.08, 285.32, 287.0, 292.64, 293.08, 294.88, 295.88, 299.28, 301.52, 312.64, 314.68, 316.08, 318.0, 320.4, 324.4, 330.36, 333.68, 336.92, 337.76, 341.72, 345.92, 346.56, 350.76, 352.72, 355.88, 358.56, 359.48, 364.36, 364.88, 366.04, 366.52, 367.04, 368.44, 369.92, 373.48, ...",00:02:33.920,"In the meantime, I can talk about tricks. I've been trying to learn different tricks shared across NLP, computer vision world. I found out the augmentee library by the space group. Let's see if I can find that up. Here it is. So let's see link to it. I found that to be quite interesting. And a few things I've been trying to learn in the meantime is NLP augmentations. As you can see, I learned a cool trick called pack translation. I'm sure everyone knows of it. But when you're trying to perform augmentation, you can translate your original text to foreign language, translate it back. And ju...",This is the
2,2,"[430.4, 434.04, 438.68, 442.0, 444.52, 447.88]",00:07:10.400,"And there's minor differences between building a summarization versus a translation model, especially in terms of the metrics that you're going to want to use. So we're going to kind of go through our seven steps that we've used for looking at question answering task, and also looking at the token classification task.",We've
3,3,"[454.2, 457.08, 462.92, 467.6, 471.68, 475.24, 476.48, 479.64, 480.8, 484.44, 486.96, 489.08]",00:07:34.200,"And along the way, we'll talk about a little bit deeper into model selection and also metrics selection. So as you know, when you're actually building a model, the loss is kind of the models way, the models metric, that it understands in terms of improving the quality of your weights. But from a human perspective, what we really care about is metrics. And so regardless of what you're doing, and in particular, when we look at summarization and translation, the metrics that we're going to want to use are going to be different for these tasks.",Models and metrics
4,4,"[492.88, 495.56, 498.68, 502.6, 506.68]",00:08:12.880,"And we want to make sure that we're using metrics that are in line with whatever our objectives are, and also with the data set that we're using. So we'll take a look at that in a little bit more detail. And also, I have a note here on this particular slide",metrics in line


In [18]:
%%time
content_trainer = summarization.SummarizationModelTrainer(
    task="inference", experiment_name="content_summarization", train_config=summarization.ContentSummarizationConfig
)
inf_learn = content_trainer.load_learner_or_model(content_summarization_learner_fpath, device="cpu")
content_preds_df = content_trainer.get_preds(inf_learn, headling_preds_df)

CPU times: user 1h 57min, sys: 7min 20s, total: 2h 4min 20s
Wall time: 1min 40s


In [19]:
# Preview the first rows of the content summarization predictions.
content_preds_df.head()

Unnamed: 0,topic_num,elapsed_seconds,timestamp,transcript,topic_prediction,content_highlights
0,0,"[0.0, 3.56, 6.72, 12.7, 14.12, 15.24, 22.72, 23.2, 25.08, 25.68, 26.96, 27.8, 28.8, 30.28, 32.04, 36.6, 39.12, 42.4, 44.96, 47.44, 51.92, 54.68, 60.64, 63.28, 66.76, 70.56, 71.24, 72.8, 73.56, 74.0, 75.16, 78.2, 80.16, 85.56, 93.75999999999999, 94.75999999999999, 106.16, 128.04, 129.24, 133.0, 140.68, 141.68, 149.88, 152.84]",00:00:00.000,"Let me make sure everything is as it should be. I'm always fascinated by the fact that people are waiting. It's like it's it's so surprising that people are like on here sometimes early for you. It makes sense for my session. I'm like why are people joining? Awesome. I believe we live. Welcome back. Everyone wait is back. He wasn't well. Last week. Now he's in awesome health. So that's I was happy to hear that. And I hope you are as well because he'll be teaching us how to translate stuff and how to summarize models. If I can learn this from him, I'll take his model and deploy it on my pod...",This week',This week's show is the second day of the show . I'm excited to learn how to translate stuff and summarize models . I'll take his model and deploy it on my podcast .
1,1,"[153.92000000000002, 155.52, 162.44, 163.88, 169.44, 175.52, 176.6, 181.56, 184.76, 189.92, 192.4, 193.84, 196.68, 200.56, 206.68, 208.56, 211.76, 214.28, 220.44, 221.88, 224.92000000000002, 225.88, 229.52, 231.8, 235.28, 239.04, 242.64, 246.0, 249.12, 253.32, 255.96, 257.72, 259.56, 264.6, 270.12, 273.84000000000003, 279.08, 285.32, 287.0, 292.64, 293.08, 294.88, 295.88, 299.28, 301.52, 312.64, 314.68, 316.08, 318.0, 320.4, 324.4, 330.36, 333.68, 336.92, 337.76, 341.72, 345.92, 346.56, 350.76, 352.72, 355.88, 358.56, 359.48, 364.36, 364.88, 366.04, 366.52, 367.04, 368.44, 369.92, 373.48, ...",00:02:33.920,"In the meantime, I can talk about tricks. I've been trying to learn different tricks shared across NLP, computer vision world. I found out the augmentee library by the space group. Let's see if I can find that up. Here it is. So let's see link to it. I found that to be quite interesting. And a few things I've been trying to learn in the meantime is NLP augmentations. As you can see, I learned a cool trick called pack translation. I'm sure everyone knows of it. But when you're trying to perform augmentation, you can translate your original text to foreign language, translate it back. And ju...",This is the,"This is the fifth session of our walkthrough of the Part 2, the Hugging Faced Course . This is the fifth session of our walkthrough of the Part 2, the Hugging Faced Course ."
2,2,"[430.4, 434.04, 438.68, 442.0, 444.52, 447.88]",00:07:10.400,"And there's minor differences between building a summarization versus a translation model, especially in terms of the metrics that you're going to want to use. So we're going to kind of go through our seven steps that we've used for looking at question answering task, and also looking at the token classification task.",We've,"We're going to kind of go through the seven steps that we've used to look at question answering task, and also the token classification task ."
3,3,"[454.2, 457.08, 462.92, 467.6, 471.68, 475.24, 476.48, 479.64, 480.8, 484.44, 486.96, 489.08]",00:07:34.200,"And along the way, we'll talk about a little bit deeper into model selection and also metrics selection. So as you know, when you're actually building a model, the loss is kind of the models way, the models metric, that it understands in terms of improving the quality of your weights. But from a human perspective, what we really care about is metrics. And so regardless of what you're doing, and in particular, when we look at summarization and translation, the metrics that we're going to want to use are going to be different for these tasks.",Models and metrics,"The loss is kind of the models way, the models metric, that it understands in terms of improving the quality of your weights . When we look at summarization and translation, the metrics that we're going to want to use are going to be different for these tasks ."
4,4,"[492.88, 495.56, 498.68, 502.6, 506.68]",00:08:12.880,"And we want to make sure that we're using metrics that are in line with whatever our objectives are, and also with the data set that we're using. So we'll take a look at that in a little bit more detail. And also, I have a note here on this particular slide",metrics in line,"We want to make sure that we're using metrics that are in line with whatever our objectives are, and also with the data set that we're using ."
