In [1]:
# (Re)load IPython's autoreload extension; mode 2 re-imports modules before each cell
# executes, so edits to local packages are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import gc
from pathlib import Path

import torch

from course_copilot import transcription, topic_segmentation, summarization

## Configuration

In [14]:
# --- Input video and compute device ---
youtube_id = "Jsz4E2iNXUA"
device = "cuda"

# --- Transcription: audio, transcript and Whisper model locations (all under one root) ---
audio_files_fpath, transcription_fpath, whisper_models_fpath = (
    Path("../../transcription") / leaf for leaf in ("audio_files", "transcriptions", "models")
)
whisper_model = "base"

# --- Exported learners ---
topic_segmentation_learner_fpath = "../../models/topic_segmentation_deberta_v3_small.pkl"
# NOTE(review): both summarization paths still look like unresolved placeholders
# ("{exported_learner.pkl}") — confirm the real file names before using them.
headline_summarization_learner_fpath = content_summarization_learner_fpath = (
    "../../models/{exported_learner.pkl}"
)

# --- Topic segmentation inference options ---
# Smaller values give finer-grained segmentation (more topics predicted);
# larger values give coarser segmentation (fewer topics predicted).
topic_segmentation_get_preds_kwargs = dict(threshold_std_coeff=1.9)

## Transcription

In [4]:
# Fetch the audio for the configured YouTube video into `audio_files_fpath`.
audio_fpath = transcription.fetch_youtube_audio(youtube_id, audio_files_fpath=audio_files_fpath)

# Bind the result to its own name instead of rebinding `transcription_fpath`: that variable is
# the configured transcription location, and overwriting it made this cell non-idempotent
# (a second run would pass the previous result back in as `transcription_fpath`).
# The return value is presumably the path of the produced transcription — confirm.
transcript_file_fpath = transcription.fetch_transcription(
    audio_fpath=audio_fpath,
    transcription_fpath=transcription_fpath,
    model_fpath=whisper_models_fpath,
    model_checkptoint=whisper_model,  # keyword kept exactly as the original call spells it
    device=device,
)

# Load the transcription into a DataFrame and preview the first rows.
transcription_df = transcription.transcription_to_df(transcript_file_fpath)
transcription_df.head()

Unnamed: 0,timestamp,transcript
0,00:00:00.000,Let me make sure everything is as it should be.
1,00:00:03.560,I'm always fascinated by the fact that people are waiting.
2,00:00:06.720,It's like it's it's so surprising that people are like on here
3,00:00:12.700,sometimes early for you.
4,00:00:14.120,It makes sense for my session.


## Topic Segmentation

In [15]:
# Build the trainer object; in this notebook it is only used to load an exported learner
# and to run inference with it.
topic_segmentation_trainer = topic_segmentation.TopicSegmentationModelTrainer(
    experiment_name="deberta_v3_small", train_config=topic_segmentation.TopicSegmentationConfig
)
# Training is skipped here (presumably it would be `topic_segmentation_trainer.train()` —
# confirm); the learner exported at `topic_segmentation_learner_fpath` is loaded instead.
inf_learn = topic_segmentation_trainer.load_learner_or_model(
    model_learner_fpath=topic_segmentation_learner_fpath, device=device
)

# Hand get_preds a copy so `transcription_df` itself is not passed to it.
topic_seg_preds_df, pred_topic_idxs = topic_segmentation_trainer.get_preds(
    inf_learn, transcription_df.copy(), **topic_segmentation_get_preds_kwargs
)

# cleanup resources: drop our reference, run the garbage collector FIRST so objects kept alive
# only by reference cycles are actually freed, and only then ask the CUDA caching allocator
# to release its now-unused blocks (the reverse order leaves that memory cached).
del inf_learn
gc.collect()
torch.cuda.empty_cache()

# shows final results
print("# predicted topics: ", len(pred_topic_idxs))
print(pred_topic_idxs[:10])
topic_seg_preds_df.head()

# predicted topics:  41
[0, 44, 113, 129, 133, 145, 150, 155, 162, 170]


Unnamed: 0,timestamp,transcript,depth_score,threshold,pred_start
0,00:00:00.000,Let me make sure everything is as it should be.,0.006972,0.022118,True
1,00:00:03.560,I'm always fascinated by the fact that people are waiting.,0.004309,0.022118,False
2,00:00:06.720,It's like it's it's so surprising that people are like on here,0.0,0.022118,False
3,00:00:12.700,sometimes early for you.,0.003844,0.022118,False
4,00:00:14.120,It makes sense for my session.,0.005018,0.022118,False


## Summarization

In [16]:
# Label every transcript row with the topic it belongs to, then collapse each topic into a
# single row (first timestamp + space-joined transcript) for the summarization step.
summarization_inf_df = topic_seg_preds_df.copy()

summarization_inf_df["topic_num"] = -1  # rows before the first predicted start keep -1
n_topics = len(pred_topic_idxs)

for i, seg_idx in enumerate(pred_topic_idxs):
    # `.loc` label slices include BOTH endpoints, so the first row of the next topic is
    # briefly labelled `i` here and then overwritten with `i + 1` on the next iteration;
    # the last topic uses an open-ended slice (None) to reach the end of the frame.
    end_idx = pred_topic_idxs[i + 1] if i + 1 < n_topics else None
    summarization_inf_df.loc[seg_idx:end_idx, "topic_num"] = i

# Aggregate only the two columns that are kept, instead of turning every column into a list
# and dropping depth_score / threshold / pred_start afterwards with an in-place drop.
summarization_inf_df = (
    summarization_inf_df.groupby(by="topic_num")
    .agg(
        timestamp=("timestamp", lambda stamps: stamps.iloc[0]),
        transcript=("transcript", lambda seqs: " ".join(str(seq) for seq in seqs)),
    )
    .reset_index()
)

summarization_inf_df.head()

Unnamed: 0,topic_num,timestamp,transcript
0,0,00:00:00.000,"Let me make sure everything is as it should be. I'm always fascinated by the fact that people are waiting. It's like it's it's so surprising that people are like on here sometimes early for you. It makes sense for my session. I'm like why are people joining? Awesome. I believe we live. Welcome back. Everyone wait is back. He wasn't well. Last week. Now he's in awesome health. So that's I was happy to hear that. And I hope you are as well because he'll be teaching us how to translate stuff and how to summarize models. If I can learn this from him, I'll take his model and deploy it on my pod..."
1,1,00:02:34.040,"In the meantime, I can talk about tricks. I've been trying to learn different tricks shared across NLP, computer vision world. I found out the augmentee library by the Spacey group. Let's see if I can find that up. Here it is. So let's see link to it. I found that to be quite interesting. And a few things I've been trying to learn in the meantime is NLP augmentations. As you can see, I learned a cool trick called back translation. I'm sure everyone knows of it. But when you're trying to perform augmentation, you can translate your original text to foreign language, translate it back. And j..."
2,2,00:06:30.360,"but we're now looking at some of the major NLP tasks that you can build with using transformers. And then in particular, we're looking at how we can use FASTI to develop these models, train them, deploy them for inference. And their models that are supported in a library built called Blur. So we'll be looking today kind of really diving into the summarization and translation bits. And I combined them because, as you'll see, they're actually very similar. There's little intricacies with building these type of models. And there's minor differences between building a summarization versus a tr..."
3,3,00:07:22.040,"So we're going to kind of go through our seven steps that we've used for looking at question answering task and also looking at the token classification task. And along the way, we'll talk about,"
4,4,00:07:35.840,"get a little bit deeper into model selection and also metrics selection. So as you know, when you're actually building a model, the loss is kind of the models way, the models metric, that it understands in terms of improving the quality of your weights. But from a human perspective, what we really care about is metrics. And so regardless of what you're doing, and in particular, when we look at summarization and translation, the metrics that we're going to want to use are going to be different for these tasks."


In [8]:
# TODO: Kurian to add his magic here after he adds his models to /models ...