In [1]:
# Reload edited modules automatically so changes to imported code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import gc
from pathlib import Path

import torch

from course_copilot import transcription, topic_segmentation, summarization

## Configuration

In [3]:
# What to process and which device the models are asked to run on.
youtube_id = "Jsz4E2iNXUA"
device = "cuda"

# Whisper settings: checkpoint name plus the paths handed to the transcription helpers.
whisper_model = "base"
whisper_models_fpath = Path("../../transcription/models")
audio_files_fpath = Path("../../transcription/audio_files")
transcription_fpath = Path("../../transcription/transcriptions")

# Learner files for the downstream stages.
topic_segmentation_learner_fpath = "../../models/topic_segmentation_deberta_v3_small.pkl"
# NOTE(review): both summarization paths still hold the literal "{exported_learner.pkl}"
# placeholder and must be pointed at real files before they can be loaded.
content_summarization_learner_fpath = "../../models/{exported_learner.pkl}"
headline_summarization_learner_fpath = "../../models/{exported_learner.pkl}"

## Transcription

In [4]:
# Fetch the audio for `youtube_id`, using the configured audio path.
audio_fpath = transcription.fetch_youtube_audio(youtube_id, audio_files_fpath=audio_files_fpath)

# Bind the result to its own name. Rebinding `transcription_fpath` here would overwrite the
# value set in the configuration cell, so re-running this cell alone would pass the previous
# result back in as the `transcription_fpath` argument.
transcription_file_fpath = transcription.fetch_transcription(
    audio_fpath=audio_fpath,
    transcription_fpath=transcription_fpath,
    model_fpath=whisper_models_fpath,
    # NOTE(review): keyword spelling kept exactly as the call was written; confirm it
    # matches the parameter name declared by `fetch_transcription`.
    model_checkptoint=whisper_model,
    device=device,
)

# Load the transcription into a DataFrame and preview the first rows.
transcription_df = transcription.transcription_to_df(transcription_file_fpath)
transcription_df.head()

Unnamed: 0,timestamp,transcript
0,00:00:00.000,Let me make sure everything is as it should be.
1,00:00:03.560,I'm always fascinated by the fact that people are waiting.
2,00:00:06.720,It's like it's it's so surprising that people are like on here
3,00:00:12.700,sometimes early for you.
4,00:00:14.120,It makes sense for my session.


## Topic Segmentation

In [5]:
# The trainer object is only used here to load a learner and run predictions.
topic_segmentation_trainer = topic_segmentation.TopicSegmentationModelTrainer(
    experiment_name="deberta_v3_small", train_config=topic_segmentation.TopicSegmentationConfig
)

# Training is skipped in this notebook; the learner stored at
# `topic_segmentation_learner_fpath` is loaded instead.
inf_learn = topic_segmentation_trainer.load_learner_or_model(
    model_learner_fpath=topic_segmentation_learner_fpath, device=device
)

# Pass a copy so `transcription_df` cannot be modified by the prediction step.
topic_seg_preds_df, pred_topic_idxs = topic_segmentation_trainer.get_preds(inf_learn, transcription_df.copy())

# Cleanup resources: drop our reference, collect garbage first so that memory still held by
# unreachable objects is returned to the allocator, and only then release the cached CUDA blocks.
del inf_learn
gc.collect()
torch.cuda.empty_cache()

# shows final results
print(pred_topic_idxs[:10])
topic_seg_preds_df.head()

[0, 11, 16, 21, 25, 37, 40, 44, 56, 89]


Unnamed: 0,timestamp,transcript,depth_score,threshold,pred_start
0,00:00:00.000,Let me make sure everything is as it should be.,0.006972,0.014812,True
1,00:00:03.560,I'm always fascinated by the fact that people are waiting.,0.004309,0.014812,False
2,00:00:06.720,It's like it's it's so surprising that people are like on here,0.0,0.014812,False
3,00:00:12.700,sometimes early for you.,0.003844,0.014812,False
4,00:00:14.120,It makes sense for my session.,0.005018,0.014812,False


## Summarization

In [6]:
# Work on a copy so the topic-segmentation predictions stay untouched when this cell is re-run.
summarization_inf_df = topic_seg_preds_df.copy()

# -1 marks rows not covered by any predicted topic (i.e. rows before the first predicted start).
summarization_inf_df["topic_num"] = -1

# Pair each predicted topic start with the start of the following topic. The last topic has
# no successor, so its upper bound is None and the slice runs to the final row.
next_topic_idxs = list(pred_topic_idxs[1:]) + [None]
for topic_num, (start_idx, next_start_idx) in enumerate(zip(pred_topic_idxs, next_topic_idxs)):
    # `.loc` slices include both end labels, so the first row of the following topic is also
    # tagged here and then overwritten on the next iteration.
    summarization_inf_df.loc[start_idx:next_start_idx, "topic_num"] = topic_num

# Collapse to one row per topic: keep the first timestamp and join the transcript pieces with
# spaces. Only the columns needed downstream are aggregated, so nothing has to be dropped.
summarization_inf_df = (
    summarization_inf_df.groupby(by="topic_num")
    .agg(
        timestamp=("timestamp", lambda v: v.iloc[0]),
        transcript=("transcript", lambda v: " ".join(str(seq) for seq in v)),
    )
    .reset_index()
)

summarization_inf_df.head()

Unnamed: 0,topic_num,timestamp,transcript
0,0,00:00:00.000,Let me make sure everything is as it should be. I'm always fascinated by the fact that people are waiting. It's like it's it's so surprising that people are like on here sometimes early for you. It makes sense for my session. I'm like why are people joining? Awesome. I believe we live. Welcome back. Everyone wait is back. He wasn't well.
1,1,00:00:27.800,Last week. Now he's in awesome health. So that's I was happy to hear that. And I hope you are as well because he'll be teaching us how to translate stuff and how to summarize models.
2,2,00:00:39.120,"If I can learn this from him, I'll take his model and deploy it on my podcast and give you all summaries that you have been asking for. And I haven't easily produced. So I'm excited to learn that from me then that over to you. Well, guess what I just tried sharing my screen."
3,3,00:00:54.680,"And it says Chrome has lost permissions to sharing our screen recording. So let me fix that real quick. And Sonia, I'm going to let you keep talking about whatever you want to talk about why I do the apologies."
4,4,00:01:10.560,"No problem. It happens with updates sometimes. Right? Yeah. Of course, I didn't check. I just assumed we'd all be good to go and we're not. So see here. Has anyone done the homework or if it had any blog posts in the meantime? Matthew, great to see you. All right. We'll get started in a minute or two while we figure this out. Anyone who joined the previous session, this is better than getting zoom-bound."


In [7]:
# TODO(Kurian): add the summarization step here once the summarization learners have been added to /models.