In [None]:
# |default_exp preprocessing
# |default_cls_lvl 3

In [None]:
# |hide
%reload_ext autoreload
%autoreload 2

# preprocessing

Classes and methods for turning our raw datasets into appropriately formatted training data for both the segmentation and summarization tasks.

In [None]:
# |export
from __future__ import annotations

import argparse, datetime
from pathlib import Path

import pandas as pd

from course_copilot import utils

In [None]:
# | hide
import pdb

from IPython.display import display

from fastcore.test import *
from nbdev.showdoc import show_doc

## Utility methods

In [None]:
# |export
def convert_duration_to_seconds(
    # A duration/interval formatted as "hh:mm:ss" ("mm:ss" and "ss" are also accepted)
    v: str,
) -> float:  # The total number of seconds (a float, since the seconds component may be fractional)
    """Convert a colon-separated duration string into its total number of seconds.

    Raises `ValueError` if `v` has more than three components or a component is not numeric.
    """
    parts = v.strip().split(":")
    if len(parts) > 3:
        raise ValueError(f"Expected a duration formatted as 'hh:mm:ss', got {v!r}")

    # left-pad the missing leading components so "mm:ss" / "ss" are read as 0 hours / 0 minutes
    hrs, mins, secs = ["0"] * (3 - len(parts)) + parts
    return (60 * 60 * int(hrs)) + (60 * int(mins)) + float(secs)

In [None]:
# |export
def build_train_df(
    # The path to the data dir
    data_path: str | Path = "../data/",
) -> pd.DataFrame:  # A preprocessed DataFrame suitable for both segmentation and summarization training
    """Load the raw topics/transcripts workbook and group each transcript line under the topic it falls in.

    Each returned row is one (course_title, lesson_num, start_seconds, topic) combination whose
    `transcript` column holds the list of transcript lines with a timestamp inside that topic's
    [start, end) window. Rows are sorted by course, lesson and topic start, with a fresh 0..n-1 index.
    """
    sheets_d = pd.read_excel(
        Path(data_path) / "raw/fsdl_2022_project_transcripts.xlsx", sheet_name=["lesson_topics", "lesson_transcripts"]
    )
    # look the sheets up by name rather than relying on the ordering of the returned dict
    topics_df = sheets_d["lesson_topics"].drop(columns="video_url")
    transcripts_df = sheets_d["lesson_transcripts"].drop(columns="video_url")

    topics_df["timestamp"] = topics_df["timestamp"].astype(str)
    transcripts_df["timestamp"] = transcripts_df["timestamp"].astype(str)

    # define the start/end boundaries (in seconds) for each topic in each lesson
    topics_df["start_seconds"] = topics_df["timestamp"].apply(convert_duration_to_seconds)

    # `shift(-1)` below takes the *next row's* start as this topic's end, so the topics must be
    # ordered by start time within each lesson for the boundaries to be meaningful
    lesson_keys = ["course_title", "lesson_num"]
    topics_df = topics_df.sort_values(by=lesson_keys + ["start_seconds"], kind="stable")

    # the last topic of a lesson has no successor: leave it open-ended (infinity) so that no
    # transcript line is dropped, however long the lesson is
    topics_df["end_seconds"] = topics_df.groupby(by=lesson_keys)["start_seconds"].shift(-1, fill_value=float("inf"))

    # define the total number of elapsed seconds at each timestamp in the transcripts dataset
    transcripts_df["elapsed_seconds"] = transcripts_df["timestamp"].apply(convert_duration_to_seconds)

    # build our training data: pair every topic with every transcript line of the same lesson ...
    merged_df = topics_df[["course_title", "lesson_num", "topic", "start_seconds", "end_seconds"]].merge(
        transcripts_df, on=lesson_keys
    )

    # ... then keep only the pairs where the transcript lies in between the start/end of the topic
    in_topic = (merged_df.elapsed_seconds >= merged_df.start_seconds) & (
        merged_df.elapsed_seconds < merged_df.end_seconds
    )
    merged_df = merged_df[in_topic]

    # for both segmentation and summarization tasks, we'll need to group the transcripts by course + lesson + topic
    train_df = (
        merged_df[["course_title", "lesson_num", "topic", "transcript", "start_seconds"]]
        .groupby(by=["course_title", "lesson_num", "start_seconds", "topic"])
        .agg(list)
        .reset_index()
        .sort_values(by=["course_title", "lesson_num", "start_seconds"])
        # keep index labels equal to row positions for downstream positional consumers
        .reset_index(drop=True)
    )

    return train_df

In [None]:
# |export
def build_segmentation_train_df(
    # The preprocessed training DataFrame (one row per topic; `transcript` holds that topic's list of sequences)
    train_df: pd.DataFrame,
) -> pd.DataFrame:  # A preprocessed DataFrame for segmentation training
    """
    For segmentation, we want to create a dataset of seq, seq +1 examples, but also include the ability to gather negative samples
    from either sequences in that topic or not

    One output row is produced per sequence, carrying its previous/next sequence, whether it ends
    its topic (`is_topic_end`), the first sequence of the following topic when it does
    (`next_topic_begin_seq`), and the remaining sequences of the same topic (`other_topic_seqs`).
    """
    columns = [
        "course_title",
        "lesson_num",
        "topic",
        "seq",
        "prev_seq",
        "next_seq",
        "is_topic_end",
        "next_topic_begin_seq",
        "other_topic_seqs",
    ]

    # Materialize the rows so neighbouring topics are addressed by *position*: the index labels of
    # `train_df` are not guaranteed to be 0..n-1 (e.g. after filtering or sorting).
    topic_rows = [row for _, row in train_df.iterrows()]
    n_topics = len(topic_rows)

    seg_examples = []

    for topic_pos, example in enumerate(topic_rows):
        # cast once so that every emitted text field is consistently a `str`
        seqs = [str(seq) for seq in example["transcript"]]

        # NOTE: the following row is used as "the next topic" even when it belongs to a different
        # lesson/course; no lesson-boundary check is made here.
        is_last_example = topic_pos == n_topics - 1
        following_seqs = [] if is_last_example else topic_rows[topic_pos + 1]["transcript"]

        for seq_idx, seq in enumerate(seqs):
            is_last_seq = seq_idx == len(seqs) - 1

            if not is_last_seq:
                next_seq = seqs[seq_idx + 1]
                next_topic_begin_seq = None
            elif len(following_seqs) > 0:
                # topic boundary: the next sequence is the first one of the following topic
                next_seq = next_topic_begin_seq = str(following_seqs[0])
            else:
                # very last sequence of the dataset (or the following topic has no sequences)
                next_seq = next_topic_begin_seq = None

            prev_seq = "xxBEGIN_TOPICxx" if seq_idx == 0 else seqs[seq_idx - 1]

            seg_examples.append(
                {
                    "course_title": example["course_title"],
                    "lesson_num": example["lesson_num"],
                    "topic": example["topic"],
                    "seq": seq,
                    "prev_seq": prev_seq,
                    "next_seq": next_seq,
                    "is_topic_end": is_last_seq,
                    "next_topic_begin_seq": next_topic_begin_seq,
                    # same-topic sequences other than the current one and its immediate successor
                    "other_topic_seqs": [txt for i, txt in enumerate(seqs) if i != seq_idx and i != seq_idx + 1],
                }
            )

    # pass the columns explicitly so an empty input still yields the expected schema
    return pd.DataFrame(seg_examples, columns=columns)

In [None]:
# |export
def build_summarization_train_df(
    # The preprocessed training DataFrame
    train_df: pd.DataFrame,
) -> pd.DataFrame:  # A preprocessed DataFrame for summarization training
    """Collapse each topic's list of sequences into one space-separated string, which is then used to predict the topic.

    The input DataFrame is left untouched; a copy with the rewritten `transcript` column is returned.
    """

    def _as_document(seqs) -> str:
        # every sequence is cast to `str` before being joined with single spaces
        return " ".join(map(str, seqs))

    result_df = train_df.copy()
    joined_transcripts = result_df["transcript"].apply(_as_document)
    result_df["transcript"] = joined_transcripts

    return result_df

## Preprocessing

In [None]:
# |export
def preprocess_data(
    # What dataset do we want to preprocess in the data/raw folder
    ds: str = "train",
    # The path to the data folder
    data_path: str | Path = "../data/",
    # Determines whether or not we return the cleaned Dataframes
    return_file: bool = True,
    # Determines whether or not we save the cleaned Dataframes to data/clean
    save_file: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame] | None:  # (segmentation, summarization) DataFrames if `return_file`, else None
    """Build the segmentation and summarization DataFrames and optionally save and/or return them.

    NOTE: `ds` currently only selects the `is_train` branch (a no-op for now) and the names of the
    saved CSV files; the data itself always comes from `build_train_df(data_path)`.
    """
    is_train = ds == "train"

    train_df = build_train_df(data_path)
    segmentation_train_df = build_segmentation_train_df(train_df)
    summarization_train_df = build_summarization_train_df(train_df)

    # preprocessing that should only be applied to the training data
    if is_train:
        pass  # TODO: nothing train-specific is applied yet

    # save/return the preprocessed data
    if save_file:
        clean_dir = Path(data_path) / "clean"
        # `parents=True` so a missing intermediate directory doesn't abort the save
        clean_dir.mkdir(parents=True, exist_ok=True)
        segmentation_train_df.to_csv(clean_dir / f"segmentation_{ds}.csv", index=False)
        summarization_train_df.to_csv(clean_dir / f"summarization_{ds}.csv", index=False)

    if return_file:
        return segmentation_train_df, summarization_train_df

    return None

In [None]:
# |eval: false
# Build both training DataFrames and, because `save_file=True`, also write them to the "clean" folder as CSVs
segmentation_train_df, summarization_train_df = preprocess_data("train", save_file=True)

# Show the row counts of the (segmentation, summarization) DataFrames
len(segmentation_train_df), len(summarization_train_df)

(25383, 597)

In [None]:
# |eval: false
# Preview the first segmentation examples (one row per transcript sequence)
segmentation_train_df.head()

Unnamed: 0,course_title,lesson_num,topic,seq,prev_seq,next_seq,is_topic_end,next_topic_begin_seq,other_topic_seqs
0,C-Squared Podcast,1,Intro,[Music] welcome everybody to episode one of a,xxBEGIN_TOPICxx,chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up,False,,"[christian well not so much fabi uh it's first of all great um to finally start a, podcast the chess podcast i know that um there's a lot of podcasts out there but, i wanted to bring our own tune to the mix and i think uh yeah i'm, excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's, been a while at your home it's good to be here it's my first time in uh visiting here and uh, yeah it's been an interesting few months played a lot of chess which is pretty cool but, also a bit difficult at times my home uh here we are not going to m..."
1,C-Squared Podcast,1,Intro,chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up,[Music] welcome everybody to episode one of a,christian well not so much fabi uh it's first of all great um to finally start a,False,,"[[Music] welcome everybody to episode one of a, podcast the chess podcast i know that um there's a lot of podcasts out there but, i wanted to bring our own tune to the mix and i think uh yeah i'm, excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's, been a while at your home it's good to be here it's my first time in uh visiting here and uh, yeah it's been an interesting few months played a lot of chess which is pretty cool but, also a bit difficult at times my home uh here we are not going to mention the location because, those ..."
2,C-Squared Podcast,1,Intro,christian well not so much fabi uh it's first of all great um to finally start a,chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up,podcast the chess podcast i know that um there's a lot of podcasts out there but,False,,"[[Music] welcome everybody to episode one of a, chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up, i wanted to bring our own tune to the mix and i think uh yeah i'm, excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's, been a while at your home it's good to be here it's my first time in uh visiting here and uh, yeah it's been an interesting few months played a lot of chess which is pretty cool but, also a bit difficult at times my home uh here we are not going to mention the location beca..."
3,C-Squared Podcast,1,Intro,podcast the chess podcast i know that um there's a lot of podcasts out there but,christian well not so much fabi uh it's first of all great um to finally start a,i wanted to bring our own tune to the mix and i think uh yeah i'm,False,,"[[Music] welcome everybody to episode one of a, chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up, christian well not so much fabi uh it's first of all great um to finally start a, excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's, been a while at your home it's good to be here it's my first time in uh visiting here and uh, yeah it's been an interesting few months played a lot of chess which is pretty cool but, also a bit difficult at times my home uh here we are not going to mention th..."
4,C-Squared Podcast,1,Intro,i wanted to bring our own tune to the mix and i think uh yeah i'm,podcast the chess podcast i know that um there's a lot of podcasts out there but,excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's,False,,"[[Music] welcome everybody to episode one of a, chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up, christian well not so much fabi uh it's first of all great um to finally start a, podcast the chess podcast i know that um there's a lot of podcasts out there but, been a while at your home it's good to be here it's my first time in uh visiting here and uh, yeah it's been an interesting few months played a lot of chess which is pretty cool but, also a bit difficult at times my home uh here we are not going to mention the location because, those uh cra..."


In [None]:
# |eval: false
# Preview the first summarization examples (one row per topic, transcript joined into a single string)
summarization_train_df.head()

Unnamed: 0,course_title,lesson_num,start_seconds,topic,transcript
0,C-Squared Podcast,1,0.0,Intro,[Music] welcome everybody to episode one of a chess themed podcast with myself christian kirilla and i'm fighting on caruana so what's up christian well not so much fabi uh it's first of all great um to finally start a podcast the chess podcast i know that um there's a lot of podcasts out there but i wanted to bring our own tune to the mix and i think uh yeah i'm excited about that so that's uh the first thing how about yourself fabian well i'm back in the states after it's been a while at your home it's good to be here it's my first time in uh visiting here and uh yeah it's been an intere...
1,C-Squared Podcast,1,137.0,Candidates 2018,camps look like in general yeah well you mentioned the 2018 cycle uh where we worked together we started with the training before the candidates and for me it's interesting because i've i've played a lot of these candidates tournaments and i'm always doing it a bit differently trying different things trying to improve it but sometimes it goes less or more successfully you never know what will work out i think what we did in 2018 not just for the candidates but also for the world championship because i qualified for that i think what we did then was extremely successful um we we arranged it...
2,C-Squared Podcast,1,464.0,Candidates training,going in the candidates like how was the experience yeah i think the preparation was pretty serious it included a bunch of uh camps and preparation devoted to players as i assume i think everyone has the same sort of general approach which is to think about their openings their strategy look at the opponents try to get in shape make sure that you're not you know rusty or blundering things or hallucinating variations uh but there's a lot of nerves and i i felt a lot of nerves before the tournament and i think possibly i you know overworked over trained a bit because it was yeah it was like ...
3,C-Squared Podcast,1,610.0,Playing for 2nd place,were you just like focused on grabbing first well i was only focused on first but of course there were always these thoughts that well maybe second is enough but you can't play for second like let's say once i had achieved plus three in the tournament and john was plus four and i tried to go and go into like full like risk reverse mode which is still difficult to do but let's say i had gone that mode and and achieved it and like finished second with like plus three and john got plus five uh and then like magnus says well i'm going to play right then you also feel kind of stupid you know li...
4,C-Squared Podcast,1,916.0,Magnus' WC decision,know you can't uh you can't tell him you have to do something i i guess let me rephrase that fair to let you guys play the tournament first and then tell you the decision well i think he said it in a strange way which was that i'll play against alireza which to me is strange because if you don't want to play world championship match i fully understand you know but did he say that did he actually name him yeah that's kind of what he said um yeah he more he like he didn't say definitively like i won't play against anyone but he was like i probably won't play unless it's frozen right and yeah...


In [None]:
# TODO
# segmentation_test_df,  summarization_test_df = preprocess_data("test", save_file=True)

In [None]:
# | export
# |eval: false
if __name__ == "__main__" and utils.run_env == "script":
    # Command-line entry point: only runs when executed as a script and `utils.run_env` is "script".
    cli_parser = argparse.ArgumentParser()

    # both options are plain strings; the pairs below are (flag, default value)
    for flag, default_value in (("--ds", "train"), ("--data_path", "./data")):
        cli_parser.add_argument(flag, type=str, default=default_value)

    cli_args = cli_parser.parse_args()

    # the parsed namespace holds exactly `ds` and `data_path`; the results are written to disk
    # instead of being returned
    preprocess_data(**vars(cli_args), return_file=False, save_file=True)

## Export -

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()