In [None]:
# Install the notebook's dependencies with the explicit %pip magic so the
# packages land in the kernel's own environment (a bare `pip install` relies on
# automagic, which only works while the cell is a single line).
# openai is pinned to 0.28 because get_response uses openai.ChatCompletion.
# natsort is imported by the next cell.
%pip install -q -U yt-dlp tiktoken openai==0.28 natsort



In [None]:
import tiktoken
import openai
from IPython.display import display, Markdown
import os
import glob
import json
from natsort import natsorted
import numpy as np
from collections import defaultdict

In [None]:
# youtube_dl is imported by the playlist-listing cell below.
%pip install -q youtube-dl




In [None]:
# YouTube playlist whose videos are listed by get_playlist_videos below.
playlist_url = "https://www.youtube.com/playlist?list=PLLssT5z_DsK_gyrQ_biidwvPYCRNGI3iv"

In [None]:
import youtube_dl

def get_playlist_videos(playlist_url):
    """Map every video ID of a playlist to its watch URL.

    The playlist is queried through ``youtube_dl.YoutubeDL.extract_info`` with
    ``download=False``, so nothing is downloaded by this function.

    Args:
        playlist_url (str): URL passed to ``extract_info``.

    Returns:
        dict: ``{video_id: "https://www.youtube.com/watch?v=<video_id>"}`` in
        the order the entries are reported; empty when there are no usable
        entries.
    """
    ydl_opts = {
        'quiet': True,
        'extract_flat': True,
        'force_generic_extractor': True,
    }
    base_url = "https://www.youtube.com/watch?v="

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        playlist_info = ydl.extract_info(playlist_url, download=False)

    # Guard against a missing result, a result without an 'entries' key and
    # unusable (None / id-less) entries instead of raising KeyError/TypeError.
    entries = (playlist_info or {}).get('entries') or []
    return {
        entry['id']: base_url + entry['id']
        for entry in entries
        if entry and entry.get('id')
    }

In [None]:
# Build the {video_id: watch_url} mapping for the playlist.
video_list_dict = get_playlist_videos(playlist_url)

In [None]:
# Display the mapping for a quick visual check.
video_list_dict

{'OVwEeSsSCHE': 'https://www.youtube.com/watch?v=OVwEeSsSCHE',
 'jNBYZbDWyQk': 'https://www.youtube.com/watch?v=jNBYZbDWyQk',
 'VA9niXgGOsQ': 'https://www.youtube.com/watch?v=VA9niXgGOsQ',
 'mnTJezQOIDU': 'https://www.youtube.com/watch?v=mnTJezQOIDU',
 'nrkpEx7tA2Y': 'https://www.youtube.com/watch?v=nrkpEx7tA2Y',
 '6cHupvcxA38': 'https://www.youtube.com/watch?v=6cHupvcxA38',
 '5tHN6Y70d5Y': 'https://www.youtube.com/watch?v=5tHN6Y70d5Y',
 'TNhgCkYDc8M': 'https://www.youtube.com/watch?v=TNhgCkYDc8M',
 '92hto4KwlGI': 'https://www.youtube.com/watch?v=92hto4KwlGI',
 '16qJwiH-FdE': 'https://www.youtube.com/watch?v=16qJwiH-FdE',
 '6gZjRI_gnGc': 'https://www.youtube.com/watch?v=6gZjRI_gnGc',
 'tafPPLVuB2s': 'https://www.youtube.com/watch?v=tafPPLVuB2s',
 'MVT-bHFqZ9o': 'https://www.youtube.com/watch?v=MVT-bHFqZ9o',
 'VCT1N0EsGj0': 'https://www.youtube.com/watch?v=VCT1N0EsGj0',
 '7UknKXkcbZA': 'https://www.youtube.com/watch?v=7UknKXkcbZA',
 '_LzxJ1LbSl4': 'https://www.youtube.com/watch?v=_LzxJ1

In [None]:
# Install the jq command-line JSON processor.
# -y answers apt's confirmation prompt so the cell cannot stall waiting for input.
!apt-get install -y jq

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
jq is already the newest version (1.6-2.1ubuntu3).
0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.


In [None]:
%pip install -q webvtt-py



In [None]:
import pandas as pd
import re

def pre_preprocess_transcript(original_transcript):
    """Reduce raw SRT subtitle text to timestamp lines followed by captions.

    Steps, in order:
      1. insert a blank line in front of every cue-timing line,
      2. collapse runs of two or more spaces into one space,
      3. drop lines that consist only of digits (the SRT cue counters),
      4. drop repeated lines, keeping the last occurrence of each,
      5. keep a timing line only when the line right after it is not a
         timing line.

    Args:
        original_transcript (str): Contents of an ``.srt`` file.

    Returns:
        str: The cleaned transcript with surrounding whitespace stripped.
    """
    # Start of a cue-timing line: full start time, arrow, end time without
    # its milliseconds (those stay in place after the substitution).
    time_pattern = re.compile(r'^(\d{2,}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2})', re.MULTILINE)

    # Put a newline in front of every timing line.
    formatted_transcript = time_pattern.sub(r'\n\1 --> \2', original_transcript)

    # Collapse runs of spaces (tabs and other whitespace are left alone).
    formatted_transcript = re.sub(r' {2,}', ' ', formatted_transcript)

    df = pd.DataFrame({'transcript': formatted_transcript.split('\n')})

    # Remove lines with only digits.
    df = df[~df['transcript'].str.match(r'^\d+$')]

    # Remove duplicate lines and keep the last occurrence. No further
    # per-row duplicate handling is needed after this: nothing is duplicated
    # any more, so the former row-wise pass was a quadratic no-op.
    df = df.drop_duplicates(keep='last')

    lines = '\n'.join(df['transcript']).strip().split('\n')

    # timestamps[i] is the line when it is a complete timing line, else None.
    # text_content[i] is the line when it contains no timestamp-like text,
    # else None (so timing lines start out hidden).
    timestamps = [line if re.match(r'\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', line) else None for line in lines]
    text_content = [line if not re.search(r'\d+:\d+:\d+,\d+', line) else None for line in lines]

    # Re-expose the timing line that directly precedes each caption line.
    # NOTE: when the preceding line is itself a caption it is replaced by None
    # and therefore dropped; that long-standing behaviour is kept as is.
    for index, line in enumerate(text_content):
        # index 0 has no predecessor; without this guard `index - 1` is -1 and
        # the LAST line of the transcript would be overwritten.
        if index > 0 and line is not None:
            text_content[index - 1] = timestamps[index - 1]

    # Keep only the lines that are still visible.
    cleaned = '\n'.join(entry for entry in text_content if entry is not None)
    return cleaned.strip()


In [None]:
def preprocess_and_save_transcripts(transcripts_folder):
    """Clean every ``.srt`` file directly inside ``transcripts_folder`` in place.

    Each matching file is read as UTF-8, passed through
    ``pre_preprocess_transcript`` and overwritten with the result. Files with
    any other extension are left untouched.
    """
    srt_names = [name for name in os.listdir(transcripts_folder) if name.endswith(".srt")]

    for srt_name in srt_names:
        srt_path = os.path.join(transcripts_folder, srt_name)

        # Load the raw subtitle text.
        with open(srt_path, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        cleaned_text = pre_preprocess_transcript(raw_text)

        # Overwrite the file with its cleaned version.
        with open(srt_path, 'w', encoding='utf-8') as target:
            target.write(cleaned_text)

In [None]:
import subprocess
import os

def _run_yt_dlp(arguments, cwd=None):
    """Run ``yt-dlp`` with ``arguments`` (argument list, no shell) and capture its output."""
    try:
        return subprocess.run(['yt-dlp', *arguments], capture_output=True, text=True, cwd=cwd)
    except FileNotFoundError as exc:
        # Same exit status a shell reports for "command not found".
        return subprocess.CompletedProcess(arguments, 127, stdout='', stderr=str(exc))


def get_youtube_chapters_and_transcript(video_link, output_root='/content'):
    """Save a video's chapter list and auto-generated English transcript.

    The chapters come from ``yt-dlp --dump-json`` and are written one per line
    as ``"<start minute>min - <title>"``. The transcript is the ``.srt`` file
    that yt-dlp writes into the transcripts folder; its content is copied to
    ``transcript_<video_id>.srt``. Afterwards every ``.srt`` file in the
    transcripts folder is cleaned with ``preprocess_and_save_transcripts``.

    Args:
        video_link (str): Watch URL of the video.
        output_root (str): Folder that receives the ``chapters`` and
            ``transcripts`` sub-folders. Defaults to ``/content``.

    Returns:
        tuple[str, str] | None: ``(chapters_filename, transcript_filename)``,
        or ``None`` (after printing the reason) when yt-dlp fails, the video
        has no chapters, or no subtitle file was produced.
    """
    import json
    from urllib.parse import parse_qs, urlparse

    # Prefer the `v` query parameter so trailing parameters (e.g. `&list=...`)
    # do not end up in the ID; fall back to the text after the last '='.
    query_values = parse_qs(urlparse(video_link).query).get('v')
    video_id = query_values[0] if query_values else video_link.split("=")[-1]

    # --- Chapters -----------------------------------------------------------
    chapters_result = _run_yt_dlp(['--dump-json', video_link])
    if chapters_result.returncode != 0:
        print(f"Error getting chapters: {chapters_result.stderr}")
        return None

    try:
        raw_chapters = json.loads(chapters_result.stdout).get('chapters')
    except (ValueError, AttributeError) as exc:
        print(f"Error getting chapters: {exc}")
        return None

    # A video without chapters is reported as an error.
    if not raw_chapters:
        print("Error getting chapters: the video has no chapters")
        return None

    chapters = [
        f"{int(chapter['start_time'] // 60)}min - {chapter['title']}".strip()
        for chapter in raw_chapters
    ]

    # --- Transcript ---------------------------------------------------------
    transcripts_folder = f'{output_root}/transcripts'
    chapters_folder = f'{output_root}/chapters'
    os.makedirs(transcripts_folder, exist_ok=True)
    os.makedirs(chapters_folder, exist_ok=True)

    # yt-dlp is run from the transcripts folder so the subtitle file lands there.
    transcript_result = _run_yt_dlp(
        ['--write-auto-sub', '--skip-download', '--sub-lang', 'en',
         '--convert-subs', 'srt', video_link],
        cwd=transcripts_folder,
    )
    if transcript_result.returncode != 0:
        print(f"Error getting transcript: {transcript_result.stderr}")
        return None

    chapters_filename = f'{chapters_folder}/chapters_{video_id}.txt'
    transcript_filename = f'{transcripts_folder}/transcript_{video_id}.srt'

    # yt-dlp's stdout is only its progress log; the subtitles are in the .srt
    # file it wrote, which is looked up by the video ID in its name.
    own_name = os.path.basename(transcript_filename)
    subtitle_names = sorted(
        name for name in os.listdir(transcripts_folder)
        if name.endswith('.srt') and video_id in name and name != own_name
    )
    if not subtitle_names:
        print(f"Error getting transcript: no subtitle file found for {video_id}")
        return None

    with open(os.path.join(transcripts_folder, subtitle_names[0]), 'r', encoding='utf-8') as subtitle_file:
        transcript_text = subtitle_file.read()

    # --- Save ---------------------------------------------------------------
    with open(chapters_filename, 'w', encoding='utf-8') as chapters_file:
        chapters_file.write('\n'.join(chapters))
    with open(transcript_filename, 'w', encoding='utf-8') as transcript_file:
        transcript_file.write(transcript_text)

    # Cleans every .srt file in the folder, including those of earlier videos.
    preprocess_and_save_transcripts(transcripts_folder)
    return chapters_filename, transcript_filename

In [None]:
# Fetch chapters and transcript for every playlist video, reporting the
# saved paths (or the failure) per video.
for video_id, video_link in video_list_dict.items():
    print(f"Processing video: {video_id}")
    result = get_youtube_chapters_and_transcript(video_link)

    if not result:
        print("Error processing video.")
    else:
        chapters_filename, transcript_filename = result
        print(f"Chapters saved to: {chapters_filename}")
        print(f"Transcript saved to: {transcript_filename}")
    print("-" * 50)


Processing video: OVwEeSsSCHE


KeyboardInterrupt: ignored

## Helper Functions

In [None]:
def get_num_tokens(prompt, model="gpt-3.5-turbo"):
    """Count the tokens of ``prompt`` under the tokenizer of ``model``.

    Args:
        prompt (str): Text to tokenize.
        model (str): Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        int: Number of tokens in the encoded prompt.
    """
    # Honour the `model` argument (it used to be ignored in favour of a
    # hard-coded "gpt-3.5-turbo").
    enc = tiktoken.encoding_for_model(model)

    return len(enc.encode(prompt))


def get_response(prompt_question, model="gpt-4"):
    """Send ``prompt_question`` to the chat completion API and return the reply text.

    Args:
        prompt_question (str): Content of the user message.
        model (str): Chat model name. Defaults to ``"gpt-4"``.

    Returns:
        str: ``content`` of the first choice's message.
    """
    # Adjacent literals are concatenated by the parser. The earlier
    # backslash continuation inside the string embedded the next line's
    # indentation (12 spaces) in the prompt.
    system_prompt = ("You are a helpful research and "
                     "programming assistant")

    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "system", "content": system_prompt},
                  {"role": "user", "content": prompt_question}]
    )

    return response["choices"][0]["message"]["content"]

def load_transcription(transcription_file):
    """Return the full text of ``transcription_file``.

    The file is decoded as UTF-8, the encoding the transcript files are
    written with, rather than the platform's locale default.

    Args:
        transcription_file (str): Path of the transcript file.

    Returns:
        str: The file's content.
    """
    with open(transcription_file, "r", encoding="utf-8") as f:
        transcription_with_timestamp = f.read()

    return transcription_with_timestamp


In [None]:
# Load the transcript of lecture 1.1 and render it for a visual check.
transcription_with_timestamp = load_transcription("/content/transcripts/Lecture 1.1 — Why do we need machine learning — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [OVwEeSsSCHE].en.srt")
display(Markdown(transcription_with_timestamp))

00:00:03,780 --> 00:00:06,070
hello welcome to the Coursera course on
00:00:06,080 --> 00:00:09,020
neural networks for machine learning
00:00:09,030 --> 00:00:11,270
before we get into the details of neural
00:00:11,280 --> 00:00:13,120
network learning algorithms I want to
00:00:13,130 --> 00:00:16,400
talk a little bit about machine learning
00:00:16,410 --> 00:00:18,590
why we need machine learning the kinds
00:00:18,600 --> 00:00:22,279
of things we use it for and show you
00:00:22,289 --> 00:00:25,220
some examples of what it can do so the
00:00:25,230 --> 00:00:27,410
reason we need machine learning is that
00:00:27,420 --> 00:00:30,109
the some problems where it's very hard
00:00:30,119 --> 00:00:31,479
to write the programs recognizing a
00:00:31,489 --> 00:00:34,400
three-dimensional object for example
00:00:34,410 --> 00:00:35,900
when it's from a novel viewpoint in new
00:00:35,910 --> 00:00:39,860
lighting conditions in a cluttered scene
00:00:39,870 --> 00:00:41,420
is very hard to do we don't know what
00:00:41,430 --> 00:00:44,540
programs right because we don't know how
00:00:44,550 --> 00:00:46,279
it's done in our brain and even if we
00:00:46,289 --> 00:00:47,630
did know what programs right it might be
00:00:47,640 --> 00:00:49,990
that it was a horrendously complicated
00:00:50,000 --> 00:00:52,970
program
00:00:52,980 --> 00:00:55,580
another example is detecting a
00:00:55,590 --> 00:00:58,099
fraudulent credit card transaction where
00:00:58,109 --> 00:01:00,260
there may not be any nice simple rules
00:01:00,270 --> 00:01:02,090
that will tell you it's fraudulent you
00:01:02,100 --> 00:01:05,719
really need to combine a very large
00:01:05,729 --> 00:01:08,060
number of not very reliable rules and
00:01:08,070 --> 00:01:09,710
also those rules change over time
00:01:09,720 --> 00:01:12,980
because people change the tricks they
00:01:12,990 --> 00:01:14,870
use for fraud so we need a complicated
00:01:14,880 --> 00:01:19,399
program that combines unreliable rules
00:01:19,409 --> 00:01:21,890
and that we can change easily the
00:01:21,900 --> 00:01:24,140
machine learning approach is to say
00:01:24,150 --> 00:01:26,780
instead of writing each program by hand
00:01:26,790 --> 00:01:28,429
for each specific task for a particular
00:01:28,439 --> 00:01:30,800
task we'll collect a lot of examples
00:01:30,810 --> 00:01:34,010
that specify the correct output for a
00:01:34,020 --> 00:01:36,859
given input a machine learning algorithm
00:01:36,869 --> 00:01:40,370
then takes these examples and produces a
00:01:40,380 --> 00:01:42,590
program that does the job the program
00:01:42,600 --> 00:01:43,999
produced by the learning algorithm may
00:01:44,009 --> 00:01:46,969
look very different from a typical
00:01:46,979 --> 00:01:48,530
handwritten program for example it might
00:01:48,540 --> 00:01:52,399
contain millions of numbers about how
00:01:52,409 --> 00:01:54,319
you weight different kinds of errands if
00:01:54,329 --> 00:01:55,940
we do it right the program should work
00:01:55,950 --> 00:02:00,080
for new cases as well as ones it's
00:02:00,090 --> 00:02:01,580
training on and if the data changes we
00:02:01,590 --> 00:02:04,280
should be able to change the program
00:02:04,290 --> 00:02:09,109
relatively easily by retraining it on
00:02:09,119 --> 00:02:11,059
the new data and now massive amounts of
00:02:11,069 --> 00:02:12,350
computation are cheaper than paying
00:02:12,360 --> 00:02:13,650
someone to write a program for a
00:02:13,660 --> 00:02:16,530
specific task
00:02:16,540 --> 00:02:19,050
so we can afford big complicated machine
00:02:19,060 --> 00:02:23,970
learning programs to produce the start
00:02:23,980 --> 00:02:25,440
task specific systems for some examples
00:02:25,450 --> 00:02:27,950
of the things that are best done by
00:02:27,960 --> 00:02:31,290
using a learning algorithm are
00:02:31,300 --> 00:02:34,650
recognizing patterns so for example
00:02:34,660 --> 00:02:37,580
objects in real scenes or the identities
00:02:37,590 --> 00:02:42,210
or expressions of people's faces or
00:02:42,220 --> 00:02:45,540
spoken words there's also recognizing
00:02:45,550 --> 00:02:47,070
anomalies so an unusual sequence of
00:02:47,080 --> 00:02:50,640
credit-card transactions will be an
00:02:50,650 --> 00:02:53,130
anomaly another example of an anomaly
00:02:53,140 --> 00:02:55,800
would be an unusual pattern of sensor
00:02:55,810 --> 00:02:57,930
readings in a nuclear power plant and
00:02:57,940 --> 00:02:59,370
you wouldn't really want to have to deal
00:02:59,380 --> 00:03:00,990
with those by doing supervised learning
00:03:01,000 --> 00:03:03,960
where you look at the ones that blow up
00:03:03,970 --> 00:03:05,610
and see what what caused them to blow up
00:03:05,620 --> 00:03:07,560
you'd really like to recognize that
00:03:07,570 --> 00:03:09,750
something funny is happening without
00:03:09,760 --> 00:03:13,260
having any supervision signal it's just
00:03:13,270 --> 00:03:16,470
not behaving in its normal way and then
00:03:16,480 --> 00:03:18,630
this prediction so typically predicting
00:03:18,640 --> 00:03:21,210
future stock prices or currency exchange
00:03:21,220 --> 00:03:22,980
rates or predicting which movies a
00:03:22,990 --> 00:03:24,990
person will like from knowing which
00:03:25,000 --> 00:03:29,640
other movies they like and which movies
00:03:29,650 --> 00:03:33,000
a lot of other people liked so in this
00:03:33,010 --> 00:03:34,530
course I knees a standard example for
00:03:34,540 --> 00:03:38,310
explaining a lot of the machine learning
00:03:38,320 --> 00:03:40,980
algorithms this is done in a lot of
00:03:40,990 --> 00:03:44,820
science in genetics for example a lot of
00:03:44,830 --> 00:03:46,500
genetics is done on fruit flies and the
00:03:46,510 --> 00:03:49,199
reason is they're convenient they breed
00:03:49,209 --> 00:03:53,430
fast and a lot is already known about
00:03:53,440 --> 00:03:56,580
the genesis of fruit flies the emne
00:03:56,590 --> 00:03:58,050
stata basis of handwritten digits is the
00:03:58,060 --> 00:04:03,240
machine learning equivalent of fruit
00:04:03,250 --> 00:04:06,510
flies it's publicly available we can get
00:04:06,520 --> 00:04:08,190
machine learning out rooms to learn how
00:04:08,200 --> 00:04:10,410
to recognize these hundred and digits
00:04:10,420 --> 00:04:13,740
quite quickly so it's easy to try lots
00:04:13,750 --> 00:04:15,630
of variations and we know huge amounts
00:04:15,640 --> 00:04:18,180
about how well different machine
00:04:18,190 --> 00:04:19,380
learning methods do on any list and in
00:04:19,390 --> 00:04:20,820
particular the different machine
00:04:20,830 --> 00:04:23,010
learning methods were implemented by
00:04:23,020 --> 00:04:26,340
people who believed in them so we can
00:04:26,350 --> 00:04:27,330
rely on those results so for all those
00:04:27,340 --> 00:04:31,680
reasons we're going to use
00:04:31,690 --> 00:04:34,830
as our standard task here's an example
00:04:34,840 --> 00:04:36,600
of some of the digits in n rest these
00:04:36,610 --> 00:04:40,370
are ones that were correctly recognized
00:04:40,380 --> 00:04:42,390
by neural net the first time it saw them
00:04:42,400 --> 00:04:45,270
but they're ones where the neural net
00:04:45,280 --> 00:04:49,250
wasn't very confident and you can see
00:04:49,260 --> 00:04:51,990
why I've arranged these digits in
00:04:52,000 --> 00:04:54,840
standard scanline order so zeros than
00:04:54,850 --> 00:04:55,830
ones and twos and so on if you look at a
00:04:55,840 --> 00:04:58,650
bunch of two's
00:04:58,660 --> 00:05:01,860
like the ones in the green rectangle you
00:05:01,870 --> 00:05:04,290
can see that if you knew they were a
00:05:04,300 --> 00:05:07,110
handwritten digit you'd probably guess
00:05:07,120 --> 00:05:09,270
they were twos but it's very hard to say
00:05:09,280 --> 00:05:10,740
what it is that makes them twos there's
00:05:10,750 --> 00:05:13,050
nothing simple that they all have in
00:05:13,060 --> 00:05:15,450
common in particular if you try and
00:05:15,460 --> 00:05:18,060
overlay one on another you'll see it
00:05:18,070 --> 00:05:20,670
doesn't fit and even if you skew it a
00:05:20,680 --> 00:05:22,350
bit it's very hard to make them overlay
00:05:22,360 --> 00:05:25,500
on each other so template isn't going to
00:05:25,510 --> 00:05:27,150
do the job and in particular template is
00:05:27,160 --> 00:05:29,040
going to be very hard to find that'll
00:05:29,050 --> 00:05:31,110
fit those twos in the green box and
00:05:31,120 --> 00:05:34,950
won't also fit the things in the red
00:05:34,960 --> 00:05:36,900
boxes so that's one thing that makes
00:05:36,910 --> 00:05:40,469
recognizing handwritten digits a good
00:05:40,479 --> 00:05:41,610
task for machine learning now I don't
00:05:41,620 --> 00:05:44,640
want you to think that's the only thing
00:05:44,650 --> 00:05:46,290
we can do it's a relatively simple thing
00:05:46,300 --> 00:05:49,560
for a machine learning system to do now
00:05:49,570 --> 00:05:51,089
and to motivate the rest of the course I
00:05:51,099 --> 00:05:54,960
want to show you some examples are much
00:05:54,970 --> 00:05:58,170
more difficult things so we now have
00:05:58,180 --> 00:06:01,400
neural nets with approaching 100 million
00:06:01,410 --> 00:06:05,550
parameters in them that can recognize a
00:06:05,560 --> 00:06:07,440
thousand different object classes in 1.3
00:06:07,450 --> 00:06:11,339
million high resolution training images
00:06:11,349 --> 00:06:13,529
got from the web so there was a
00:06:13,539 --> 00:06:16,200
competition in 2010 and the best system
00:06:16,210 --> 00:06:18,510
got 47 percent error rate if you look at
00:06:18,520 --> 00:06:20,400
his first choice and 25 percent error
00:06:20,410 --> 00:06:22,680
rate if you say got it right if it was
00:06:22,690 --> 00:06:26,730
in its top 5 choices which isn't bad for
00:06:26,740 --> 00:06:28,800
a thousand different objects Jitendra
00:06:28,810 --> 00:06:31,020
malik who's an eminent neural net
00:06:31,030 --> 00:06:34,409
skeptic and a leading computer vision
00:06:34,419 --> 00:06:36,029
researcher has said that this
00:06:36,039 --> 00:06:37,589
competition is a good test of whether
00:06:37,599 --> 00:06:39,620
deep neural networks can work well for
00:06:39,630 --> 00:06:43,310
object recognition
00:06:43,320 --> 00:06:45,590
and a very deep neural network can now
00:06:45,600 --> 00:06:47,660
do considerably better than the thing
00:06:47,670 --> 00:06:49,940
that won the competition it can get less
00:06:49,950 --> 00:06:51,500
than 40 percent error for its first
00:06:51,510 --> 00:06:53,960
choice and less than 20 percent our
00:06:53,970 --> 00:06:55,910
efforts top 5 choices I'll describe that
00:06:55,920 --> 00:06:58,700
in much more detail in lecture 5 here's
00:06:58,710 --> 00:07:00,710
some examples of the kinds of images you
00:07:00,720 --> 00:07:02,270
have to recognize these are images from
00:07:02,280 --> 00:07:06,650
the test set that is never seen before
00:07:06,660 --> 00:07:08,750
and below the examples I'm showing you
00:07:08,760 --> 00:07:11,090
what the neural net thought the right
00:07:11,100 --> 00:07:12,830
answer was where the length of the
00:07:12,840 --> 00:07:16,640
horizontal bar is how confident it was
00:07:16,650 --> 00:07:18,380
and the correct answer is in red so if
00:07:18,390 --> 00:07:21,290
you look in the middle a correctly
00:07:21,300 --> 00:07:23,030
identified that as a snow plow but you
00:07:23,040 --> 00:07:24,980
can see that his other choices were also
00:07:24,990 --> 00:07:27,350
fairly sensible it does look a little
00:07:27,360 --> 00:07:29,030
bit like a drilling platform and if you
00:07:29,040 --> 00:07:30,890
look at its third choice a lifeboat
00:07:30,900 --> 00:07:32,360
it actually looks very like a lifeboat
00:07:32,370 --> 00:07:33,800
you can see the flag on the front of the
00:07:33,810 --> 00:07:35,720
boat and the bridge of the boat and the
00:07:35,730 --> 00:07:38,810
flag at the back and the high surf in
00:07:38,820 --> 00:07:40,190
the background so it's its errors tell
00:07:40,200 --> 00:07:42,380
you a lot about how it's doing it and
00:07:42,390 --> 00:07:44,800
they're very plausible errors if you
00:07:44,810 --> 00:07:47,000
look on the left it gets it wrong
00:07:47,010 --> 00:07:49,640
possibly because the beak of the bird is
00:07:49,650 --> 00:07:52,630
missing and cuz the feathers of the bird
00:07:52,640 --> 00:07:55,700
look very like the wet fur of an otter
00:07:55,710 --> 00:07:57,140
but he gets it in his top five and it
00:07:57,150 --> 00:07:58,940
does better than me I wouldn't know if
00:07:58,950 --> 00:08:01,730
that was a quail or a roughed grouse or
00:08:01,740 --> 00:08:05,180
a partridge if you look on the right he
00:08:05,190 --> 00:08:07,370
gets it completely wrong it a guillotine
00:08:07,380 --> 00:08:09,200
you can see why it says that you can
00:08:09,210 --> 00:08:11,060
possibly see why it says re Newtown and
00:08:11,070 --> 00:08:12,380
because of the sort of jungle looking
00:08:12,390 --> 00:08:14,750
background or something orange in the
00:08:14,760 --> 00:08:18,170
middle but it fails to get the right
00:08:18,180 --> 00:08:20,600
answer it can however deal with a wide
00:08:20,610 --> 00:08:24,050
range of different objects if you look
00:08:24,060 --> 00:08:26,830
on the left I would have said microwave
00:08:26,840 --> 00:08:29,060
as my first answer the labels are very
00:08:29,070 --> 00:08:31,250
systematic so actually the correct
00:08:31,260 --> 00:08:33,680
answer there's electric range and does
00:08:33,690 --> 00:08:35,060
get it in his top five in the middle
00:08:35,070 --> 00:08:37,940
it's getting a turnstile which is a
00:08:37,950 --> 00:08:39,350
distributed object it does can't it can
00:08:39,360 --> 00:08:41,000
do more than just recognize compact
00:08:41,010 --> 00:08:42,709
things and it can also deal with
00:08:42,719 --> 00:08:46,340
pictures as well as real scenes like the
00:08:46,350 --> 00:08:49,130
bulletproof vest and it makes them very
00:08:49,140 --> 00:08:50,840
cool errors if you look at the image on
00:08:50,850 --> 00:08:53,569
the left that's
00:08:53,579 --> 00:08:55,519
earphone it doesn't get anything like an
00:08:55,529 --> 00:08:58,069
earphone but if you look at its fourth
00:08:58,079 --> 00:08:59,300
bet it thinks it's an ant until you
00:08:59,310 --> 00:09:00,710
really think that's crazy
00:09:00,720 --> 00:09:02,090
but then if you look at it carefully you
00:09:02,100 --> 00:09:03,620
can see it's a view for an ant from
00:09:03,630 --> 00:09:05,600
underneath the eyes are looking down on
00:09:05,610 --> 00:09:07,699
you and you can see the antennae behind
00:09:07,709 --> 00:09:09,939
it it's not the kind of view of an ant
00:09:09,949 --> 00:09:12,530
you'd like to have if you're a green fly
00:09:12,540 --> 00:09:14,480
if you look at the one on the right it
00:09:14,490 --> 00:09:19,059
his answers are cylindrical objects
00:09:21,600 --> 00:09:25,699
another task that neural Nets and I very
00:09:25,709 --> 00:09:26,900
good at is speech recognition or at
00:09:26,910 --> 00:09:29,930
least part of a speech recognition
00:09:29,940 --> 00:09:33,139
system so speech recognition systems
00:09:33,149 --> 00:09:35,569
have several stages first they pre
00:09:35,579 --> 00:09:38,420
process the sound wave to get a vector
00:09:38,430 --> 00:09:41,809
of acoustic coefficients for each 10
00:09:41,819 --> 00:09:42,980
milliseconds of sine wave and so they
00:09:42,990 --> 00:09:46,249
get a hundred of those vectors per
00:09:46,259 --> 00:09:48,410
second they then take a few adjacent
00:09:48,420 --> 00:09:51,949
vectors of acoustic coefficients and
00:09:51,959 --> 00:09:54,829
they need to place bets on which part of
00:09:54,839 --> 00:09:56,749
which phoneme is being spoken so they
00:09:56,759 --> 00:09:58,759
look at this little window and they say
00:09:58,769 --> 00:10:00,740
in the middle of this window what do I
00:10:00,750 --> 00:10:04,490
think the phoneme is and which part of
00:10:04,500 --> 00:10:06,829
the phoneme is it and a good speech
00:10:06,839 --> 00:10:08,840
recognition system will have many
00:10:08,850 --> 00:10:10,249
alternative models for a phoneme and
00:10:10,259 --> 00:10:13,639
each model it might have three different
00:10:13,649 --> 00:10:15,199
parts so it might have many thousands of
00:10:15,209 --> 00:10:17,720
alternative fragments that it thinks
00:10:17,730 --> 00:10:19,400
this might be and you have to place bets
00:10:19,410 --> 00:10:22,970
on all those thousands of alternatives
00:10:22,980 --> 00:10:25,699
and then once you place those bets you
00:10:25,709 --> 00:10:29,840
have a decoding stage that does the best
00:10:29,850 --> 00:10:32,900
job it can of using plausible bets but
00:10:32,910 --> 00:10:35,960
piecing them together into a sequence of
00:10:35,970 --> 00:10:40,819
bets that corresponds to the kinds of
00:10:40,829 --> 00:10:43,069
things that people say currently deep
00:10:43,079 --> 00:10:45,350
neural networks pioneered by George Dahl
00:10:45,360 --> 00:10:47,660
and Abdul Rahman Muhammad of the
00:10:47,670 --> 00:10:49,670
University of Toronto are doing better
00:10:49,680 --> 00:10:51,470
than previous machine learning methods
00:10:51,480 --> 00:10:52,699
for the acoustic model and they're now
00:10:52,709 --> 00:11:01,340
beginning to be used in practical
00:11:01,350 --> 00:11:04,160
systems so Darla Mohammed developed a
00:11:04,170 --> 00:11:10,430
system that uses many layers
00:11:10,440 --> 00:11:13,490
of binary neurons to take some acoustic
00:11:13,500 --> 00:11:15,470
frames and make bets about the labels
00:11:15,480 --> 00:11:17,960
they were doing it on a fairly small
00:11:17,970 --> 00:11:20,210
database and then used 183 alternative
00:11:20,220 --> 00:11:21,740
lengths and to get their system to work
00:11:21,750 --> 00:11:23,389
well they did some pre training which
00:11:23,399 --> 00:11:26,540
will be described in the second half of
00:11:26,550 --> 00:11:27,319
the course after standard post
00:11:27,329 --> 00:11:29,960
processing
00:11:29,970 --> 00:11:31,879
they got twenty point seven percent
00:11:31,889 --> 00:11:33,769
error rate on a very standard benchmark
00:11:33,779 --> 00:11:36,949
which is kind of like the N missed for
00:11:36,959 --> 00:11:38,840
speech the best previous result on that
00:11:38,850 --> 00:11:40,759
benchmark for speech independent
00:11:40,769 --> 00:11:43,819
recognition was twenty four point four
00:11:43,829 --> 00:11:46,840
percent and a very experienced receipt
00:11:46,850 --> 00:11:48,829
speech researcher at Microsoft Research
00:11:48,839 --> 00:11:51,199
realized that that was a big enough
00:11:51,209 --> 00:11:52,579
improvement that probably this would
00:11:52,589 --> 00:11:57,079
change the way speech recognition
00:11:57,089 --> 00:12:00,220
systems were done and indeed it has so
00:12:00,230 --> 00:12:03,069
if you look at recent results from
00:12:03,079 --> 00:12:05,780
several different leading speech groups
00:12:05,790 --> 00:12:08,329
Microsoft showed that this kind of deep
00:12:08,339 --> 00:12:10,699
neural network when used as the acoustic
00:12:10,709 --> 00:12:12,170
model in a speech system reduce the
00:12:12,180 --> 00:12:13,579
error rate from thirty seven point four
00:12:13,589 --> 00:12:16,460
percent to eighteen point five percent
00:12:16,470 --> 00:12:18,110
or alternatively you could view it as
00:12:18,120 --> 00:12:20,600
reducing the amount of training data you
00:12:20,610 --> 00:12:21,949
needed from two thousand hours down to
00:12:21,959 --> 00:12:25,490
three hundred nine hours to get
00:12:25,500 --> 00:12:29,019
comparable performance IBM which has the
00:12:29,029 --> 00:12:32,269
best system for one of the standard
00:12:32,279 --> 00:12:35,360
speech recognition tasks for large with
00:12:35,370 --> 00:12:37,400
library speech recognition showed that
00:12:37,410 --> 00:12:40,069
even it's very highly tuned system that
00:12:40,079 --> 00:12:42,949
was getting 18.8% can be beaten by one
00:12:42,959 --> 00:12:45,050
of these deep neural networks and Google
00:12:45,060 --> 00:12:47,720
fairly recently trained to deep neural
00:12:47,730 --> 00:12:49,670
network on a large amount of speech five
00:12:49,680 --> 00:12:51,439
thousand eight hundred hours that was
00:12:51,449 --> 00:12:54,590
still much less than their trend they
00:12:54,600 --> 00:12:56,780
guess in mixture model on but even with
00:12:56,790 --> 00:12:59,329
much less data it did a lot better than
00:12:59,339 --> 00:13:01,370
the technology they had performed so
00:13:01,380 --> 00:13:03,050
reduce the error rate from 16 percent to
00:13:03,060 --> 00:13:05,660
twelve point three percent and the error
00:13:05,670 --> 00:13:08,870
rate is still falling and in the latest
00:13:08,880 --> 00:13:10,460
Android if you do voice search it's
00:13:10,470 --> 00:13:12,949
using one of these deep neural networks
00:13:12,949 --> 00:13:12,959

00:13:12,959 --> 00:13:15,380
recognition

In [None]:
# Print every lecture transcript's path followed by its token count
# (files come in the order glob returns them, which is not sorted).
for t in glob.glob("/content/transcripts/Lecture*.srt"):
    print(t)
    print(get_num_tokens(load_transcription(t)))

/content/transcripts/Lecture 4.1 — Learning to predict the next word — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [_LzxJ1LbSl4].en.srt
7193
/content/transcripts/Lecture 7.5 — Long term Short term memory — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [OtFAECd_IXQ].en.srt
5297
/content/transcripts/Lecture 2.1 — Types of neural network architectures — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [6cHupvcxA38].en.srt
4224
/content/transcripts/Lecture 8.3 — Predicting the next character using HF — [ Deep Learning ｜  Hinton ｜ UofT ] [74Hj4By5kjg].en.srt
6724
/content/transcripts/Lecture 4.3 — The softmax output function — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [PHP8beSz5o4].en.srt
4037
/content/transcripts/Lecture 5.1 — Why object recognition is difficult — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [XSmoVWhM8G4].en.srt
2713
/content/transcripts/Lecture 7.1 — Modeling sequences  a brief overview — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [V0-2pV8vQ84].en.srt
9658
/content/transcripts/Lectu

In [None]:
# Naturally sorted paths of the saved chapter files and lecture transcripts.
chapters_files = natsorted(glob.glob("/content/chapters/chapters*.txt"))
transcription_files = natsorted(glob.glob("/content/transcripts/Lecture*.srt"))

In [None]:
# Preview the first five chapter file paths.
chapters_files[:5]

['/content/chapters/chapters_1A6Md5ZYyW0.txt',
 '/content/chapters/chapters_1CgojqlHrcE.txt',
 '/content/chapters/chapters_2k9XTr_jNfE.txt',
 '/content/chapters/chapters_3BDc0H9C9dw.txt',
 '/content/chapters/chapters_4gOdNtVNZtk.txt']

In [None]:
# Preview the first five transcript file paths.
transcription_files[:5]

['/content/transcripts/Lecture 1.1 — Why do we need machine learning — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [OVwEeSsSCHE].en.srt',
 '/content/transcripts/Lecture 1.2 — What are neural networks — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [jNBYZbDWyQk].en.srt',
 '/content/transcripts/Lecture 1.3 — Some simple models of neurons — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [VA9niXgGOsQ].en.srt',
 '/content/transcripts/Lecture 1.4 — A simple example of learning — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [mnTJezQOIDU].en.srt',
 '/content/transcripts/Lecture 1.5 — Three types of learning — [ Deep Learning ｜ Geoffrey Hinton ｜ UofT ] [nrkpEx7tA2Y].en.srt']

In [None]:
# Parallel lists of the playlist's video IDs and watch URLs.
video_ids = list(video_list_dict.keys())
urls = list(video_list_dict.values())

In [None]:
# Number of videos found in the playlist.
len(video_ids)

78

In [None]:
# One dict per video (ID, URL, chapters text, transcript text), filled by the loop below.
data = []
# Function to find the matching file based on the video ID
def find_matching_file(file_list, key):
    for file in file_list:
        if key in file:
            return file
    return None
# Build one record per playlist video; videos lacking either file are skipped.
for video_id, url in video_list_dict.items():
    # Locate this video's chapter and transcript files via the id embedded in the file name.
    chapter_file = find_matching_file(chapters_files, video_id)
    transcription_file = find_matching_file(transcription_files, video_id)

    # Only keep videos for which both files were found.
    if chapter_file is None or transcription_file is None:
        continue

    entry = {"video_id": video_id, "url": url}

    # Decode explicitly as UTF-8 (the same encoding used when the JSONL dataset is
    # read back) so the result does not depend on the platform's default encoding.
    with open(chapter_file, "r", encoding="utf-8") as f:
        entry["chapters_section"] = f.read()

    with open(transcription_file, "r", encoding="utf-8") as f:
        entry["transcription"] = f.read()

    data.append(entry)



In [None]:
def generate_new_json_structure(original_data):
    """Convert per-video records into chat-format fine-tuning examples.

    Args:
        original_data: iterable of dicts, each providing the keys
            ``"transcription"`` (used in the user prompt) and
            ``"chapters_section"`` (used verbatim as the assistant reply).
            Other keys are ignored.

    Returns:
        A list with one ``{"messages": [system, user, assistant]}`` dict per
        input record, in input order.

    Raises:
        KeyError: if a record lacks ``"transcription"`` or ``"chapters_section"``.
    """
    new_data_list = []

    for entry in original_data:
        # Only these two fields feed the example; the unused "url" lookup was
        # dropped so records without it no longer raise KeyError.
        chapters = entry["chapters_section"]
        transcription = entry["transcription"]

        user_message_content = (
            f"Given this Youtube video transcript: {transcription} I want you to create a chapters section for this Youtube video with the following format:\n"
            "Chapters:\n"
            "<double digit:time stamp> - <Concise phrase of a major part of the video>"
        )

        new_data_list.append(
            {
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": user_message_content},
                    # The chapters text is used as-is as the target completion.
                    {"role": "assistant", "content": chapters},
                ]
            }
        )

    return new_data_list


# Convert the collected records to chat-format examples
new_data_list = generate_new_json_structure(data)
# Serialize them as JSON Lines: one JSON document per line
with open("./dataset_fine_tunning.jsonl", "w") as f:
    f.writelines(json.dumps(item) + '\n' for item in new_data_list)

In [None]:
data_path = "./dataset_fine_tunning.jsonl"

# Parse the JSONL training file: every line is one JSON document
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = list(map(json.loads, f))

# Quick sanity check: dataset size and the messages of the first example
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 75
First example:
{'role': 'system', 'content': 'You are a helpful assistant.'}
{'role': 'user', 'content': "Given this Youtube video transcript: 00:00:03,780 --> 00:00:06,070\nhello welcome to the Coursera course on\n00:00:06,080 --> 00:00:09,020\nneural networks for machine learning\n00:00:09,030 --> 00:00:11,270\nbefore we get into the details of neural\n00:00:11,280 --> 00:00:13,120\nnetwork learning algorithms I want to\n00:00:13,130 --> 00:00:16,400\ntalk a little bit about machine learning\n00:00:16,410 --> 00:00:18,590\nwhy we need machine learning the kinds\n00:00:18,600 --> 00:00:22,279\nof things we use it for and show you\n00:00:22,289 --> 00:00:25,220\nsome examples of what it can do so the\n00:00:25,230 --> 00:00:27,410\nreason we need machine learning is that\n00:00:27,420 --> 00:00:30,109\nthe some problems where it's very hard\n00:00:30,119 --> 00:00:31,479\nto write the programs recognizing a\n00:00:31,489 --> 00:00:34,400\nthree-dimensional object for e

In [None]:
# Validate the structure of every training example and tally each kind of problem
format_errors = defaultdict(int)

for ex in dataset:
    # Every example has to be a dict ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" entry.
    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        if not all(k in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role") not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content")
        function_call = message.get("function_call")

        # Content must be a string, and either it or function_call must be non-empty.
        if not isinstance(content, str) or not (content or function_call):
            format_errors["missing_content"] += 1

    if all(message.get("role") != "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

No errors found


In [None]:
# Token-counting helpers
encoding = tiktoken.get_encoding("cl100k_base")


# Approximate count only; simplified from
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the number of tokens in a list of chat messages.

    Every value of every message is encoded with the module-level ``encoding``.
    ``tokens_per_message`` is added once per message, ``tokens_per_name`` once
    per message that has a ``"name"`` key, and a constant 3 once per call.

    Args:
        messages: iterable of dicts whose values are strings.
        tokens_per_message: fixed overhead added for each message.
        tokens_per_name: extra overhead for a message with a ``"name"`` key.

    Returns:
        The estimated token count as an int.
    """
    total = 3
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(text)) for text in message.values())
        if "name" in message:
            total += tokens_per_name
    return total

def num_assistant_tokens_from_messages(messages):
    """Count the tokens in the ``"content"`` of all assistant messages.

    Args:
        messages: iterable of dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        Total encoded length of the content of messages whose role is
        ``"assistant"``; 0 when there are none.
    """
    return sum(
        len(encoding.encode(message["content"]))
        for message in messages
        if message["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics for a sequence of numbers.

    Prints a header line followed by min/max, mean/median and the 5th/95th
    percentiles of ``values``.

    Args:
        values: non-empty sequence of numbers.
        name: label shown in the header line.

    Raises:
        ValueError: if ``values`` is empty (raised by ``min``).
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The label promises p5 / p95, so compute the 0.05 and 0.95 quantiles
    # (the previous 0.1 / 0.9 were the 10th and 90th percentiles).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

In [None]:
# Per-example warnings and token statistics
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    # Count conversations that contain no system turn / no user turn.
    if all(message["role"] != "system" for message in messages):
        n_missing_system += 1
    if all(message["role"] != "user" for message in messages):
        n_missing_user += 1
    # Record message count, total tokens and assistant tokens for this example.
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
# Number of examples whose estimated length exceeds 4096 tokens.
n_too_long = sum(1 for l in convo_lens if l > 4096)
print(f"\n{n_too_long} examples may be over the 4096 token limit")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 1603, 11270
mean / median: 5636.573333333334, 5328.0
p5 / p95: 3031.4, 8144.400000000001

#### Distribution of num_assistant_tokens_per_example:
min / max: 15, 143
mean / median: 61.56, 56.0
p5 / p95: 23.200000000000003, 109.00000000000004

58 examples may be over the 4096 token limit


In [None]:
# Per-example token cap used for the billing estimate
MAX_TOKENS_PER_EXAMPLE = 4096

# Bounds used to pick a default epoch count
TARGET_EPOCHS = 5
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    # Small dataset: raise the epoch count, capped at MAX_DEFAULT_EPOCHS.
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    # Large dataset: lower the epoch count, but never below MIN_DEFAULT_EPOCHS.
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example contributes at most MAX_TOKENS_PER_EXAMPLE tokens to the estimate.
n_billing_tokens_in_dataset = sum(
    length if length < MAX_TOKENS_PER_EXAMPLE else MAX_TOKENS_PER_EXAMPLE
    for length in convo_lens
)
total_tokens = n_epochs * n_billing_tokens_in_dataset
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{total_tokens} tokens")

Dataset has ~289402 tokens that will be charged for during training
By default, you'll train for 5 epochs on this dataset
By default, you'll be charged for ~1447010 tokens


In [None]:
def calculate_cost_for_fine_tunning(token_count, price_per_1k_tokens=0.0080):
    """Estimate the fine-tuning cost for a number of billed tokens.

    Args:
        token_count: total number of tokens that will be charged for.
        price_per_1k_tokens: price per 1,000 tokens. The default, 0.0080, is the
            rate this notebook used originally.
            NOTE(review): presumably USD for gpt-3.5-turbo training; verify
            against current pricing.

    Returns:
        ``price_per_1k_tokens * token_count / 1000``.
    """
    return (price_per_1k_tokens * token_count) / 1000

In [None]:
calculate_cost_for_fine_tunning(total_tokens)

11.57608

In [None]:
# Load the OpenAI API key from a local JSON file shaped like {"api_key": "..."}.
# The key is read straight from the parsed JSON instead of rebinding `data`,
# which holds the per-video records the dataset is built from.
with open('openapi_key.json', 'r', encoding='utf-8') as file:
    api_key = json.load(file)['api_key']



In [None]:
openai.api_key = api_key

In [None]:
# Upload the training set for fine-tuning. The context manager closes the
# file handle once the upload call returns (it was previously left open).
with open("./dataset_fine_tunning.jsonl", "rb") as training_file:
    uploaded_file = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )

# Last expression of the cell: display the created file object.
uploaded_file

FileNotFoundError: ignored

In [None]:
openai.File.list()

<OpenAIObject list at 0x790381e2c400> JSON: {
  "object": "list",
  "has_more": false,
  "data": [
    {
      "object": "file",
      "id": "file-Gh9QjaWiSNkcVGMpQnUVIgfR",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 1172965,
      "created_at": 1699904999,
      "status": "processed",
      "status_details": null
    },
    {
      "object": "file",
      "id": "file-Gte7mAIz441V3cXzg8TCMuob",
      "purpose": "fine-tune-results",
      "filename": "step_metrics.csv",
      "bytes": 52969,
      "created_at": 1699817871,
      "status": "processed",
      "status_details": null
    },
    {
      "object": "file",
      "id": "file-us4g6wbX61NcwnTkDyaDqdKS",
      "purpose": "fine-tune",
      "filename": "chat_reports.jsonl",
      "bytes": 1223848,
      "created_at": 1699817096,
      "status": "processed",
      "status_details": null
    },
    {
      "object": "file",
      "id": "file-uDAtT547HDymTmUCcX6Q5pYh",
      "purpose": "fine-tune",
      "f

In [None]:
# Take the id of the first entry returned by File.list().
# NOTE(review): this presumably is the training file uploaded above, but it
# relies on the listing order — confirm, or use the id returned by the upload.
file_id = openai.File.list()["data"][0]["id"]

In [None]:
# Create a fine-tuning job for gpt-3.5-turbo on the selected training file.
# No hyperparameters are passed here.
openai.FineTuningJob.create(training_file=file_id, model="gpt-3.5-turbo")

<FineTuningJob fine_tuning.job id=ftjob-REGHglifBfozrWIfiGsf6zBk at 0x790381e2ec50> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-REGHglifBfozrWIfiGsf6zBk",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1699905022,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-0T52xTBIq1tVsEsgjkhEp7il",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-Gh9QjaWiSNkcVGMpQnUVIgfR",
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "trained_tokens": null,
  "error": null
}

In [None]:
# List up to 10 fine-tuning jobs. The result is not displayed, because a cell
# only renders the value of its last expression.
openai.FineTuningJob.list(limit=10)

# Retrieve the current state of the fine-tuning job created above
# (this is the value the cell displays).
openai.FineTuningJob.retrieve("ftjob-REGHglifBfozrWIfiGsf6zBk")

# Cancel a job (disabled)
# NOTE(review): the id below has a "file-" prefix, while the job above is
# identified by an "ftjob-" id — confirm the right id before enabling.
#openai.FineTuningJob.cancel("file-kXlvspUQRKFUWcrW5pgzy0PW")

# List up to 10 events from a fine-tuning job (disabled; same id caveat as above)
#openai.FineTuningJob.list_events(id="file-kXlvspUQRKFUWcrW5pgzy0PW", limit=10)

<FineTuningJob fine_tuning.job id=ftjob-REGHglifBfozrWIfiGsf6zBk at 0x790381eb8f40> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-REGHglifBfozrWIfiGsf6zBk",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1699905022,
  "finished_at": 1699905745,
  "fine_tuned_model": "ft:gpt-3.5-turbo-0613:leadsift::8KXQY7Yj",
  "organization_id": "org-0T52xTBIq1tVsEsgjkhEp7il",
  "result_files": [
    "file-7wvC5iWYErSzMmWFDRunft4x"
  ],
  "status": "succeeded",
  "validation_file": null,
  "training_file": "file-Gh9QjaWiSNkcVGMpQnUVIgfR",
  "hyperparameters": {
    "n_epochs": 3,
    "batch_size": 1,
    "learning_rate_multiplier": 2
  },
  "trained_tokens": 868104,
  "error": null
}

## Inference

In [None]:
inference_url = "https://youtu.be/KWULpBYzIYk?si=Z4MQkyh7F3M7PmQi"

In [None]:
c_filename, t_filename = get_youtube_chapters_and_transcript(inference_url)

In [None]:
# Load the transcript of the inference video.
# NOTE(review): the path is hard-coded rather than derived from the t_filename
# returned for inference_url in the previous cell — confirm both refer to the same file.
inferTranscription = load_transcription("/content/transcripts/#10 Machine Learning Specialization [Course 1, Week 1, Lesson 3] [KWULpBYzIYk].en.srt")

In [None]:
inferTranscription

"00:00:04,140 --> 00:00:06,889\nlet's look in this video at the process\n00:00:06,899 --> 00:00:08,810\nof how supervised Learning Works\n00:00:08,820 --> 00:00:11,089\nsupervised learning algorithm will input\n00:00:11,099 --> 00:00:13,430\nthe data set and then what exactly does\n00:00:13,440 --> 00:00:15,049\nit do and what does it output let's find\n00:00:15,059 --> 00:00:18,170\nout in this video\n00:00:18,180 --> 00:00:19,910\nrecall that a training set in supervised\n00:00:19,920 --> 00:00:21,769\nlearning includes both the input\n00:00:21,779 --> 00:00:24,769\nfeatures such as the size of the house\n00:00:24,779 --> 00:00:27,290\nand also the output targets such as the\n00:00:27,300 --> 00:00:29,450\nprice of the house the output targets\n00:00:29,460 --> 00:00:30,710\nare the right answers to the model we'll\n00:00:30,720 --> 00:00:33,410\nlearn from\n00:00:33,420 --> 00:00:35,690\nto train the model you feed the trading\n00:00:35,700 --> 00:00:38,270\nset both the input featu

In [None]:
# Build the inference prompt from the transcript loaded into `inferTranscription`
# instead of pasting the transcript into the notebook, so this cell works for
# whichever video was loaded. The instruction text is unchanged.
# Surrounding whitespace is stripped so the transcript sits directly between
# the ''' delimiters.
prompt = f"""
Given this Youtube video transcript:
'''
{inferTranscription.strip()}
'''

I want you to create chapters section for this Youtube video with the following format: Chapters:\n< time stamp in min and seconds seperated by :> - <Concise phrase of a major part of the video>
"""

async def async_openai_request(prompt):
    """Query the fine-tuned chat model with ``prompt`` without blocking the event loop.

    The blocking ``openai.ChatCompletion.create`` call runs in the event loop's
    default executor.

    Args:
        prompt: text sent as the user message.

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns for the request.
    """
    # Imported locally so the function does not depend on another cell having
    # imported these modules.
    import asyncio
    import functools

    # run_in_executor only forwards positional arguments, so bind the request
    # parameters as keyword arguments with functools.partial. Passing the dict
    # positionally, as before, did not supply model/messages as keywords.
    request = functools.partial(
        openai.ChatCompletion.create,
        model="ft:gpt-3.5-turbo-0613:leadsift::8KXQY7Yj",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, request)


# Send the prompt to the fine-tuned model and print the assistant's reply message
completion = openai.ChatCompletion.create(
    messages=[
        dict(role="system", content="You are a helpful assistant."),
        dict(role="user", content=prompt),
    ],
    model="ft:gpt-3.5-turbo-0613:leadsift::8KXQY7Yj",
)
print(completion.choices[0].message)

{
  "role": "assistant",
  "content": "0min 0s - Introduction\n0min 0s - What is supervised learning\n1min 43s - The function f\n3min 0s - Linear function\n4min 25s - Linear regression\n6min 6s - Cost function"
}
