In [7]:
import os
import re
# from time import time,sleep
import openai
from youtube_transcript_api import YouTubeTranscriptApi
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv

### Parsing video ID from URL

In [8]:
# video URL will be received through GET request from quasar frontend
# 11 char ID must then be parsed from link
url = "https://www.youtube.com/watch?v=Unl1jXFnzgo" # MANUAL TRANSCRIPT
# url = "https://www.youtube.com/watch?v=kBwOy-6CtAQ"
# url = "https://www.youtube.com/watch?v=kF4ju6j6aLE"
# url = "https://www.youtube.com/watch?v=TQCr9RV7twk"
vid_id = url.split("?")[1][2:13] # chars 2-13 of 2nd list item

# print(vid_id)

### Obtaining transcript of videos

In [9]:
# SRT format is then obtained through YT transcript API
try:
    srt = YouTubeTranscriptApi.get_transcript(vid_id)
except:
    print("unavailable")

# get length of video in seconds
video_length = int(srt[-1]['start'] + srt[-1]['duration'])
# print(video_length)

# keep track of number of timestamps
num_timestamps = len(srt)

# tokens = [segment['text'] for segment in srt]
vid_segments = {}

# iterate through srt list and separate timestamps in 5min interval
interval = 300 # start in first 5min interval

# add text segments to dictionary based in timestamp intervals
for text_seg in srt:
    text = " ".join(text_seg['text'].split()) # remove newlines, space and rejoin sentences

    if text_seg['start'] < interval or "." in text_seg['text']: # include . to stop mid sentence slicing
        if interval not in vid_segments :
            vid_segments[interval] = [text]
        else:
            vid_segments[interval].append(text)
    else:
        interval += 300 # move onto next interval
        vid_segments[interval] = [text] # don't lose current segment

# join text for each timestamp segment
for timestamp in vid_segments:
    vid_segments[timestamp] = " ".join(vid_segments[timestamp])

print(vid_segments)

{300: '400 years ago, Galileo started piecing together the basic principles of reality—what we now call modern science. But the questions he was trying to answer are as old as humanity itself. What are we made of? What are the fundamental building blocks of the universe from which you, me, the stars, and everything else is constructed? In the centuries since Galileo, thousands of theories and experiments have peered into smaller and smaller distances... converging on a single picture of the structure of matter. This somewhat daunting-looking formula is where we end up. It gives the correct answer to hundreds of thousands of experiments, in some cases with an accuracy that is unprecedented in science. It is, by any measure, the most successful scientific theory of all time. And yet for something so extraordinary, we give it a rubbish name. We call it the Standard Model. I’m David Tong, a theoretical physicist at the University of Cambridge. And in this video, we’re going to build the St

In [10]:

# Determine whether transcript is written manually or auto-generated
transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
transcript = transcript_list.find_transcript(['en']) 
print(transcript.is_generated)

False


### Rewriting video segments for auto-generated transcripts

In [13]:
# Create functions for rewriting transcript using GPT-3

import time
st = time.time()


load_dotenv() # take environment variables from .env
openai.api_key = os.getenv("OPENAI_API_KEY")

# query gpt api
def rewrite_segment(extract):
    response = openai.Completion.create(
        model="text-curie-001",
        prompt=rewrite_prompt(extract),
        temperature=0.6,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return (response.choices[0].text.strip("\n")) # need to remove unnecessary chars from response

# generate prompt for GPT-3
def rewrite_prompt(extract):
    return f"The following is an unstructured video transcript. Please rewrite this as a more structured, clear and concise essay.\n\nTranscript: \"\"\"\n{extract}\n\"\"\"\n"

# Determine whether transcript is written manually or auto-generated
transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
transcript = transcript_list.find_transcript(['en']) 
# print(transcript.is_generated)

# make api call for each segment -> need to parallelise
if transcript.is_generated:
    vid_segments = {stamp: rewrite_segment(vid_segments[stamp]) for stamp in vid_segments}
    
print(vid_segments)

et = time.time()
print(f"\nEXECUTION TIME TAKEN IS {et - st}")


{300: '400 years ago, Galileo started piecing together the basic principles of reality—what we now call modern science. But the questions he was trying to answer are as old as humanity itself. What are we made of? What are the fundamental building blocks of the universe from which you, me, the stars, and everything else is constructed? In the centuries since Galileo, thousands of theories and experiments have peered into smaller and smaller distances... converging on a single picture of the structure of matter. This somewhat daunting-looking formula is where we end up. It gives the correct answer to hundreds of thousands of experiments, in some cases with an accuracy that is unprecedented in science. It is, by any measure, the most successful scientific theory of all time. And yet for something so extraordinary, we give it a rubbish name. We call it the Standard Model. I’m David Tong, a theoretical physicist at the University of Cambridge. And in this video, we’re going to build the St

In [15]:
# Create functions for rewriting transcript using GPT-3

import time
st = time.time()

openai.api_key = "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"

# query gpt api
def rewrite_segment(extract):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=rewrite_prompt(extract),
        temperature=0.6,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return (response.choices[0].text.strip("\n")) # need to remove unnecessary chars from response

# generate prompt for GPT-3
def rewrite_prompt(extract):
    return f"The following is an unstructured video transcript. Please rewrite this as a more structured, clear and concise essay.\n\nTranscript: \"\"\"\n{extract}\n\"\"\"\n"

# Determine whether transcript is written manually or auto-generated
transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
transcript = transcript_list.find_transcript(['en']) 
# print(transcript.is_generated)

# make api call for each segment -> need to parallelise
if transcript.is_generated:
    vid_segments = {stamp: rewrite_segment(vid_segments[stamp]) for stamp in vid_segments}
    
print(vid_segments)

et = time.time()
print(f"\nEXECUTION TIME TAKEN IS {et - st}")


{300: '400 years ago, Galileo started piecing together the basic principles of reality—what we now call modern science. But the questions he was trying to answer are as old as humanity itself. What are we made of? What are the fundamental building blocks of the universe from which you, me, the stars, and everything else is constructed? In the centuries since Galileo, thousands of theories and experiments have peered into smaller and smaller distances... converging on a single picture of the structure of matter. This somewhat daunting-looking formula is where we end up. It gives the correct answer to hundreds of thousands of experiments, in some cases with an accuracy that is unprecedented in science. It is, by any measure, the most successful scientific theory of all time. And yet for something so extraordinary, we give it a rubbish name. We call it the Standard Model. I’m David Tong, a theoretical physicist at the University of Cambridge. And in this video, we’re going to build the St

In [57]:
# CONCURRENCY

st = time.time()

# function to change values of timestamped video segments dict
def join_segment(vid_dict, timestamp, text):
    vid_dict[timestamp] = rewrite_segment(text)
    return timestamp, vid_dict[timestamp]

# function to run
def runner(vid_segments):
    threads= []
    with ThreadPoolExecutor(max_workers=len(vid_segments)) as executor:
        for stamp, text in vid_segments.items():
            threads.append(executor.submit(join_segment, vid_segments, stamp, text))
            
#         for task in as_completed(threads):
#             print(f"{task.result()}-----------\n")

test = runner(vid_segments)
print(test)

et = time.time()
print(f"\nEXECUTION TIME TAKEN IS {et - st}")

None

EXECUTION TIME TAKEN IS 25.08240509033203


### Summarise segments of video transcript

In [16]:
# create functions to make API calls to GPT-3 for summarisation

# api key should be stored locally in file or as an environment variable
openai.api_key = "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"

# query gpt api
def sub_summary(extract):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=sub_prompt(extract),
        temperature=0.6,
        max_tokens=2000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return (response.choices[0].text.strip("\n")) # need to remove unnecessary chars from response

# generate prompt for gpt3
def sub_prompt(extract):
    return f"Summarise the following extract from a video transcript.\n\nTranscript: \"\"\"\n{extract}\n\"\"\"\n"

st = time.time()
# make api call for each segment -> need to parallelise
sub_summaries = [] # contains summaries of each video segment

# # summarise sub sections of transcript
for segment in vid_segments:
    sub_summaries.append(sub_summary(vid_segments[segment].strip()))

# TESTING
# for sub in sub_summaries:
#     print(len(sub))

# print("-----------")
print(sub_summaries)

et = time.time()
print(f"\nEXECUTION TIME TAKEN IS {et - st}")

RateLimitError: You exceeded your current quota, please check your plan and billing details.

### Concurrent API Calls

In [58]:
st = time.time()

def update_segment(func, vid_dict, timestamp, text):
    vid_dict[timestamp] = func(text) # assign new text to dict key
    return timestamp, vid_dict[timestamp]


# function to run tasks in parallel
def thread_runner(func, vid_segments):
    threads = []
    with ThreadPoolExecutor(max_workers=len(vid_segments)) as executor:
        for stamp, text in vid_segments.items():
            # pass necessary values and intented func to helper
            threads.append(executor.submit(update_segment, func, vid_segments, stamp, text))
            
    return [task.result() for task in as_completed(threads)]

thread_runner(sub_summary, vid_segments)
print(vid_segments)

et = time.time()
print(f"\nEXECUTION TIME TAKEN IS {et - st}")

{300: 'Python is widely used in the finance industry, which has a market cap of 20% of the 500 largest companies in the US and accounts for 26% of all corporate profits. Python is used for trading, risk management, and building complex trading systems, as it can handle large amounts of data and calculations. Knowing how Python is used in the finance industry can open up job and product opportunities, as well as help people manage their personal finances.', 600: 'Financial technology (fintech) is an industry that requires sophisticated software to process complex data in real-time. Front Arena is an example of this software, which is highly customizable and costly. Python is used to write the business logic and stored in the database, allowing for instant changes. Hedge funds and quants use big data, machine learning and AI to make money through stock market investments. Speed is essential, so Python code is recoded in C++ and built into an FPGA. Crowdsourcing is also becoming popular, 

### Create overall summary based on summarised segments

In [52]:
# query gpt api
def meta_summary(extract):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=meta_prompt(extract),
        temperature=0.6,
        max_tokens=1000,
    )
    return (response.choices[0].text) # need to remove unnecessary chars from response

# generate prompt for gpt3
def meta_prompt(extract):
    return f"Create a complete overall summary of the following extract from a video transcript.\n\nTranscript extract: \"\"\"\n{extract}\n\"\"\"\n"

print(meta_summary("\n".join(sub_summaries)))
# print(" ".join(sub_summaries))


This extract discusses the use of machine learning for malware detection, mainly focusing on mobile applications. Traditional malware detection uses static and dynamic analysis, and signature-based detection, which attackers are now using machine learning techniques to evade. Deep learning can be used to detect behaviour, but data must be kept up to date for the models to be effective. Overfitting is a problem in machine learning and malware detection, where the system is good at identifying trained data but not new data. To increase accuracy, a Python program can be written to create a function that can identify the category of an image, reducing false positives and potentially being the Holy Grail of malware detection.
