Skip to content

Commit

Permalink
feat: summarize the entire video
Browse files Browse the repository at this point in the history
This also temporarily disables database caching;
the bypass will need to be removed later.
  • Loading branch information
WofWca committed Mar 28, 2024
1 parent ca8e4c1 commit c7e007a
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 6 deletions.
6 changes: 5 additions & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ async def summarize(vid: str):

found = find_chapters_by_vid(vid)
if found:
if (chapters and found[0].slicer != ChapterSlicer.YOUTUBE) or \
# TODO remove `True` and return `video_summary` in the `else` part
if True or (chapters and found[0].slicer != ChapterSlicer.YOUTUBE) or \
need_to_resummarize(vid, found):
logger.info(f'summarize, need to resummarize, vid={vid}')
delete_chapters_by_vid(vid)
Expand All @@ -160,6 +161,7 @@ async def summarize(vid: str):
else:
logger.info(f'summarize, found chapters in database, vid={vid}')
await do_if_found_chapters_in_database(vid, found)
# TODO `video_summary`
return build_summary_response(State.DONE, found)

if rds.exists(no_transcript_rds_key) or no_transcript:
Expand Down Expand Up @@ -312,6 +314,8 @@ async def do_summarize_job(
summarizing_rds_key = build_summarizing_rds_key(vid)
rds.set(summarizing_rds_key, 1, ex=SUMMARIZING_RDS_KEY_EX)

# TODO return `video_summary` from `summarizing` and handle it.
# Add to DB or whatever.
chapters, _ = await summarizing(
vid=vid,
trigger=trigger,
Expand Down
11 changes: 11 additions & 0 deletions prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,17 @@
Do not output any redundant explanation or information.
'''

# Prompt that asks the model to condense the per-chapter summaries into a
# single TL;DR-style summary of the whole video.
# TODO do we even need to specify that the summary is split by chapter?
# we can just output their summaries in sequence and yeah.
SUMMARIZE_CHAPTER_SUMMARIES_PROMPT = '''
You will be provided with a summary of a video split by chapter.
Your task is to summarize the entire video.
It is preferable to provide the summary in the form of 5 key points.
The summary is to act as a TL;DR and not a "student's note", i.e. it must be useful to someone who has not and will not watch the video.
Be concise.
Do not output any redundant explanation.
'''


def generate_multi_chapters_example_messages_for_4k(lang: str) -> list[Message]:
system_prompt = _GENERATE_MULTI_CHAPTERS_SYSTEM_PROMPT.format(lang=lang)
Expand Down
56 changes: 51 additions & 5 deletions summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
GENERATE_MULTI_CHAPTERS_TOKEN_LIMIT_FOR_16K, \
GENERATE_ONE_CHAPTER_SYSTEM_PROMPT, \
GENERATE_ONE_CHAPTER_TOKEN_LIMIT, \
SUMMARIZE_CHAPTER_SUMMARIES_PROMPT, \
SUMMARIZE_FIRST_CHAPTER_SYSTEM_PROMPT, \
SUMMARIZE_FIRST_CHAPTER_TOKEN_LIMIT, \
SUMMARIZE_NEXT_CHAPTER_SYSTEM_PROMPT, \
Expand All @@ -43,11 +44,16 @@ def build_summary_channel(vid: str) -> str:
return f'summary_{vid}'


def build_summary_response(state: State, chapters: list[Chapter] = []) -> dict:
def build_summary_response(
    state: State,
    chapters: "list[Chapter] | None" = None,
    video_summary: str = '',
) -> dict:
    """Build the payload that describes a summarization result.

    Args:
        state: State member; its ``.value`` is placed in the payload.
        chapters: Chapter objects to include, each converted with
            ``asdict``. ``None`` (the default) is treated as no chapters.
        video_summary: Summary of the whole video; empty string when there
            is none.

    Returns:
        A dict with the keys ``'state'``, ``'chapters'`` and
        ``'video_summary'``.
    """
    # The default is ``None`` rather than ``[]`` so that no mutable object is
    # shared between calls; callers that omit `chapters` see the same result.
    chapter_dicts = [asdict(c) for c in chapters or ()]
    return {
        'state': state.value,
        'chapters': chapter_dicts,
        'video_summary': video_summary,
    }


Expand All @@ -63,6 +69,8 @@ async def do_if_found_chapters_in_database(vid: str, chapters: list[Chapter]):
rds.delete(build_no_transcript_rds_key(vid))
rds.delete(build_summarizing_rds_key(vid))
channel = build_summary_channel(vid)
# TODO return `video_summary`. For now the call to this function in `app.py`
# is unreachable, because the database lookup there is bypassed with `if True`.
data = build_summary_response(State.DONE, chapters)
await sse_publish(channel=channel, event=SseEvent.SUMMARY, data=data)
await sse_publish(channel=channel, event=SseEvent.CLOSE)
Expand Down Expand Up @@ -169,7 +177,11 @@ async def summarize(
openai_api_key=openai_api_key,
)
if chapters:
await _do_before_return(vid, chapters)
video_summary = await _summarize_chapter_summaries(
chapters=chapters,
openai_api_key=openai_api_key,
)
await _do_before_return(vid, chapters, video_summary)
return chapters, has_exception

# Just use the "outline" field if it can be generated in 16k.
Expand Down Expand Up @@ -222,7 +234,14 @@ async def summarize(
logger.error(f'summarize, but has exception, vid={vid}, e={r}')
has_exception = True

await _do_before_return(vid, chapters)
# TODO handle `has_exception`? IDK what it does.
video_summary = await _summarize_chapter_summaries(
chapters=chapters,
openai_api_key=openai_api_key,
)

await _do_before_return(vid, chapters, video_summary)
# TODO return `video_summary`
return chapters, has_exception


Expand Down Expand Up @@ -570,9 +589,36 @@ async def _summarize_chapter(
data=build_summary_response(State.DOING, [chapter]),
)

async def _summarize_chapter_summaries(
chapters: list[Chapter],
# TODO need to use `lang`?
# lang: str,
openai_api_key: str = '',
):
system_message = build_message(Role.SYSTEM, SUMMARIZE_CHAPTER_SUMMARIES_PROMPT)
# TODO can mashing chapters together like this into a single prompt
# make separation between them unclear to the assistant?
chapter_strings = [f"## {c.chapter}\n\n {c.summary}" for c in chapters]
user_message = build_message(Role.USER, "\n\n".join(chapter_strings))
# TODO handle token limit, though the chapter summary should be
# well within it.
body = await chat(
messages=[system_message, user_message],
model=Model.GPT_3_5_TURBO,
# TODO I just copy-pasted this from the rest of the code
# https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p
top_p=0.1,
timeout=90,
api_key=openai_api_key,
)
summary = get_content(body).strip()
logger.info(f"got video summary. len: {len(summary)}")
# TODO sse_publish ? What does it do?
return summary

async def _do_before_return(vid: str, chapters: list[Chapter]):

async def _do_before_return(vid: str, chapters: list[Chapter], video_summary: str):
    """Publish the finished summary for `vid` and then close its channel.

    Args:
        vid: Video id used to build the summary channel name.
        chapters: Chapters to include in the published payload.
        video_summary: Whole-video summary to include in the published
            payload.
    """
    channel = build_summary_channel(vid)
    # Build the payload once, including the whole-video summary; the earlier
    # payload without `video_summary` was overwritten before being used.
    data = build_summary_response(State.DONE, chapters, video_summary)
    await sse_publish(channel=channel, event=SseEvent.SUMMARY, data=data)
    await sse_publish(channel=channel, event=SseEvent.CLOSE)

0 comments on commit c7e007a

Please sign in to comment.