In [None]:
import asyncio
import json
import re
import time

import numpy as np
import textract
from IPython.display import display, Markdown, HTML
from sydney import SydneyClient

In [None]:
# Copy cookie from `create?BundleVersion` request to BING_COOKIES envar to enable authentication
# NOTE(review): no cookie is passed explicitly below, so the client presumably picks up
# BING_COOKIES from the environment on its own -- confirm against the sydney library docs.
# This single client instance is reused by every request cell below; the conversation
# style is set to "precise".
sydney = SydneyClient(style="precise")

In [None]:
# [Optional] Test connection
await sydney.start_conversation()
res = await sydney.ask("What is H2ckathon? Respond in 1 sentence.")
print(res)
await sydney.close_conversation()

In [None]:
# Get input; use `ln -s` to symlink files from Windows to /data or copy
# All known transcripts, keyed by session; change `selected_transcript` to switch input.
transcript_files = {
    "session_1": "data/h2ckathon_transcript_session_1.docx",
    "session_2_part_1": "data/h2ckathon_transcript_session_2_part_1.docx",
    "session_2_part_2": "data/h2ckathon_transcript_session_2_part_2.docx",
    "session_3_part_1": "data/h2ckathon_transcript_session_3_part_1.docx",  # modelling
    "session_3_part_2": "data/h2ckathon_transcript_session_3_part_2.docx",  # discussion
}
selected_transcript = "session_1"
filepath = transcript_files[selected_transcript]
# The extraction result is decoded as UTF-8 so the transcript is held as one string
extracted = textract.process(filepath)
doc = extracted.decode("utf-8")

In [None]:
# Character count of the extracted transcript text
len(doc)

In [None]:
# Remove timestamps that end a line, together with the line break after them.
# The pattern accepts any number of colon-separated fields, so h:mm:ss stamps are
# removed in full (a plain `\d+:\d+\n` would strip only "mm:ss" and leave "h:" behind).
doc = re.sub(r'\d+(?::\d+)+\n', '', doc)
# Replace multiple line breaks with a single one
doc = re.sub(r'\n+', '\n', doc)

In [None]:
# Character count of the transcript, and that count expressed in units of 18,000
# characters (the per-query character limit the chunking below is sized for)
len(doc), round(len(doc) / 18000, 2)

In [None]:
# separate document lines into chunks; this is because of the limit of 18k characters per query
# Steps: break the text into lines, deal the lines into `n_chunks` groups of
# near-equal line count, then glue each group back into a single string.
n_chunks = 10
doc_lines = doc.split("\n")
doc_chunks = []
for line_group in np.array_split(doc_lines, n_chunks):
    doc_chunks.append("\n".join(line_group))

In [None]:
# verify each chunk is < 18k characters
# The check is enforced rather than left to visual inspection, so an oversized chunk
# stops the notebook here instead of producing a query that is too long further down.
MAX_QUERY_CHARS = 18000
chunk_lengths = [len(chunk) for chunk in doc_chunks]
assert max(chunk_lengths, default=0) < MAX_QUERY_CHARS, (
    f"chunk lengths {chunk_lengths} exceed the {MAX_QUERY_CHARS}-character query limit; "
    "increase n_chunks and re-run the chunking cell"
)
# Still show the individual lengths as the cell output
chunk_lengths

# 1. Summarise the text into bullet points

In [None]:
# 0-based index into doc_chunks of the transcript part to summarise
# (the summary prompt is formatted with i+1, i.e. a 1-based part number)
i = 1

In [None]:
# Conference description passed to the model as background for the summary
context = (
    "\n"
    "We believe that relying solely on round-the-clock electrolyser dispatching is not the ultimate solution, as it won’t be matched by green energy production or low prices. That's why we gathered industry leaders in 3 roundtable sessions to share insights, align on best practices, and shape open-source proceedings that will drive progress across the sector.\n"
)
# Summary prompt template. Placeholders: {i} part number, {context} the description
# above (wrapped in single quotes), {transcript} the chunk text (wrapped in backticks).
query_summary = (
    "Given conference description in quotes for context, use part {i} of audio transcript from the conference in backticks to list the main points and subtopics raised during the conference.\n"
    "'{context}'\n"
    "`{transcript}`\n"
)

In [None]:
# Start a conversation on the shared client before sending the prompts below
await sydney.start_conversation()

In [None]:
summary = await sydney.ask(query_summary.format(i=i+1, context=context, transcript=doc_chunks[i]), search=False)

In [None]:
# Show the reply exactly as returned (unrendered)
summary

In [None]:
# Render the same reply as Markdown for easier reading
display(Markdown(summary))

In [None]:
# Follow-up prompt asking the model to restructure its summary as a JSON dictionary
# mapping each topic to a list of points
query_format_summary = (
    "Process the text in backticks into a dictionary. "
    "Return as json without line breaks: '{topic: [points]}'."
)

In [None]:
formatted_summary = await sydney.ask(query_format_summary + f" `{summary}`", search=False)
formatted_summary

In [None]:
# Extract the JSON object embedded in the model's reply and parse it into a dict.
# The decoder is started at each "{" in turn until one position parses. This tolerates
# prose around the object and, unlike cutting the text at the first "}", copes with
# braces inside strings or nested objects. A reply without any parseable object raises
# a ValueError that shows the reply.
if isinstance(formatted_summary, str):  # already parsed when this cell is simply re-run
    reply_text = formatted_summary
    json_decoder = json.JSONDecoder()
    for brace in re.finditer(r'\{', reply_text):
        try:
            # raw_decode returns (object, end_index); only the object is needed
            formatted_summary = json_decoder.raw_decode(reply_text, brace.start())[0]
        except json.JSONDecodeError:
            continue
        break
    else:
        raise ValueError(f"No JSON object found in response: {reply_text!r}")
formatted_summary

In [None]:
# await sydney.close_conversation()

# 2. Ask a clarifying question about each bullet point

In [None]:
# One-shot example for the question prompt: a topic, its bullet points, and the kind of
# question expected back for them
example_topic = "Organizational Details"
example_points = [
    "Each session started with a short presentation, followed by a discussion and a 10-minute break.",
    "The event was recorded for the purpose of creating a transcript.",
    "Proceedings from the conference were planned to be published after the third session.",
    "Pictures were taken for marketing purposes.",
]
example_response = (
    "What did the speaker say about the organizational details, "
    "such as format, recording and marketing?"
)

In [None]:
# Suffix appended to a question so the answer comes back as a paper-style paragraph;
# the leading space separates it from the question it is appended to
query_format_paper = (
    " Present the speaker's views in a form of a scientific paper paragraph,"
    " removing the speaker's agency (don't mention the speaker or the conference),"
    " don't use overly complicated language."
)

In [None]:
# check if the example question gives intended answer
res_example = await sydney.ask(example_response + query_format_paper, search=False)
res_example

In [None]:
# Prompt template that asks the model for one clarifying question about a topic.
# Placeholders: {example_topic}/{example_points}/{example_response} form the one-shot
# example; {topic}/{points} are the topic being queried (wrapped in backticks).
# Spelling of "backticks" corrected so it matches the wording of the other prompts.
query_question = """Ask a question about the topics given in backticks to better understand what the speaker's standpoint was. Use the example for reference.
Example:
topic: {example_topic}, points: {example_points}, question: {example_response}
Query:
`topic: {topic}, points: {points}`
"""

In [None]:
# Pick one topic from the parsed summary to try the question prompt on.
# The topic names come from the model's reply, so this hard-coded one may be absent;
# in that case fail with the list of available topics rather than a bare KeyError.
topic = "Assumptions and Scenarios"
if topic not in formatted_summary:
    raise KeyError(f"{topic!r} is not in the summary; available topics: {list(formatted_summary)}")
points = formatted_summary[topic]

In [None]:
question = await sydney.ask(query_question.format(example_topic=example_topic, example_points=example_points, example_response=example_response, topic=topic, points=points), search=False)
question

# 3. Respond with clarification in the form of an article paragraph

In [None]:
answer = await sydney.ask(question.split("\n\n")[-1] + query_format_paper, search=False)
answer

In [None]:
# Close the conversation used for the manual walkthrough above
await sydney.close_conversation()

# Automate

In [None]:
# Define queries for presentation transcripts (sessions 1-2)
# Everything the automated run needs is (re)defined in this one cell.

# Conference description passed to the model as background for the summary
context = """
We believe that relying solely on round-the-clock electrolyser dispatching is not the ultimate solution, as it won’t be matched by green energy production or low prices. That's why we gathered industry leaders in 3 roundtable sessions to share insights, align on best practices, and shape open-source proceedings that will drive progress across the sector.
"""
# Summary prompt template; placeholders: {i} part number, {context}, {transcript}
query_summary = """Given conference description in quotes for context, use part {i} of audio transcript from the conference in backticks to list the main points and subtopics raised during the conference.
'{context}'
`{transcript}`
"""
# Follow-up prompt asking for the summary as a JSON dictionary of topic -> points
query_format_summary = "Process the text in backticks into a dictionary. Return as json without line breaks: '{topic: [points]}'."
# One-shot example (topic, points, expected question) used inside the question prompt
example_topic = "Organizational Details"
example_points = [
  'Each session started with a short presentation, followed by a discussion and a 10-minute break.',
  'The event was recorded for the purpose of creating a transcript.',
  'Proceedings from the conference were planned to be published after the third session.',
  'Pictures were taken for marketing purposes.']
example_response = "What did the speaker say about the organizational details, such as format, recording and marketing?"
# Question prompt template; spelling of "backticks" corrected so it matches the other prompts
query_question = """Ask a question about the topics given in backticks to better understand what the speaker's standpoint was. Use the example for reference.
Example:
topic: {example_topic}, points: {example_points}, question: {example_response}
Query:
`topic: {topic}, points: {points}`
"""
# Suffix appended to each question so the answer comes back as a paper-style paragraph
query_format_paper = " Present the speaker's views in a form of a scientific paper paragraph, removing the speaker's agency (don't mention the speaker or the conference), don't use overly complicated language."

In [None]:
# Modify for discussion transcripts (session 3)
# Running this cell overrides the summary prompt, the formatting prompt and the one-shot
# example with variants that distinguish presenter and audience contributions.
# Skip it when processing the presentation transcripts.
query_summary = """
Given conference description in quotes for context, use part {i} of audio transcript from the conference in backticks to list the main points and subtopics raised during the conference. For each topic, label subtopics by whether they were raised by the presenter or the audience. Make sure to include discussions between presenter and the audience, and both sides of disagreements.
'{context}'
`{transcript}`
"""
# The presenter/audience labels requested above are dropped again when converting to JSON
query_format_summary = "Process the text in backticks into a dictionary. Return as json without line breaks: '{topic: [points]}'. Ignore the split to presenter, audience, disagreements."
example_topic = "Discussion on Flexibility"
example_points = ["The audience engaged in a discussion on the flexibility of the electrolyzer, questioning the practicality of turning it on and off based on electricity prices."]
# Duplicated word ("about about") removed from the example question
example_response = "What were the audience's concerns about electrolyser flexibility and what did they say about the practicality of turning it on and off based on electricity prices?"

In [None]:
# 1. Summarise the text into bullet points
i = 1
await sydney.start_conversation()
summary = await sydney.ask(query_summary.format(i=i+1, context=context, transcript=doc_chunks[i]), search=False)
display(Markdown(summary))

In [None]:
# if not satisfied with the summary, close connection and rerun
# await sydney.close_conversation()

In [None]:
formatted_summary = await sydney.ask(query_format_summary + f" `{summary}`", search=False)
formatted_summary = formatted_summary.replace("\n", "")
# formatted_summary = json.loads(re.findall(r'\{.*?\}', formatted_summary)[0])
formatted_summary = json.loads("{" + re.findall(r'\{((.|\s)*?)\}', formatted_summary)[0][0] + "}")
formatted_summary

In [None]:
# 2. Ask a clarifying question about each bullet point
# 3. Answer as a paper paragraph
paragraphs = {}
for topic, points in formatted_summary.items():
    # if topic in ['Merit Order of Demand', 'Subsidization', 'Energy Utilization', 'Brainstorming Session', 'Closing Remarks']:
        # continue
    print("processing topic", topic)
    question = await sydney.ask(query_question.format(example_topic=example_topic, example_points=example_points, example_response=example_response, topic=topic, points=points), search=False)
    answer = await sydney.ask(question.split("\n\n")[-1] + query_format_paper, search=False)
    paragraphs[topic] = answer
    # avoid captcha
    time.sleep(10)

In [None]:
# Print every topic followed by its paragraph, leaving blank lines between entries
for heading, paragraph in paragraphs.items():
    print(heading, paragraph, "\n", sep="\n")

In [None]:
# Close the conversation opened for the automated run
await sydney.close_conversation()

---