In [None]:
# Refresh the apt package index so the ffmpeg install in the next cell
# resolves against current package lists.
!apt-get update

In [None]:
!apt-get install ffmpeg

In [None]:
!pip install langchain-community==0.2.5 langchain-core==0.2.9 langchain-openai==0.1.9 pydub

In [None]:
import os
import configparser

from langchain.chat_models import ChatOpenAI


def credential_init(credential_file="credentials.ini"):
  """Make sure ``OPENAI_API_KEY`` is set in the process environment.

  The key is looked up in this order and the first non-empty value wins:

  1. ``api_key`` in the ``[openai]`` section of ``credential_file``
     (only when that file exists).
  2. The ``OPENAI`` environment variable.
  3. An ``OPENAI_API_KEY`` environment variable that is already set.

  Args:
    credential_file: Path of the INI file holding the key. Defaults to
      ``"credentials.ini"`` in the current working directory.

  Raises:
    KeyError: If none of the sources above provides a key.
  """
  api_key = None

  if os.path.exists(credential_file):
    credentials = configparser.ConfigParser()
    credentials.read(credential_file)
    # fallback=None: a missing [openai] section or api_key option should fall
    # through to the environment variables instead of raising here.
    api_key = credentials.get('openai', 'api_key', fallback=None)

  if not api_key:
    api_key = os.environ.get('OPENAI') or os.environ.get('OPENAI_API_KEY')

  if not api_key:
    # os.environ only accepts str values, so fail with an explicit message
    # rather than letting a None assignment raise an unrelated TypeError.
    raise KeyError(
      "No OpenAI API key found: add api_key to the [openai] section of "
      f"{credential_file!r} or set the OPENAI / OPENAI_API_KEY environment variable"
    )

  os.environ['OPENAI_API_KEY'] = api_key

# Put the OpenAI key into the environment before anything reads it.
credential_init()


# Chat model shared by the prompt pipelines further down; temperature=0 asks
# for the least random sampling.
model = ChatOpenAI(
  model_name="gpt-4o-mini-2024-07-18",
  temperature=0,
  openai_api_key=os.environ['OPENAI_API_KEY'],
)

In [None]:
from pydub import AudioSegment
from openai import OpenAI

# No key is passed explicitly; the OpenAI SDK falls back to the
# OPENAI_API_KEY environment variable.
client = OpenAI()

# Experiment 1: two pieces
# Explicit steps

speech = AudioSegment.from_mp3("05_12_2013_Torti_CLAS_1.mp3")

# PyDub handles time in milliseconds
one_second = 1000
one_minute = 1 * 60 * one_second

# 3 seconds overlap
overlap = 3 * one_second

# First minute, and the second minute started `overlap` ms early so the two
# pieces share a short stretch of audio.
signal_1 = speech[:one_minute]
signal_2 = speech[one_minute - overlap: 2 * one_minute]

signal_1.export("signal_1.mp3", format="mp3")
signal_2.export("signal_2.mp3", format='mp3')

# Open each exported file in a context manager so the handle is closed as
# soon as the upload finishes instead of staying open for the kernel's life.
with open("signal_1.mp3", 'rb') as audio_file:
  transcription_1 = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file
  )

with open("signal_2.mp3", 'rb') as audio_file:
  transcription_2 = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file
  )

In [None]:
# Show the text transcribed from the first piece (signal_1.mp3).
transcription_1.text

In [None]:
# Show the text transcribed from the second piece (signal_2.mp3).
transcription_2.text

In [None]:
from langchain.prompts import PromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
from langchain_core.output_parsers.string import StrOutputParser

# --- Prompt templates -------------------------------------------------------
# System side: fixed instruction text, no placeholders.
system_prompt = PromptTemplate.from_template(
    """
    You are a AI assistant as a copywriter
    You are assigned with a task of concatenate two texts <text_1> and <text_2>
    """
)

# Human side: carries the two texts through the {text_1} / {text_2} placeholders.
human_prompt = PromptTemplate.from_template("""
                    <text_1>: {text_1};
                    <text_2>: {text_2};
                    """)

# --- Chat messages ----------------------------------------------------------
system_message = SystemMessagePromptTemplate(prompt=system_prompt)
human_message = HumanMessagePromptTemplate(prompt=human_prompt)

chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# --- Chain: prompt -> chat model -> plain string ----------------------------
pipeline_ = chat_prompt | model
pipeline_ = pipeline_ | StrOutputParser()

In [None]:
# Feed both transcripts through the pipeline; the returned string is shown as
# the cell output.
pipeline_.invoke(dict(text_1=transcription_1.text,
                      text_2=transcription_2.text))

In [None]:
# system_prompt = PromptTemplate.from_template(
#     """
#     You are a helpful AI assistant assigned with a task of concatenating
#     two pieces of text <text_1> and <text_2> with the end of
#     <text_1> and the begin of <text_2> are overlapped.

#     Both <text_1> and <text_2> are extracted from the same piece of text.
#     Please keep all the content.
#     """)

# system_message = SystemMessagePromptTemplate(prompt=system_prompt)

# human_prompt = PromptTemplate(template="""
#                       <text_1>: {text_1}
#                       <text_2>: {text_2}
#                       """)
# human_message = HumanMessagePromptTemplate(prompt=human_prompt)

# chat_prompt = ChatPromptTemplate.from_messages([system_message,
#                           human_message
#                           ])

# pipeline_ = chat_prompt | model | StrOutputParser()

# pipeline_.invoke({"text_1":transcription_1.text, "text_2":transcription_2.text})

## 我們可以從頭到尾把整段錄音的文字接起來嗎? 想像一下若是音檔很大的話。

In [None]:
# Transcribe the recording chunk by chunk and fold every new transcript into
# the running text with `pipeline_`.
#
# After the loop:
#   result    - the merged text of all processed chunks
#   text_list - [result], or [] if the recording was empty

speech = AudioSegment.from_mp3("05_12_2013_Torti_CLAS_1.mp3")

# PyDub handles time in milliseconds
one_second = 1000

# Each chunk covers two minutes of audio.
chunk_length = 2 * 60 * one_second
# NOTE: despite its name, `one_minute` holds the two-minute chunk length.
# The name is kept because a later cell slices the recording with it.
one_minute = chunk_length

# 3 seconds overlap: every chunk runs this far into the next one
overlap = 3 * one_second

# For demo purposes, to save time: stop after this many chunks.
max_chunks = 5

text_list = []
result = ""
count = 0

while count < max_chunks:
  chunk_start = count * chunk_length
  chunk_end = chunk_start + chunk_length + overlap
  signal = speech[chunk_start:chunk_end]

  # Nothing left to transcribe; never export or upload an empty clip.
  if len(signal) == 0:
    break

  signal.export("signal.mp3", format="mp3")
  # The context manager closes the handle after each upload instead of
  # leaving one open file per iteration.
  with open("signal.mp3", 'rb') as audio_file:
    transcription = client.audio.transcriptions.create(
      model="whisper-1",
      file=audio_file
    )

  print("\n")
  print(f"Paragraph {count}")
  print(transcription.text)
  print("\n")
  print("********************************************")

  if count == 0:
    # First chunk: there is nothing to merge with yet.
    result = transcription.text
  else:
    result = pipeline_.invoke({"text_1": result, "text_2": transcription.text})
  text_list = [result]
  count += 1

  # This chunk already reached the end of the recording. The check also runs
  # for the first chunk, so a recording shorter than one chunk stops here.
  if chunk_end >= len(speech):
    break

In [None]:
# Show the merged text left in `result` by the chunk loop above.
result

In [None]:
# Show the last entry of text_list; the loop resets text_list to [result]
# after every merge, so this is the merged text.
text_list[-1]

## 實踐出真理

In [None]:
# Transcribe the first 5 * one_minute milliseconds of the recording in a
# single request (`one_minute` is whatever the previous cell left it as).
signal = speech[:5 * one_minute]
signal.export("signal.mp3", format="mp3")

# Context manager so the uploaded file's handle is closed afterwards.
with open("signal.mp3", 'rb') as audio_file:
  transcription = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file
  )
print(transcription.text)

In [None]:
# Show the single-request transcript as the cell output.
transcription.text