<a href="https://colab.research.google.com/github/nariakiiwatani/easylazyscrap_speech/blob/main/easylazyscrap_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests openai gtts newspaper3k langchain faiss-gpu tiktoken pydub mutagen

In [None]:
import os
import numpy as np

# for local run
# from dotenv import load_dotenv
# load_dotenv()

# for colab run
import json
from google.colab import drive
# Mount Google Drive, then export every key/value pair found in the secrets
# JSON file as an environment variable.
drive.mount('/content/drive', force_remount=True)
with open('/content/drive/MyDrive/Colab Notebooks/secrets/openai.json') as jsonfile:
    secrets = json.load(jsonfile)
for k, v in secrets.items():
    os.environ[k] = v



In [None]:
import datetime
from pytz import timezone

def generate_recent_page_urls(base_url, base_date, within_days):
    """Build the text-API URLs of the daily scrap pages, newest first.

    Args:
        base_url: URL prefix placed in front of ``/scrap_<date>/text``.
        base_date: Date of the newest page.
        within_days: Number of consecutive days to cover, counting back
            from ``base_date``.

    Returns:
        A list of ``within_days`` URLs. The date is rendered as
        ``YYYY%2FMM%2FDD`` (a URL-encoded ``YYYY/MM/DD``).
    """
    one_day = datetime.timedelta(days=1)
    return [
        "{}/scrap_{}/text".format(
            base_url,
            (base_date - one_day * days_back).strftime("%Y%%2F%m%%2F%d"),
        )
        for days_back in range(within_days)
    ]

# Collection window: the `within_days` daily pages ending at yesterday,
# with "today" taken in the Asia/Tokyo timezone.
within_days = 7
base_url = "https://scrapbox.io/api/pages/easylazyscrap"

tokyo = timezone('Asia/Tokyo')
now = datetime.datetime.now()
today = now.astimezone(tokyo)
base_date = today - datetime.timedelta(days=1)  # yesterday

recent_page_urls = generate_recent_page_urls(
    base_url=base_url,
    base_date=base_date,
    within_days=within_days,
)

In [None]:
import requests
import newspaper
import re

# Matches a bracketed link of the form "[<title> <url>]" with an http(s)/ftp URL.
# Written as a raw string so the regex escapes are not treated as (invalid)
# Python string escapes.
_LINK_PATTERN = re.compile(r'\[(.*?) ((?:https?|ftp):\/\/?[\w/\-?=%.]+\.[\w/\-?=%.]+)\s*\]')


def fetch_links_from_page(url, timeout=30):
    """Download a page as text and extract its bracketed links.

    Args:
        url: Address of the page text to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook forever.

    Returns:
        A list of ``(link_text, link_url)`` tuples, at most one per line (the
        first match on that line). An unsuccessful HTTP status (for example a
        day without a page) yields an empty list instead of parsing the error
        body.

    Raises:
        Whatever ``requests.get`` raises for connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        return []

    links = []
    for line in response.text.splitlines():
        match = _LINK_PATTERN.search(line)
        if match:
            links.append((match.group(1), match.group(2)))
    return links

def get_article(url):
    """Download and parse the page at ``url`` with newspaper.

    Returns:
        The ``newspaper.Article`` object after ``download()`` and ``parse()``
        have been called on it.
    """
    fetched = newspaper.Article(url)
    fetched.download()
    fetched.parse()
    return fetched

# links:    every (title, url) pair found on the recent pages.
# articles: (title, body text, url, image url or None) for each link whose
#           article could be downloaded and has non-empty text.
links = []
articles = []
for page_url in recent_page_urls:
    new_links = fetch_links_from_page(page_url)
    links.extend(new_links)
    for title, url in new_links:
        try:
            article = get_article(url)
            if article.text != "":
                articles.append((title, article.text, url, article.meta_img or article.meta_favicon or None))
        except Exception as error:
            # Best effort: an unreachable or unparsable article is skipped, but
            # the failure is reported instead of being silently discarded.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print(f"skipped {url}: {error}")
            continue



In [None]:
# Randomly keep at most `choice` articles (sampling without replacement).
choice = 8
choice = min(choice, len(articles))

idx = np.random.choice(np.arange(len(articles)), choice, replace=False)

# Index the Python list directly instead of going through np.array(articles):
# that conversion turns the tuples into a 2-D string/object array, so each
# element would come back as a numpy row of numpy strings rather than the
# original (title, text, url, img) tuple.
articles = [articles[i] for i in idx]


In [None]:
import os
import openai

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
import faiss

# Splitter used to cut article text on newlines (chunk_size=140, no overlap).
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=140,
    chunk_overlap=0,
)

# faiss GPU resource handle and flat-index configuration.
res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()

# Per-article clustering results are accumulated in this list.
clustered = []
def clusterize(text, n, max_clusters=5):
    """Group the chunks of ``text`` by embedding similarity.

    The text is split with the module-level ``text_splitter``, each chunk is
    embedded with OpenAI's ``text-embedding-ada-002`` model, and the embeddings
    are clustered with faiss k-means.

    Args:
        text: Article body to split and cluster.
        n: Maximum number of chunks kept per cluster; the chunks closest to
            the cluster centroid are kept.
        max_clusters: Upper bound on the number of clusters. The actual number
            is the smaller of this value and the number of chunks.

    Returns:
        ``False`` when the text yields no chunks or no embeddings; otherwise a
        dict mapping each cluster label to a list of up to ``n`` chunk strings
        ordered from nearest to farthest from the centroid.
    """
    documents = text_splitter.create_documents([text])
    texts = [doc.page_content.replace("\n\n", "") for doc in documents]
    # Bail out before the embedding request: there is nothing to embed.
    if not texts:
        return False

    response = openai.Embedding.create(input=texts, model="text-embedding-ada-002")
    embeds = [record['embedding'] for record in response['data']]
    embeds_np = np.array(embeds).astype('float32')
    if embeds_np.shape[0] == 0:
        return False
    clusters_num = min(embeds_np.shape[0], max_clusters)

    # k-means trains and searches its own index, so no separate index is built here.
    kmeans = faiss.Kmeans(d=embeds_np.shape[1], k=clusters_num, niter=20, verbose=True)
    kmeans.train(embeds_np)
    clusters = kmeans.index.search(embeds_np, 1)[1].flatten()

    # Record, per cluster, each chunk together with its distance to the centroid.
    clustered_sentences = {}
    for i, label in enumerate(clusters):
        distance = np.linalg.norm(embeds_np[i] - kmeans.centroids[label])
        clustered_sentences.setdefault(label, []).append((texts[i], distance))

    # Keep only the n chunks nearest to each centroid.
    for label, cluster in clustered_sentences.items():
        nearest_first = sorted(cluster, key=lambda item: item[1])
        clustered_sentences[label] = [sentence for sentence, _ in nearest_first[:n]]

    return clustered_sentences

def pick(src, num):
    """Return the ``num`` entries of ``src`` whose values are longest.

    Entries are ordered by ``len(value)`` descending; entries of equal length
    keep their original relative order.
    """
    ranked = list(src.items())
    ranked.sort(key=lambda entry: len(entry[1]), reverse=True)
    return {label: members for label, members in ranked[:num]}

# Keep the 3 largest clusters per article and up to 3 sentences per cluster.
pick_cluster = 3
pick_inside_cluster = 3
for title, article, url, img in articles:
    result = clusterize(article, pick_inside_cluster)
    if result == False:
        # clusterize reports "nothing to cluster" with False; skip the article.
        continue
    clustered.append((title, pick(result, pick_cluster), url, img))



In [None]:

from langchain.prompts import PromptTemplate
from langchain import OpenAI

# Japanese prompt asking for a concise summary (about 200 characters, in
# Japanese) of the text substituted for {text}. summarize() passes PROMPT as
# both the map prompt and the combine prompt of its map_reduce chain.
prompt_template = """以下の内容について200字程度の簡潔な要約を日本語で作成してください:

"{text}"

簡潔な要約:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=['text'])

from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain

def summarize(clusters):
    """Summarize each cluster of sentences with a map_reduce chain.

    Args:
        clusters: Mapping of cluster label to a list of sentence strings.

    Returns:
        A list with one summary per cluster, in the iteration order of
        ``clusters``. An empty mapping returns an empty list without creating
        a model client.
    """
    if not clusters:
        return []

    # The chain does not depend on the cluster, so build it once instead of
    # once per loop iteration.
    # NOTE(review): confirm "text-davinci-002" is still served by the API.
    chain = load_summarize_chain(
        OpenAI(
            model_name="text-davinci-002",
            temperature=0.7,
        ),
        chain_type="map_reduce",
        map_prompt=PROMPT,
        combine_prompt=PROMPT,
    )

    cluster_summaries = []
    for cluster_sentences in clusters.values():
        docs = [Document(page_content=sentence) for sentence in cluster_sentences]
        cluster_summaries.append(chain.run(input_documents=docs, token_max=2000))
    return cluster_summaries

# Replace each article's clusters with their generated summaries.
summaries = []
for title, cluster, url, img in clustered:
    summaries.append((title, summarize(cluster), url, img))


In [None]:
import os
import datetime

# The episode folder and file names are derived from today's date (YYYY_MM_DD).
basename = today.strftime("%Y_%m_%d")

export_directory = f"/content/drive/MyDrive/easylazyscrap/episodes/{basename}"
# exist_ok=True replaces the check-then-create pair: no race between the check
# and the creation, and re-running the cell is harmless.
os.makedirs(export_directory, exist_ok=True)


In [None]:
def _format_japanese_date(day):
    """Render a date as e.g. 2023年5月2日 (month and day without zero padding).

    Built from the date fields directly because the "%-m" / "%-d" strftime
    codes are a platform extension and are not available everywhere.
    """
    return f"{day.year}年{day.month}月{day.day}日"


# Last and first day of the period covered by this episode.
to_date = _format_japanese_date(base_date)
from_date = _format_japanese_date(base_date - datetime.timedelta(days=within_days-1))

# Opening section of the episode script. Plain lines are read aloud; tags of
# the form <![TYPE[ARG]]> are instructions interpreted by create_audio():
#   BGM_START[file, fade_ms]   start background music from the sound directory
#   BGM_VOLUME[gain, fade_ms]  change the background music volume
#   BGM_END[fade_ms]           fade the background music out
#   BLANK[ms]                  insert silence
#   SOUND[file]                insert a sound effect
# A negative fade_ms makes the fade finish at the current position instead of
# starting there.
podcast_script = f"""
<![BGM_START[Another_Face.wav, 0]]>
<![BLANK[23000]]>
<![BGM_VOLUME[-16, 2000]]>
<![BLANK[2000]]>
こんにちは。
<![BLANK[300]]>
今週の、イージーレイジースクラップ、パーソナリティーの、レイジーです。
今週もお聴きくださり、ありがとうございます。

<![BGM_VOLUME[-20, 1000]]>
<![BLANK[1500]]>

イージーレイジースクラップは、スクラップボックス上で「あとで読む」を共有できる、半匿名のブックマーク共有サービスです。
<![BLANK[200]]>
この番組では、この一週間でイージーレイジースクラップに投稿された内容から、いくつかをピックアップしてご紹介します。

<![BLANK[1500]]>

今回は、{from_date}から、{to_date}のスクラップをご紹介します。

<![BGM_VOLUME[0, 1000]]>
<![BLANK[1500]]>
<![BGM_END[-1000]]>

<![BLANK[1000]]>
<![SOUND[start_scrap.wav]]>
<![BLANK[2000]]>

<![BGM_START[Bouquet.wav, 5000]]>
<![BGM_VOLUME[-20, 0]]>

"""

# One chapter per summarized article: a CHAPTER tag carrying the article URL,
# the spoken title, the joined summaries, then a separator sound.
for title, summary, url, img in summaries:
    spoken_summary = ''.join(summary).replace('\n', '')
    podcast_script = podcast_script + f"<![CHAPTER[{url}]]>{title}\n{spoken_summary}<![SOUND[inter.wav]]>\n\n"

podcast_script += """今週の、ピックアップは、以上です。
<![BLANK[1000]]>
最後に、今週寄せられた、全てのスクラップのタイトルを読み上げます。
<![CHAPTER[今週のスクラップ一覧]]>

"""

# Read out the title of every collected link, each preceded by a click sound.
podcast_script += "".join(
    f"<![SOUND[click.wav]]>{link_title}\n" for link_title, link_url in links
)

# Closing section of the episode script: ends the current background music,
# restarts the opening track, and reads the sign-off. Tags of the form
# <![TYPE[ARG]]> are instructions interpreted by create_audio().
podcast_script += """
<![BLANK[3000]]>

<![BGM_END[-3000]]>

今週寄せられた、スクラップは、以上です。

<![BLANK[1000]]>

<![BGM_START[Another_Face.wav, 0]]>
<![BLANK[13000]]>

<![BGM_VOLUME[-20, -1000]]>

今週のスクラップはいかがでしたか？
<![BLANK[300]]>
ご紹介したリンクユーアールエルは、全て概要欄テキストに記載されています。
気になる記事がありましたら、ぜひご覧ください。

<![BLANK[3000]]>

イージーレイジースクラップへは、どなたでも記事をご投稿いただけます。
<![BLANK[200]]>
参加方法は、概要欄テキストのリンクからご確認ください。
<![BLANK[500]]>
これで、今週のイージーレイジースクラップを終わります。
良い週末を、お過ごしください。

<![BLANK[500]]>
ありがとうございました！
<![BLANK[500]]>
<![BGM_VOLUME[0, -1000]]>
<![BLANK[18000]]>
<![BGM_END[-300]]>
<![BLANK[3000]]>
"""


In [None]:
from pydub import AudioSegment
import gtts
import os
import re
import tempfile

def make_speech(script, options=None):
    """Synthesize ``script`` as Japanese speech with gTTS.

    Args:
        script: Text to speak.
        options: Optional dict overriding ``playback_speed`` (default 1.25)
            and ``gain`` (default -4, added to the decoded audio).

    Returns:
        The sped-up ``AudioSegment``, or ``None`` when the text is empty or
        whitespace-only, or when synthesis/decoding fails (best effort: the
        failure is printed and the caller continues without this passage).
    """
    opt = {**{'playback_speed':1.25,'gain':-4}, **(options or {})}
    # Nothing to say: skip the synthesis request entirely.
    if not script or not script.strip():
        return None
    try:
        text_to_speech = gtts.gTTS(script, lang="ja", lang_check=False)
        # The context manager closes (and thereby removes) the temporary file
        # once the audio has been decoded from it.
        with tempfile.NamedTemporaryFile() as fp:
            text_to_speech.save(fp.name)
            speech = AudioSegment.from_mp3(fp.name) + opt['gain']
        return speech.speedup(playback_speed=opt['playback_speed'])
    except Exception as error:
        print(f"make_speech: skipped {script[:30]!r}: {error}")
        return None

def loopAudio(segment, duration_ms):
  """Repeat or cut ``segment`` so that its length equals ``duration_ms``.

  A segment longer than ``duration_ms`` is truncated; otherwise it is repeated
  a whole number of times and topped up with the start of the repeated audio.
  """
  if duration_ms < len(segment):
    return segment[:duration_ms]

  looped = segment * (duration_ms // len(segment))
  # The remainder is taken against the already repeated audio.
  tail_ms = duration_ms % len(looped)
  if tail_ms > 0:
    looped = looped + looped[:tail_ms]
  return looped

def applyVolumeChanges(segment, volume_changes):
  """Apply a list of ``(position_ms, volume, duration_ms)`` changes to ``segment``.

  The first entry becomes a fade-in over its duration (its position and volume
  are not used). Every later entry is applied as a fade starting at its
  position, with the gain change expressed relative to the previous entry's
  volume (starting from 0). Durations below 1 ms are raised to 1 ms.
  """
  if volume_changes:
    _, _, first_fade_ms = volume_changes[0]
    segment = segment.fade_in(max(1, first_fade_ms))

  current_gain = 0
  for start_ms, target_gain, fade_ms in volume_changes[1:]:
    segment = segment.fade(
      to_gain=target_gain - current_gain,
      start=start_ms,
      duration=max(1, fade_ms),
    )
    current_gain = target_gain
  return segment

def endBGM(segment, duration, volume_changes, time_offset):
  """Finalize a background-music track.

  The track is looped/cut to ``duration`` with loopAudio, then the volume
  changes are applied with their positions shifted by ``time_offset`` so they
  are relative to the start of the track.
  """
  looped = loopAudio(segment, duration)
  shifted_changes = []
  for at_ms, gain, fade_ms in volume_changes:
    shifted_changes.append((at_ms - time_offset, gain, fade_ms))
  return applyVolumeChanges(looped, shifted_changes)

def search_by_url(summaries, target_url):
  """Find the first ``(title, summary, url, img)`` entry whose url matches.

  Returns:
    ``(index, (title, summary, url, img))`` for the first match, or ``None``
    when no entry has ``target_url``.
  """
  hits = (
    (position, (title, summary, url, img))
    for position, (title, summary, url, img) in enumerate(summaries)
    if url == target_url
  )
  return next(hits, None)

def _fade_window(current_time_ms, fade_duration_ms):
    """Resolve a fade argument into ``(start_ms, duration_ms)``.

    A non-negative duration starts the fade at ``current_time_ms``; a negative
    one means the fade of that length ends at ``current_time_ms``.
    """
    if fade_duration_ms < 0:
        fade_duration_ms = abs(fade_duration_ms)
        return current_time_ms - fade_duration_ms, fade_duration_ms
    return current_time_ms, fade_duration_ms


def create_audio(script, output_file_name, playback_speed=1.0, sound_effects_dir='./'):
    """Render an annotated script to an MP3 file and collect chapter marks.

    Plain text in ``script`` is spoken with make_speech. Tags of the form
    ``<![TYPE[ARG]]>`` are instructions:

    * ``SOUND[file]``: insert a WAV file from ``sound_effects_dir``.
    * ``BLANK[ms]``: insert silence.
    * ``CHAPTER[arg]``: record a chapter at the current position. If ``arg`` is
      the URL of an entry in the module-level ``summaries`` list, the chapter
      takes that entry's title and image; otherwise ``arg`` itself is the
      chapter title and the chapter has no image.
    * ``BGM_START[file, fade_ms]``, ``BGM_VOLUME[gain, fade_ms]``,
      ``BGM_END[fade_ms]``: background music, overlaid on the speech track.
      A negative ``fade_ms`` makes the fade end at the current position.
    * Any other tag is read aloud as "TYPE ARG".

    Args:
        script: Annotated script text.
        output_file_name: Path of the MP3 file to write.
        playback_speed: Accepted for interface compatibility; not applied here
            (make_speech is called with its own default speed).
        sound_effects_dir: Directory containing the WAV files named in tags.

    Returns:
        A list of ``(start_time_ms, title, img)`` tuples, one per CHAPTER tag.
    """
    pattern = r'<!\[(.*?)\[(.*?)\]\]>'
    # With two capture groups, re.split yields
    # [text, type, arg, text, type, arg, ..., text].
    parts = re.split(pattern, ' '+script)

    audio_segments = []
    bgm_segment = None
    bgm_volume_changes = []
    bgm_segments = []
    chapters = []
    current_time_ms = 0
    bgm_start_time_ms = 0
    for i, part in enumerate(parts):
        if i % 3 == 0:  # spoken text
            speech = make_speech(part)
            if speech:
                audio_segments.append(speech)
                current_time_ms += len(speech)
        elif i % 3 == 1:  # instruction type
            continue  # handled together with its argument on the next iteration
        else:  # instruction argument
            instruction_type = parts[i - 1]
            if instruction_type == 'SOUND':
                sound_effect_file_path = os.path.join(sound_effects_dir, part)
                sound_effect = AudioSegment.from_wav(sound_effect_file_path)
                audio_segments.append(sound_effect)
                current_time_ms += len(sound_effect)
            elif instruction_type == 'BLANK':
                blank_duration_ms = int(part)
                blank_segment = AudioSegment.silent(duration=blank_duration_ms)
                audio_segments.append(blank_segment)
                current_time_ms += len(blank_segment)

            elif instruction_type == 'CHAPTER':
                result = search_by_url(summaries, part)
                if result is not None:
                    _, (title, _, _, img) = result
                else:
                    # Not an article URL: the argument is the chapter title.
                    # Without this branch the previous chapter's title and
                    # image would be reused.
                    title, img = part, None
                chapters.append((current_time_ms, title, img))

            elif instruction_type == 'BGM_START':
                bgm_file_name, fade_duration_ms_str = part.split(',')
                bgm_start_time_ms, fade_duration_ms = _fade_window(current_time_ms, int(fade_duration_ms_str))

                bgm_file_path = os.path.join(sound_effects_dir, bgm_file_name)
                bgm_segment = AudioSegment.from_wav(bgm_file_path)
                bgm_volume_changes.append((bgm_start_time_ms, 0.0, fade_duration_ms))
            elif instruction_type == 'BGM_VOLUME':
                volume, fade_duration_ms_str = part.split(',')
                fade_start_time_ms, fade_duration_ms = _fade_window(current_time_ms, int(fade_duration_ms_str))
                bgm_volume_changes.append((fade_start_time_ms, float(volume), fade_duration_ms))
            elif instruction_type == 'BGM_END':
                fade_start_time_ms, fade_duration_ms = _fade_window(current_time_ms, int(part))
                # -120 dB is used as "silent" for the final fade-out.
                bgm_volume_changes.append((fade_start_time_ms, -120.0, fade_duration_ms))
                if bgm_segment is not None:
                    bgm_segment = endBGM(bgm_segment, fade_start_time_ms - bgm_start_time_ms + fade_duration_ms, bgm_volume_changes, bgm_start_time_ms)
                    bgm_segments.append((bgm_start_time_ms, bgm_segment))
                    bgm_volume_changes = []
                    bgm_segment = None
            else:
                # Unknown tag: read it aloud instead of dropping it.
                speech = make_speech(instruction_type + ' ' + part)
                if speech:
                    audio_segments.append(speech)
                    current_time_ms += len(speech)

    # Background music still playing at the end of the script is cut there.
    if bgm_segment:
        bgm_volume_changes.append((current_time_ms, -120.0, 0))
        bgm_segment = endBGM(bgm_segment, current_time_ms - bgm_start_time_ms, bgm_volume_changes, bgm_start_time_ms)
        bgm_segments.append((bgm_start_time_ms, bgm_segment))
        bgm_volume_changes = []
        bgm_segment = None

    # Concatenate all speech / effect / silence segments.
    final_audio = audio_segments[0]
    for segment in audio_segments[1:]:
        final_audio += segment

    # Overlay each finished background-music track at its start position.
    for start_time_ms, segment in bgm_segments:
        final_audio = final_audio.overlay(segment, position=start_time_ms)

    final_audio.export(output_file_name, format="mp3")

    return chapters

# Render the episode next to the other exports, using the shared sound folder.
sound_directory = "/content/drive/MyDrive/easylazyscrap/sounds"
audio_file_path = os.path.join(export_directory, f"{basename}.mp3")

chapters = create_audio(
    podcast_script,
    audio_file_path,
    playback_speed=1.25,
    sound_effects_dir=sound_directory,
)

In [None]:
from IPython.display import Audio

# Show an inline player for the exported episode file.
Audio(audio_file_path)


In [None]:
from pydub import AudioSegment
from mutagen.mp3 import MP3, EasyMP3
from mutagen.id3 import ID3, CHAP, CTOC, TIT2, USLT, APIC
import requests
from io import BytesIO

def _download_chapter_image(img_url, timeout=30):
    """Download ``img_url`` and wrap it in an APIC frame.

    Returns ``None`` (after printing the reason) when the download fails, so a
    broken image link does not abort the tagging of the whole file.
    """
    try:
        response = requests.get(img_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"chapter image skipped ({img_url}): {error}")
        return None

    # Default to JPEG when the server sends no type; drop any "; charset=..." part.
    mime_type = response.headers.get('Content-Type', 'image/jpeg').split(';')[0].strip()
    return APIC(
        encoding=3,  # UTF-8
        mime=mime_type,
        type=3,  # 3 is for the cover image
        desc=u'Cover',
        data=response.content
    )


def add_chapters_to_mp3(file_path, chapters):
    """Replace the ID3 tag of ``file_path`` with chapter frames.

    Args:
        file_path: Path of the MP3 file to tag; any existing tag is replaced.
        chapters: List of ``(start_time_ms, title, img)`` tuples in playback
            order. ``img`` is an image URL or ``None``.

    Each chapter ends where the next one starts; the last one ends at the end
    of the audio. Chapters get the ids ``chp0``, ``chp1``, ... so that equal
    or non-Latin-1 titles cannot collide or corrupt the id, and a table of
    contents frame lists them in order.
    """
    # Decode the MP3 to learn its total length.
    audio = AudioSegment.from_mp3(file_path)
    audio_length_ms = audio.duration_seconds * 1000

    file = EasyMP3(file_path)
    file.tags = ID3()

    child_elements = []
    for i, (start_time_ms, title, img) in enumerate(chapters):
        # End at the next chapter's start, or at the end of the audio.
        if i + 1 < len(chapters):
            end_time_ms = chapters[i + 1][0]
        else:
            end_time_ms = audio_length_ms

        element_id = f"chp{i}"
        # The title is passed as text (str), not as encoded bytes.
        sub_frames = [TIT2(encoding=3, text=[str(title)])]
        if img is not None:
            apic = _download_chapter_image(img)
            if apic is not None:
                sub_frames.append(apic)

        chap = CHAP(
            element_id=element_id,
            start_time=int(start_time_ms),
            end_time=int(end_time_ms),
            # Byte offsets are unknown; 0xFFFFFFFF tells players to ignore
            # them and use the millisecond times above.
            start_offset=0xFFFFFFFF,
            end_offset=0xFFFFFFFF,
            sub_frames=sub_frames
        )
        child_elements.append(element_id)

        file.tags.add(chap)

    # flags 0x03 = top-level (0x02) | ordered (0x01); the child list field of
    # the CTOC frame is named child_element_ids.
    ctoc = CTOC(element_id="toc", flags=0x03, child_element_ids=child_elements)
    file.tags.add(ctoc)

    file.save()

# Write the collected chapter marks into the exported episode file.
add_chapters_to_mp3(audio_file_path, chapters)



In [None]:
# Episode description text: a fixed introduction with the how-to-join link,
# followed by the heading of the list of introduced scraps.
description = """easylazyscrapは、Scrapbox上で「あとで読む」を共有できる、半匿名のブックマーク共有サービスです。
この番組では、今週投稿されたリンクからランダムにいくつかをピックアップして、その内容を紹介します。

easylazyscrapへの参加方法は、下記のリンクからご確認ください。
https://scrapbox.io/easylazyscrap/easylazyscrap%E3%81%AE%E4%BD%BF%E3%81%84%E6%96%B9
"""

description += f"""
---
## 紹介したスクラップ

"""

def convert_ms_to_time_format(start_time_ms):
    """Format a millisecond offset as ``HH:MM:SS``, ``MM:SS`` or ``SS``.

    Leading units are omitted while they are not positive: the hours field is
    shown only when hours > 0, and the minutes field only when hours or
    minutes > 0. Every shown field is zero-padded to two digits.
    """
    total_seconds = start_time_ms // 1000
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
    if minutes > 0:
        return f"{minutes:02d}:{seconds:02d}"
    return f"{seconds:02d}"

# A chapter tuple is (start_ms, title, image_url): its third field is NOT the
# article URL. The article chapters are created in the same order as
# `summaries` (one CHAPTER tag per summary), so the article URL is taken from
# the matching summaries entry. The last chapter is the list of all scraps.
for (start_time_ms, title, _chapter_img), (_title, _summary, url, _img) in zip(chapters[:-1], summaries):
    time_str = convert_ms_to_time_format(start_time_ms)
    description += f"({time_str}) {title}\n{url}\n"

list_time_ms, list_title, _list_img = chapters[-1]
list_time_str = convert_ms_to_time_format(list_time_ms)
description += f"""
---
## ({list_time_str}){list_title}({from_date}〜{to_date})

"""

# Every collected link, whether or not it was picked for a summary.
for title, url in links:
    description += f"- {title}\n{url}\n\n"

# Credits for the background music and sound effects.
description += f"""
---
## BGM・効果音

OP/ED: Another Face written by yuhei komatsu
https://dova-s.jp/bgm/play18474.html

紹介中BGM: Bouquet written by shimtone
https://dova-s.jp/bgm/play18979.html

効果音: 効果音ラボ
https://soundeffect-lab.info/

"""

script_file_path = os.path.join(export_directory, "description.txt")
# Explicit UTF-8 so the Japanese text is written the same way regardless of
# the platform's default encoding; the with-block closes the file even if the
# write fails.
with open(script_file_path, mode="w", encoding="utf-8") as f:
    f.write(description)

print(description)

In [None]:
from mutagen.id3 import ID3

def print_chapters(file_path):
    """Print the details of every CHAP frame in the ID3 tag of ``file_path``.

    For each frame whose key starts with "CHAP", the id, start/end times,
    start/end offsets and all embedded sub-frames are printed, followed by a
    blank line.
    """
    tags = ID3(file_path)

    for frame_key, chapter in tags.items():
        # Only chapter frames are of interest.
        if not frame_key.startswith("CHAP"):
            continue

        print(f"Chapter ID: {chapter.element_id}")
        print(f"Start time: {chapter.start_time} ms")
        print(f"End time: {chapter.end_time} ms")
        print(f"Start offset: {chapter.start_offset} bytes")
        print(f"End offset: {chapter.end_offset} bytes")
        for embedded in chapter.sub_frames:
            print(embedded)
        print()

# Print the chapter frames stored in the exported episode file.
print_chapters(audio_file_path)