<a href="https://colab.research.google.com/github/nariakiiwatani/easylazyscrap_speech/blob/main/easylazyscrap_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests openai gtts newspaper3k langchain faiss-gpu tiktoken pydub eyed3

In [None]:
import os
import numpy as np

# for local run
# from dotenv import load_dotenv
# load_dotenv()

# for colab run
import json
from google.colab import drive
# Mount Google Drive; the secrets file read below lives on it.
drive.mount('/content/drive', force_remount=True)

# Export every key/value pair of the secrets JSON as an environment variable.
with open('/content/drive/MyDrive/Colab Notebooks/secrets/openai.json') as secrets_file:
    secrets = json.load(secrets_file)
    for name, value in secrets.items():
        os.environ[name] = value



In [None]:
import datetime
from pytz import timezone

def generate_recent_page_urls(base_url, base_date, within_days):
    """Build the text-API URLs of the daily scrap pages ending at ``base_date``.

    Args:
        base_url: URL prefix that each page path is appended to.
        base_date: Newest day to include (a date/datetime object).
        within_days: Number of consecutive days, walking backwards from
            ``base_date``.

    Returns:
        A list of ``within_days`` URLs, newest day first. Each page is named
        ``scrap_YYYY/MM/DD`` with the slashes percent-encoded as ``%2F``.
    """
    one_day = datetime.timedelta(days=1)
    return [
        f"{base_url}/scrap_{(base_date - offset * one_day).strftime('%Y%%2F%m%%2F%d')}/text"
        for offset in range(within_days)
    ]

# Number of daily pages to cover, counting backwards from base_date.
within_days = 7
# Prefix of the Scrapbox page API for the easylazyscrap project.
base_url = "https://scrapbox.io/api/pages/easylazyscrap"
now = datetime.datetime.now()
# Express the current time in the Asia/Tokyo zone before deriving dates from it.
today = now.astimezone(timezone('Asia/Tokyo'))
# The newest page to read is yesterday's.
base_date = today - datetime.timedelta(days=1)
recent_page_urls = generate_recent_page_urls(base_url, base_date, within_days)

In [None]:
import requests
import newspaper
import re

# Matches "[title url]" links. Raw string so the backslashes reach the regex
# engine unchanged instead of being (invalid) string escape sequences.
_LINK_PATTERN = re.compile(r'\[(.*?) ((?:https?|ftp):\/\/?[\w/\-?=%.]+\.[\w/\-?=%.]+)\s*\]')

def fetch_links_from_page(url):
    """Download a page as text and collect every ``[title url]`` link in it.

    Args:
        url: Address of the page text to download.

    Returns:
        A list of ``(link_text, link_url)`` tuples in document order. All
        links on a line are returned, not only the first one. A non-success
        HTTP status yields an empty list.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # An error body carries no page text, so it holds no links to extract.
        return []

    links = []
    for line in response.text.splitlines():
        links.extend(match.groups() for match in _LINK_PATTERN.finditer(line))
    return links

def get_article(url):
    """Download and parse the page at ``url`` with newspaper; return its text."""
    page = newspaper.Article(url)
    page.download()
    page.parse()
    body_text = page.text
    return body_text

# links: every (title, url) found on the recent pages.
# articles: (title, text) for the links whose article text could be extracted.
links = []
articles = []
for page_url in recent_page_urls:
    new_links = fetch_links_from_page(page_url)
    links.extend(new_links)
    for title, url in new_links:
        try:
            article = get_article(url)
        except Exception as error:
            # Best effort: a page that cannot be downloaded or parsed is
            # skipped. Catching Exception (not a bare except) keeps
            # Ctrl-C / kernel interrupts working.
            print(f"skipped {url}: {error}")
            continue
        if article != "":
            articles.append((title, article))



In [None]:
# Maximum number of articles to feature; capped by how many were collected.
choice = 8
choice = min(choice, len(articles))

# Draw `choice` distinct positions at random.
idx = np.random.choice(np.arange(len(articles)), choice, replace=False)

# Index the Python list directly. Converting to np.array first would build a
# fixed-width unicode matrix (every cell padded to the longest article) and
# turn the (title, text) tuples into arrays.
articles = [articles[position] for position in idx]


In [None]:
import os
import openai

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
import faiss

# Splitter configured with a newline separator, chunk_size=140 and no overlap.
text_splitter = CharacterTextSplitter(chunk_size=140, chunk_overlap=0, separator="\n")
# FAISS GPU resources and flat-index configuration (needs a GPU runtime).
res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()

# Collects one (title, picked clusters) tuple per article that could be clustered.
clustered = []
def clusterize(text, n):
    """Split ``text`` into chunks, embed them and group them with k-means.

    Args:
        text: Article text to split with the module-level ``text_splitter``.
        n: Maximum number of chunks to keep per cluster.

    Returns:
        ``False`` when there is nothing to embed; otherwise a dict mapping a
        cluster label to the (up to ``n``) chunks closest to that cluster's
        centroid, nearest first.
    """
    documents = text_splitter.create_documents([text])
    texts = [doc.page_content.replace("\n\n", "") for doc in documents]
    # Decide emptiness BEFORE calling the API: an empty/blank input would only
    # produce a failed request (the original check came after the call).
    texts = [chunk for chunk in texts if chunk.strip()]
    if not texts:
        return False

    response = openai.Embedding.create(input=texts, model="text-embedding-ada-002")
    embeds = [record['embedding'] for record in response['data']]
    embeds_np = np.array(embeds).astype('float32')
    if embeds_np.shape[0] == 0:
        return False
    # At most 5 clusters, and never more clusters than chunks.
    clusters_num = min(embeds_np.shape[0], 5)

    # Note: the GPU flat index that used to be built here was never queried
    # (k-means keeps its own index), so it has been dropped.
    kmeans = faiss.Kmeans(d=embeds_np.shape[1], k=clusters_num, niter=20, verbose=True)
    kmeans.train(embeds_np)
    # Label of the nearest centroid for every chunk.
    clusters = kmeans.index.search(embeds_np, 1)[1].flatten()

    # Group the chunks by cluster together with their distance to the centroid.
    members_by_label = {}
    for i, label in enumerate(clusters):
        distance = np.linalg.norm(embeds_np[i] - kmeans.centroids[label])
        members_by_label.setdefault(label, []).append((texts[i], distance))

    # Keep only the n chunks closest to each centroid.
    return {
        label: [sentence for sentence, _ in sorted(members, key=lambda m: m[1])[:n]]
        for label, members in members_by_label.items()
    }

def pick(src, num):
    """Return the ``num`` entries of ``src`` whose values are longest.

    Entries are ordered by descending value length; entries of equal length
    keep their original relative order.
    """
    ranked = list(src.items())
    ranked.sort(key=lambda entry: -len(entry[1]))
    return {label: members for label, members in ranked[:num]}

# How many clusters to keep per article, and how many chunks per cluster.
pick_cluster = 3
pick_inside_cluster = 3
for title, article in articles:
    result = clusterize(article, pick_inside_cluster)
    if result is False:
        # clusterize() reports "nothing to cluster" with False.
        continue
    clustered.append((title, pick(result, pick_cluster)))



In [None]:

from langchain.prompts import PromptTemplate
from langchain import OpenAI

# Prompt (in Japanese) asking for a concise summary of about 200 characters,
# written in Japanese; "{text}" is filled in through the 'text' input variable.
prompt_template = """以下の内容について200字程度の簡潔な要約を日本語で作成してください:

"{text}"

簡潔な要約:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=['text'])

from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain

def summarize(clusters):
    """Summarize every cluster of sentences with a map-reduce chain.

    Args:
        clusters: Mapping of cluster label -> list of sentences.

    Returns:
        A list with one summary string per cluster, in the mapping's
        iteration order. An empty mapping yields an empty list.
    """
    if not clusters:
        return []

    # The chain does not depend on the cluster, so build it once instead of
    # once per loop iteration.
    chain = load_summarize_chain(
        OpenAI(
            model_name="text-davinci-002",
            temperature=0.7,
        ),
        chain_type="map_reduce",
        map_prompt=PROMPT,
        combine_prompt=PROMPT,
    )

    cluster_summaries = []
    for cluster_sentences in clusters.values():
        docs = [Document(page_content=sentence) for sentence in cluster_sentences]
        summary = chain.run(input_documents=docs, token_max=2000)
        cluster_summaries.append(summary)
    return cluster_summaries

# One (title, [summary per cluster]) tuple for every clustered article.
summaries = [(title, summarize(cluster)) for (title, cluster) in clustered]


In [None]:
import os
import datetime

# Episode name derived from today's date, e.g. "2023_05_10".
basename = today.strftime("%Y_%m_%d")

export_directory = f"/content/drive/MyDrive/easylazyscrap/episodes/{basename}"
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(export_directory, exist_ok=True)


In [None]:
# Format the date labels from the date fields rather than with
# strftime("%-m") / "%-d": the "-" (no zero padding) flag is a platform
# extension that is not available in every C library.
first_day = base_date - datetime.timedelta(days=within_days-1)
to_date = f"{base_date.year}年{base_date.month}月{base_date.day}日"
from_date = f"{first_day.year}年{first_day.month}月{first_day.day}日"

# The script mixes spoken text with <![TYPE[argument]]> directives
# (SOUND, CHAPTER, BLANK) that create_audio() interprets.
podcast_script = f"""<![SOUND[start.wav]]>
今週の、イージーレイジースクラップ。
イージーレイジースクラップは、スクラップボックス上で「あとで読む」を共有できる、半匿名のブックマーク共有サービスです。
この番組では、今週、イージーレイジースクラップに投稿された内容から、いくつかをピックアップして紹介します。

今回は、{from_date}から、{to_date}のスクラップを紹介します。
<![SOUND[start_scrap.wav]]>
"""

# One chapter per picked article: its title followed by the joined summaries.
for title, summary in summaries:
    text = ''.join(summary).replace('\n', '')
    podcast_script += f"<![CHAPTER[{title}]]>{title}\n{text}<![SOUND[inter.wav]]>\n\n"

podcast_script += """今週の、ピックアップは、以上です。
<![BLANK[1000]]>
最後に、今週寄せられた、全てのスクラップのタイトルを読み上げます。
<![CHAPTER[今週のスクラップ一覧]]>
<![SOUND[click.wav]]>
"""

# Read out the title of every collected link, separated by a click sound.
for title, url in links:
    podcast_script += f"{title}<![SOUND[click.wav]]>\n"

podcast_script += """
今週、寄せられたリンクは、以上です。
これで、今週のイージーレイジースクラップを終わります。

イージーレイジースクラップへは、どなたでもご参加いただけます。
概要欄のリンクから、参加方法をご確認ください。
ありがとうございました！
<![SOUND[end.wav]]>
"""


In [None]:
from pydub import AudioSegment
import gtts
import os
import re
import tempfile

def make_speech(script, options=None):
    """Synthesize ``script`` as Japanese speech with gTTS.

    Args:
        script: Text to speak.
        options: Optional dict of overrides. Recognised key:
            ``'playback_speed'`` (default 1.25).

    Returns:
        The speech as a pydub ``AudioSegment`` sped up by ``playback_speed``
        (returned unchanged when the speed is exactly 1.0), or ``None`` when
        the text is blank or synthesis fails.
    """
    opt = {'playback_speed': 1.25, **(options or {})}

    # Blank text has nothing to speak; gTTS rejects it, so skip it up front
    # instead of letting the constructor raise outside the try block.
    if not script or not script.strip():
        return None

    try:
        text_to_speech = gtts.gTTS(script, lang="ja", lang_check=False)
        # The context manager closes (and removes) the temp file once the
        # audio has been loaded into memory.
        with tempfile.NamedTemporaryFile() as fp:
            text_to_speech.save(fp.name)
            speech = AudioSegment.from_mp3(fp.name)
        speed = opt['playback_speed']
        if speed == 1.0:
            # Nothing to speed up; hand back the audio untouched.
            return speech
        return speech.speedup(playback_speed=speed)
    except Exception as error:
        # Best effort: callers treat None as "no audio for this text".
        print(f"make_speech: could not synthesize {script[:30]!r}: {error}")
        return None

def create_audio(script, output_file_name, playback_speed=1.0, sound_effects_dir='./'):
    """Render a podcast script to an MP3 file and collect chapter positions.

    The script is plain text interleaved with ``<![TYPE[argument]]>``
    directives:

    * ``SOUND``   - insert the WAV file ``argument`` from ``sound_effects_dir``.
    * ``BLANK``   - insert ``argument`` milliseconds of silence.
    * ``CHAPTER`` - record a chapter titled ``argument`` at the current time.
    * any other type is read aloud as ``"TYPE argument"``.

    Args:
        script: The script text.
        output_file_name: Path of the MP3 file to write.
        playback_speed: Speed factor forwarded to ``make_speech`` for every
            spoken part.
        sound_effects_dir: Directory containing the sound-effect WAV files.

    Returns:
        A list of ``(start_time_ms, title)`` tuples, one per CHAPTER directive.

    Raises:
        ValueError: If the script produced no audio at all.
    """
    pattern = r'<!\[(.*?)\[(.*?)\]\]>'
    # re.split with two groups yields: text, type, argument, text, type, ...
    # NOTE(review): the prepended space presumably keeps the first text part
    # non-empty - confirm before removing.
    parts = re.split(pattern, ' '+script)
    speech_options = {'playback_speed': playback_speed}

    audio_segments = []
    chapters = []
    current_time_ms = 0

    for index in range(0, len(parts), 3):
        # Text part.
        speech = make_speech(parts[index], speech_options)
        if speech:
            audio_segments.append(speech)
            current_time_ms += len(speech)

        # The last text part has no directive after it.
        if index + 2 >= len(parts):
            break

        # Directive type and its argument.
        instruction_type = parts[index + 1]
        argument = parts[index + 2]
        if instruction_type == 'SOUND':
            sound_effect_file_path = os.path.join(sound_effects_dir, argument)
            sound_effect = AudioSegment.from_wav(sound_effect_file_path)
            audio_segments.append(sound_effect)
            current_time_ms += len(sound_effect)
        elif instruction_type == 'BLANK':
            blank_segment = AudioSegment.silent(duration=int(argument))
            audio_segments.append(blank_segment)
            current_time_ms += len(blank_segment)
        elif instruction_type == 'CHAPTER':
            chapters.append((current_time_ms, argument))
        else:
            # Unknown directive: speak it instead of dropping it.
            speech = make_speech(instruction_type + ' ' + argument, speech_options)
            if speech:
                audio_segments.append(speech)
                current_time_ms += len(speech)

    if not audio_segments:
        raise ValueError("script produced no audio segments")

    # Concatenate all audio segments.
    final_audio = audio_segments[0]
    for segment in audio_segments[1:]:
        final_audio += segment

    final_audio.export(output_file_name, format="mp3")

    return chapters

# The episode MP3 is written into this episode's export directory.
audio_file_path = os.path.join(export_directory, f"{basename}.mp3")
# Folder holding the .wav files named by the script's SOUND directives.
sound_directory = "/content/drive/MyDrive/easylazyscrap/sounds"

# Render the script; the returned list holds (start time in ms, title) pairs.
chapters = create_audio(podcast_script, audio_file_path, 1.25, sound_directory)

In [None]:
import eyed3
from eyed3.id3.tag import Tag

def add_chapters_to_mp3(mp3_file_path, chapters):
    """Write ID3 chapter frames and a table of contents into an MP3 file.

    The file's tag is replaced by a fresh one containing a top-level table of
    contents (``toc``) and one chapter frame (``chp0``, ``chp1``, ...) per
    entry of ``chapters``.

    Args:
        mp3_file_path: Path of the MP3 file to tag.
        chapters: List of ``(start_time_ms, title)`` tuples in playback order.

    Raises:
        ValueError: If eyed3 cannot read the file as MP3 audio.
    """
    # Load the MP3 file.
    audio_file = eyed3.load(mp3_file_path)
    if audio_file is None or audio_file.info is None:
        raise ValueError(f"not a readable MP3 file: {mp3_file_path}")

    audio_file.tag = Tag()
    toc = audio_file.tag.table_of_contents.set(b"toc", toplevel=True,
                                    description=u"Table of Contents")

    # ID3 chapter frames store start/end times in MILLISECONDS, so keep the
    # values in ms instead of converting them to seconds.
    audio_length_ms = int(round(audio_file.info.time_secs * 1000))

    # Add each chapter.
    for i, (start_time_ms, title) in enumerate(chapters):
        # A chapter ends where the next one starts; the last one ends with
        # the audio.
        if i + 1 < len(chapters):
            end_time_ms = chapters[i + 1][0]
        else:
            end_time_ms = audio_length_ms

        chp = audio_file.tag.chapters.set(
            f"chp{i}".encode('utf-8'),
            (int(start_time_ms), int(end_time_ms)),
        )
        chp.title = title
        toc.child_ids.append(chp.element_id)

    # Save the tag.
    audio_file.tag.save()

# Embed the chapter markers collected by create_audio() into the exported MP3.
add_chapters_to_mp3(audio_file_path, chapters)

In [None]:
from pprint import pprint
def read_chapters_from_mp3(mp3_file_path):
    """Print every chapter listed in the MP3's ``toc`` table of contents.

    Debugging aid: for each chapter it pretty-prints the frame's attributes,
    then prints one line with the chapter id, its start/end times and title.
    """
    # Load the MP3 file and take its tag.
    tag = eyed3.load(mp3_file_path).tag

    # Walk the chapters in table-of-contents order.
    for element_id in tag.table_of_contents.get(b"toc").child_ids:
        chapter = tag.chapters.get(element_id)
        pprint(vars(chapter))
        begins, ends = chapter.times
        chapter_id = chapter.element_id.decode('utf-8')
        print(f"Chapter {chapter_id}: {begins} - {ends}, title: {chapter.title}")

#read_chapters_from_mp3(audio_file_path)

In [None]:
from IPython.display import Audio

# Last expression of the cell: shows an inline player for the exported episode.
Audio(audio_file_path)


In [None]:
# Episode description: a fixed introduction followed by the list of links.
description = """easylazyscrapは、Scrapbox上で「あとで読む」を共有できる、半匿名のブックマーク共有サービスです。
この番組では、今週投稿されたリンクからランダムにいくつかをピックアップして、その内容を紹介します。

easylazyscrapへの参加方法は、下記のリンクからご確認ください。
https://scrapbox.io/easylazyscrap/easylazyscrap%E3%81%AE%E4%BD%BF%E3%81%84%E6%96%B9
"""

description += f"""
---
## 今回のリンク集({from_date}〜{to_date})

"""

# One bullet per collected link: the title, then the URL on its own line.
for title, url in links:
    description += f"- {title}\n{url}\n\n"

script_file_path = os.path.join(export_directory, "description.txt")
# The text is Japanese, so write it as UTF-8 explicitly instead of relying on
# the platform default; the context manager closes the file even if the
# write fails.
with open(script_file_path, mode="w", encoding="utf-8") as description_file:
    description_file.write(description)