<a href="https://colab.research.google.com/github/nariakiiwatani/easylazyscrap_speech/blob/main/easylazyscrap_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages imported by the cells below into the notebook runtime.
!pip install requests openai gtts newspaper3k langchain faiss-gpu tiktoken pydub

In [None]:
import os
import numpy as np

# for local run
# from dotenv import load_dotenv
# load_dotenv()

# for colab run
import json
from google.colab import drive
# Mount Google Drive, then export every key/value pair stored in the secrets
# JSON file as a process environment variable.
drive.mount('/content/drive', force_remount=True)
with open('/content/drive/MyDrive/Colab Notebooks/secrets/openai.json') as secrets_file:
    secrets = json.load(secrets_file)
for env_name, env_value in secrets.items():
    os.environ[env_name] = env_value



In [None]:
import datetime

def generate_recent_page_urls(base_url, base_date, within_days):
    """Build the text-endpoint URL of each daily scrap page, newest first.

    Args:
        base_url: URL prefix that the ``/scrap_<date>/text`` path is appended to.
        base_date: Most recent date to include (a ``datetime.date``).
        within_days: How many consecutive days to cover, counting back from
            ``base_date``.

    Returns:
        A list of ``within_days`` URLs. The date in each page name is written
        as ``YYYY/MM/DD`` with the slashes percent-encoded as ``%2F``.
    """
    def page_url_for(days_back):
        day = base_date - datetime.timedelta(days=days_back)
        date_string = day.strftime("%Y%%2F%m%%2F%d")
        return f"{base_url}/scrap_{date_string}/text"

    return [page_url_for(days_back) for days_back in range(within_days)]

# Cover a 7-day window whose most recent day is yesterday.
within_days = 7
base_url = "https://scrapbox.io/api/pages/easylazyscrap"
base_date = datetime.date.today() - datetime.timedelta(days=1)
recent_page_urls = generate_recent_page_urls(base_url, base_date, within_days)

In [None]:
import requests
import newspaper
import re

# Matches a bracketed "[title url]" link: group 1 is the title, group 2 the
# http(s)/ftp URL. Written as a raw string so the backslashes reach the regex
# engine without triggering Python's invalid-escape-sequence warnings.
_LINK_PATTERN = re.compile(r'\[(.*?) ((?:https?|ftp):\/\/?[\w/\-?=%.]+\.[\w/\-?=%.]+)\s*\]')


def fetch_links_from_page(url, timeout=30):
    """Download a page as text and extract its bracketed ``[title url]`` links.

    Args:
        url: Address of the page text to fetch.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        A list of ``(title, url)`` tuples in page order. At most one link is
        taken per line (the first match); lines without a link are skipped.
    """
    response = requests.get(url, timeout=timeout)
    links = []
    for line in response.text.splitlines():
        match = _LINK_PATTERN.search(line)
        if match:
            links.append((match.group(1), match.group(2)))
    return links

def get_article(url):
    """Download and parse the article at ``url`` and return its body text.

    Args:
        url: Address of the article to fetch with ``newspaper``.

    Returns:
        The extracted article text. An empty string is returned when
        ``newspaper`` reports that the article could not be downloaded or
        parsed, so one bad link is skipped instead of aborting the whole run.
    """
    article = newspaper.Article(url)
    try:
        article.download()
        article.parse()
    except newspaper.ArticleException as exc:
        # Report the failure but keep going with the remaining links.
        print(f"skipping {url}: {exc}")
        return ""
    return article.text

# Walk every recent page: remember all of its links, and keep the text of each
# linked article that yields a non-empty body.
links = []
articles = []
for page_url in recent_page_urls:
    page_links = fetch_links_from_page(page_url)
    links += page_links
    for title, url in page_links:
        body = get_article(url)
        if body == "":
            continue
        articles.append((title, body))


In [None]:
# Randomly keep at most 8 of the downloaded articles (fewer if fewer exist).
choice = 8
choice = min(choice, len(articles))

# Draw `choice` distinct positions without replacement.
idx = np.random.choice(np.arange(len(articles)), choice, replace=False)

# Index the Python list directly rather than going through np.array(articles):
# that conversion builds a fixed-width unicode array (every cell padded to the
# longest article) and turns each (title, text) tuple into an ndarray row.
articles = [articles[i] for i in idx]


In [None]:
import os
import openai

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
import faiss

# Splitter configured with chunk_size=140, no overlap, and newline as separator.
text_splitter = CharacterTextSplitter(chunk_size=140, chunk_overlap=0, separator="\n")
# faiss GPU resource handle and flat-index configuration objects.
res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()

# Accumulates one (title, picked_clusters) pair per article that could be clustered.
clustered = []
def clusterize(text, n, clusters_num=5):
    """Cluster an article's chunks with k-means and keep the most central ones.

    The text is split with the module-level ``text_splitter``, each chunk is
    embedded with OpenAI's ``text-embedding-ada-002`` model, and the embeddings
    are clustered with ``faiss.Kmeans``.

    Args:
        text: Article body to split and cluster.
        n: Maximum number of chunks to keep per cluster.
        clusters_num: Number of k-means clusters to fit (defaults to 5).

    Returns:
        ``False`` when the text produces fewer chunks than ``clusters_num``.
        Otherwise a dict mapping each cluster label to a list of up to ``n``
        chunk strings, ordered from closest to farthest from the centroid.
    """
    documents = text_splitter.create_documents([text])
    texts = [doc.page_content.replace("\n\n", "") for doc in documents]

    # Check the chunk count before calling the embedding API: too-short (or
    # empty) articles are rejected without spending a request on them.
    if len(texts) < clusters_num:
        return False

    response = openai.Embedding.create(input=texts, model="text-embedding-ada-002")
    embeds = [record['embedding'] for record in response['data']]
    embeds_np = np.array(embeds).astype('float32')
    # Defensive re-check in case fewer embeddings than chunks came back.
    if embeds_np.shape[0] < clusters_num:
        return False

    kmeans = faiss.Kmeans(d=embeds_np.shape[1], k=clusters_num, niter=20, verbose=True)
    kmeans.train(embeds_np)
    # Label of the nearest centroid for every chunk.
    clusters = kmeans.index.search(embeds_np, 1)[1].flatten()

    # Group chunks by cluster, remembering each chunk's distance to its centroid.
    members_by_label = {}
    for i, label in enumerate(clusters):
        distance = np.linalg.norm(embeds_np[i] - kmeans.centroids[label])
        members_by_label.setdefault(label, []).append((texts[i], distance))

    # Within each cluster keep only the n chunks closest to the centroid.
    return {
        label: [chunk for chunk, _ in sorted(members, key=lambda m: m[1])[:n]]
        for label, members in members_by_label.items()
    }

def pick(src, num):
    """Return the ``num`` entries of ``src`` whose values are longest.

    Args:
        src: Mapping whose values support ``len()``.
        num: Number of entries to keep.

    Returns:
        A new dict ordered from longest value to shortest; entries with
        equally long values keep their original relative order.
    """
    largest_first = sorted(src.items(), key=lambda entry: -len(entry[1]))
    return {key: value for key, value in largest_first[:num]}

# Selection sizes: keep the 3 largest clusters per article, 3 chunks in each.
pick_cluster = 3
pick_inside_cluster = 3
for title, article in articles:
    result = clusterize(article, pick_inside_cluster)
    if result == False:
        # clusterize() returns False when the article has too few chunks.
        continue
    clustered.append((title, pick(result, pick_cluster)))



In [None]:

from langchain.prompts import PromptTemplate
from langchain import OpenAI

# Japanese instruction asking for a concise summary of about 200 characters,
# written in Japanese, of the quoted {text}.
prompt_template = """以下の内容について200字程度の簡潔な要約を日本語で作成してください:

"{text}"

簡潔な要約:"""
# Prompt object with a single input variable, `text`.
PROMPT = PromptTemplate(template=prompt_template, input_variables=['text'])

from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain

def summarize(clusters):
    """Summarize each cluster of sentences with a map-reduce LLM chain.

    Args:
        clusters: Mapping of cluster label to a list of sentence strings.

    Returns:
        A list with one summary string per cluster, in the mapping's
        iteration order.
    """
    # The LLM and chain do not depend on the cluster, so build them once
    # instead of once per loop iteration.
    chain = load_summarize_chain(
        OpenAI(
            model_name="text-davinci-002",
            temperature=0.7,
        ),
        chain_type="map_reduce",
        map_prompt=PROMPT,
        combine_prompt=PROMPT,
    )

    cluster_summaries = []
    for cluster_sentences in clusters.values():
        docs = [Document(page_content=sentence) for sentence in cluster_sentences]
        cluster_summaries.append(chain.run(input_documents=docs, token_max=2000))
    return cluster_summaries

# Pair every clustered article's title with the list of its cluster summaries.
summaries = []
for title, cluster in clustered:
    summaries.append((title, summarize(cluster)))


In [None]:
def _format_japanese_date(day):
    """Render a date as e.g. ``2023年5月3日`` with no zero padding.

    Built from the date's fields instead of ``strftime("%-m")``: the ``%-``
    flag is a platform-specific extension and is not available everywhere.
    """
    return f"{day.year}年{day.month}月{day.day}日"


# First and last day of the window covered by this episode.
to_date = _format_japanese_date(base_date)
from_date = _format_japanese_date(base_date - datetime.timedelta(days=within_days-1))

# Opening of the spoken script.
podcast_script = """今週の、イージーレイジースクラップ。
イージーレイジースクラップは、スクラップボックス上で「あとで読む」を共有できる、半匿名のブックマーク共有サービスです。
この番組では、今週、イージーレイジースクラップに投稿された内容から、ランダムにいくつかをピックアップして紹介します。
"""

podcast_script += f"今回は、{from_date}から、{to_date}のスクラップを紹介します。"
podcast_script += """

それでは、最初のスクラップです。


"""
# One section per summarized article, with a spoken transition between sections.
for i, (title, summary) in enumerate(summaries):
    text = ''.join(summary).replace('\n', '')
    podcast_script += f"{title}\n{text}\n\n"
    if i < len(summaries)-1:
        podcast_script += "続いて、こちらのスクラップです。\n\n"
podcast_script += """今週の、ピックアップは、以上です。

最後に、今週寄せられた、全てのスクラップのタイトルを読み上げます。

"""

# Read out the title of every collected link, one per line.
podcast_script += "".join(f"{title}\n" for title, _url in links)

podcast_script += """

今週、寄せられたリンクは、以上です。
これで、今週のイージーレイジースクラップを終わります。

イージーレイジースクラップへの参加方法は、概要欄のリンクからご確認ください。
ありがとうございました！
"""


In [None]:
import os
import datetime

# Episode files are named after today's date, e.g. 2023_05_03.
basename = datetime.date.today().strftime("%Y_%m_%d")

# Per-episode output folder on the mounted Drive. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of os.path.exists().
export_directory = f"/content/drive/MyDrive/easylazyscrap/episodes/{basename}"
os.makedirs(export_directory, exist_ok=True)


In [None]:
# Fixed introduction of the episode description.
description = """easylazyscrapは、Scrapbox上で「あとで読む」を共有できる、半匿名のブックマーク共有サービスです。
この番組では、今週投稿されたリンクからランダムにいくつかをピックアップして、その内容を紹介します。

easylazyscrapへの参加方法は、下記のリンクからご確認ください。
https://scrapbox.io/easylazyscrap/easylazyscrap%E3%81%AE%E4%BD%BF%E3%81%84%E6%96%B9
"""

# Heading of the link list, showing the covered date range.
description += f"""
---
## 今回のリンク集({from_date}〜{to_date})

"""

# One "- title / url" entry per collected link.
description += "".join(f"- {title}\n{url}\n\n" for title, url in links)

script_file_path = os.path.join(export_directory, "description.txt")
# The context manager closes the file even if the write fails; the encoding is
# explicit because the text is Japanese and platform default encodings differ.
with open(script_file_path, mode="w", encoding="utf-8") as f:
    f.write(description)

In [None]:
import gtts

# Synthesize the Japanese script with gTTS and save the raw MP3 under /tmp.
audio_tmp_file_path = os.path.join('/tmp', f"{basename}.mp3")
tts = gtts.gTTS(podcast_script, lang="ja", lang_check=False)
tts.save(audio_tmp_file_path)

In [None]:
from pydub import AudioSegment
from pydub import effects

# Load the synthesized MP3, speed it up to 1.25x playback, and export the
# result into the episode folder.
audio_file_path = os.path.join(export_directory, f"{basename}.mp3")
af = AudioSegment.from_mp3(audio_tmp_file_path)
af2 = af.speedup(playback_speed=1.25)
af2.export(audio_file_path, format="mp3")

In [None]:
from IPython.display import Audio

# Display the exported episode as the cell's output via IPython's Audio object.
Audio(audio_file_path)
