<a href="https://colab.research.google.com/github/nariakiiwatani/easylazyscrap_speech/blob/main/trailer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Pin the versions recorded in this cell's last output so a fresh runtime installs the same packages.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q gtts==2.3.2 pydub==0.25.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gtts
  Downloading gTTS-2.3.2-py3-none-any.whl (28 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub, gtts
Successfully installed gtts-2.3.2 pydub-0.25.1


In [2]:
import os

# for local run
# from dotenv import load_dotenv
# load_dotenv()

# for colab run
import json
from google.colab import drive
# Mount Google Drive; the secrets file and the export directory below both live under it.
drive.mount('/content/drive', force_remount=True)

# Export every key/value pair of the secrets JSON as an environment variable.
with open('/content/drive/MyDrive/Colab Notebooks/secrets/openai.json') as jsonfile:
    for k, v in json.load(jsonfile).items():
        os.environ[k] = v

basename = "trailer"
export_directory = f"/content/drive/MyDrive/easylazyscrap/episodes/{basename}"
# exist_ok=True replaces the separate exists() check: no check-then-create race,
# and re-running the cell is a no-op when the directory is already there.
os.makedirs(export_directory, exist_ok=True)


Mounted at /content/drive


In [50]:
# Episode script consumed by create_audio(). Plain lines are spoken text; directives
# have the form <![TYPE[ARG]]>. The directives used here are:
#   BGM_START[file, fade_ms]  - start background music from `file`
#   BGM_VOLUME[gain, fade_ms] - fade the background music to `gain`
#   BGM_END[fade_ms]          - fade the background music out
#   BLANK[ms]                 - insert silence
# create_audio() treats a negative fade_ms as "start the fade that many ms before
# the current position" instead of at the current position.
podcast_script = """
<![BGM_START[Another_Face.wav, 0]]>
<![BLANK[23000]]>
<![BGM_VOLUME[-16, 1500]]>
<![BLANK[1500]]>
こんにちは。
<![BLANK[300]]>
今週の、イージーレイジースクラップ、パーソナリティーの、レイジーです。
<![BLANK[200]]>
この番組では、半匿名のブックマーク共有サービスである、イージーレイジースクラップに寄せられたリンクを、抜粋、要約してお伝えします。
毎週土曜日更新です。
<![BLANK[1500]]>
イージーレイジースクラップへは、どなたでもご参加いただけます。
概要欄テキストのリンクから、参加方法をご確認ください。
<![BLANK[500]]>
<![BGM_VOLUME[0, 1000]]>
<![BLANK[4000]]>
<![BGM_END[-1500]]>
"""

In [51]:
from pydub import AudioSegment
import gtts
import os
import re
import tempfile

def make_speech(script, options=None):
    """Synthesize Japanese speech for ``script`` with gTTS and return it as a pydub segment.

    Args:
        script: Text to speak.
        options: Optional dict overriding the defaults
            ``{'playback_speed': 1.25, 'gain': -2}``. ``gain`` is added to the
            decoded audio and ``playback_speed`` is passed to
            ``AudioSegment.speedup``.

    Returns:
        The processed ``AudioSegment``, or ``None`` when ``script`` is blank or
        when synthesis, decoding or the speed-up step fails. This is best
        effort: callers are expected to skip ``None``.
    """
    # Blank text (for example the newline between two directives) has nothing to
    # speak. gTTS documents an AssertionError for empty text, and the constructor
    # call below sits outside the try block, so return before reaching it.
    if not script or not script.strip():
        return None

    opt = {'playback_speed': 1.25, 'gain': -2, **(options or {})}
    text_to_speech = gtts.gTTS(script, lang="ja", lang_check=False)
    try:
        # The context manager closes the temporary file once the MP3 has been
        # decoded, instead of leaving the handle open until garbage collection.
        with tempfile.NamedTemporaryFile() as fp:
            text_to_speech.save(fp.name)
            speech = AudioSegment.from_mp3(fp.name) + opt['gain']
        return speech.speedup(playback_speed=opt['playback_speed'])
    except Exception as error:
        # Still best effort, but no longer silent, and KeyboardInterrupt /
        # SystemExit are no longer swallowed as they were by the bare except.
        print(f"make_speech: skipped {script[:40]!r}: {error!r}")
        return None

def loopAudio(segment, duration_ms):
  """Return ``segment`` trimmed or repeated so that it spans ``duration_ms``.

  A segment longer than ``duration_ms`` is cut to its first ``duration_ms``.
  Otherwise it is repeated a whole number of times and then topped up with
  the start of the repeated audio.
  """
  if duration_ms < len(segment):
    return segment[:duration_ms]

  repeated = segment * (duration_ms // len(segment))
  # The leftover is measured against the repeated segment's own length,
  # exactly as the length is reported after repetition.
  leftover_ms = duration_ms % len(repeated)
  if leftover_ms > 0:
    repeated = repeated + repeated[:leftover_ms]
  return repeated

def applyVolumeChanges(segment, volume_changes):
  """Apply a list of ``(position_ms, volume, duration_ms)`` changes to ``segment``.

  The first entry only contributes its duration, which is used for a fade-in.
  Every later entry becomes a ``fade`` starting at its position whose
  ``to_gain`` is the difference from the previous entry's volume (the running
  volume starts at 0). Durations are clamped to at least 1.
  """
  if volume_changes:
    _, _, fade_in_ms = volume_changes[0]
    segment = segment.fade_in(max(1, fade_in_ms))

  current_volume = 0
  for start_ms, target_volume, fade_ms in volume_changes[1:]:
    segment = segment.fade(
      to_gain=target_volume - current_volume,
      start=start_ms,
      duration=max(1, fade_ms),
    )
    current_volume = target_volume
  return segment

def endBGM(segment, duration, volume_changes, time_offset):
  """Loop ``segment`` to ``duration`` and apply ``volume_changes`` to it.

  The positions in ``volume_changes`` are shifted by ``-time_offset`` before
  being handed to ``applyVolumeChanges``; volumes and fade durations are
  passed through unchanged.
  """
  looped = loopAudio(segment, duration)
  shifted_changes = []
  for at_ms, gain, fade_ms in volume_changes:
    shifted_changes.append((at_ms - time_offset, gain, fade_ms))
  return applyVolumeChanges(looped, shifted_changes)

def create_audio(script, output_file_name, playback_speed=1.0, sound_effects_dir='./'):
    """Render ``script`` to an MP3 file and return the chapter marks found in it.

    ``script`` is plain text interleaved with directives of the form
    ``<![TYPE[ARG]]>``. Text between directives is synthesized with
    ``make_speech``. Recognized directive types:

    - ``SOUND[file]``: append the WAV ``file`` from ``sound_effects_dir``.
    - ``BLANK[ms]``: append ``ms`` milliseconds of silence.
    - ``CHAPTER[arg]``: record ``(current time in ms, arg)``.
    - ``BGM_START[file, fade_ms]``: load background music from ``sound_effects_dir``.
    - ``BGM_VOLUME[gain, fade_ms]``: schedule a background-music volume change.
    - ``BGM_END[fade_ms]``: schedule the final fade and close the current BGM.

    For the three BGM directives a negative ``fade_ms`` uses its absolute value
    as the fade length and moves the fade's start earlier by that amount.
    Any other directive type is spoken as the text ``"TYPE ARG"``.

    Args:
        script: The script text.
        output_file_name: Path the final audio is exported to (MP3 format).
        playback_speed: NOTE(review): never read in this function;
            ``make_speech`` is called without options and uses its own defaults.
        sound_effects_dir: Directory joined with file names from ``SOUND`` and
            ``BGM_START`` directives.

    Returns:
        List of ``(time_ms, arg)`` tuples, one per ``CHAPTER`` directive.
    """
    pattern = r'<!\[(.*?)\[(.*?)\]\]>'
    # re.split with two capture groups yields [text, type, arg, text, type, arg, ...],
    # which is why the loop below dispatches on i % 3.
    # NOTE(review): the ' ' prefix presumably keeps the first text part non-empty - confirm.
    parts = re.split(pattern, ' '+script)

    audio_segments = []
    bgm_segment = None
    bgm_volume_changes = []
    bgm_segments = []
    chapters = []
    # Running length (ms) of everything appended to audio_segments so far.
    current_time_ms = 0
    bgm_start_time_ms = 0
    for i, part in enumerate(parts):
        if i % 3 == 0:  # text portion
          speech = make_speech(part)
          if speech:
            audio_segments.append(speech)
            current_time_ms += len(speech)
        elif i % 3 == 1:  # directive type
            continue  # skipped here; the type is read back as parts[i - 1] on the next iteration
        else:  # directive argument
            instruction_type = parts[i - 1]
            if instruction_type == 'SOUND':
                sound_effect_file_path = os.path.join(sound_effects_dir, part)
                sound_effect = AudioSegment.from_wav(sound_effect_file_path)
                audio_segments.append(sound_effect)
                current_time_ms += len(sound_effect)
            elif instruction_type == 'BLANK':
                blank_duration_ms = int(part)
                blank_segment = AudioSegment.silent(duration=blank_duration_ms)
                audio_segments.append(blank_segment)
                current_time_ms += len(blank_segment)

            elif instruction_type == 'CHAPTER':
                chapters.append((current_time_ms, part))

            elif instruction_type == 'BGM_START':
                bgm_file_name, fade_duration_ms_str = part.split(',')
                fade_duration_ms = int(fade_duration_ms_str)
                bgm_start_time_ms = current_time_ms
                # Negative fade: the BGM starts earlier by the fade length.
                if fade_duration_ms < 0:
                  fade_duration_ms = abs(fade_duration_ms)
                  bgm_start_time_ms -= fade_duration_ms

                bgm_file_path = os.path.join(sound_effects_dir, bgm_file_name)
                bgm_segment = AudioSegment.from_wav(bgm_file_path)
                # First entry of the change list; applyVolumeChanges uses only its duration (fade-in).
                bgm_volume_changes.append((bgm_start_time_ms, 0.0, fade_duration_ms))
            elif instruction_type == 'BGM_VOLUME':
                volume, fade_duration_ms_str = part.split(',')
                fade_duration_ms = int(fade_duration_ms_str)
                fade_start_time_ms = current_time_ms
                # Negative fade: the fade starts earlier by the fade length.
                if fade_duration_ms < 0:
                  fade_duration_ms = abs(fade_duration_ms)
                  fade_start_time_ms -= fade_duration_ms
                bgm_volume_changes.append((fade_start_time_ms, float(volume), fade_duration_ms))
            elif instruction_type == 'BGM_END':
                fade_duration_ms = int(part)
                fade_start_time_ms = current_time_ms
                # Negative fade: the fade starts earlier by the fade length.
                if fade_duration_ms < 0:
                  fade_duration_ms = abs(fade_duration_ms)
                  fade_start_time_ms -= fade_duration_ms
                # -120.0 is the target volume of the closing fade.
                bgm_volume_changes.append((fade_start_time_ms, -120.0, fade_duration_ms))
                if bgm_segment is not None:
                  # The BGM spans from its start to the end of the closing fade.
                  bgm_segment = endBGM(bgm_segment, fade_start_time_ms - bgm_start_time_ms + fade_duration_ms, bgm_volume_changes, bgm_start_time_ms)
                  bgm_segments.append((bgm_start_time_ms, bgm_segment))
                  bgm_volume_changes = []
                  bgm_segment = None
            else:
                # Unknown directive type: speak "TYPE ARG" as ordinary text.
                speech = make_speech(instruction_type + ' ' + part)
                if speech:
                  audio_segments.append(speech)
                  current_time_ms += len(speech)

    # A BGM that was started but never ended is closed at the current end of the audio.
    if bgm_segment:
      bgm_volume_changes.append((current_time_ms, -120.0, 0))
      bgm_segment = endBGM(bgm_segment, current_time_ms - bgm_start_time_ms, bgm_volume_changes, bgm_start_time_ms)
      bgm_segments.append((bgm_start_time_ms, bgm_segment))
      bgm_volume_changes = []
      bgm_segment = None


    # Concatenate all audio segments.
    # NOTE(review): audio_segments[0] raises IndexError when no segment was produced.
    final_audio = audio_segments[0]
    for segment in audio_segments[1:]:
        final_audio += segment

    # Mix each finished BGM onto the concatenated audio at its start time.
    for start_time_ms, segment in bgm_segments:
        final_audio = final_audio.overlay(segment, position=start_time_ms)

    final_audio.export(output_file_name, format="mp3")

    return chapters

# Directory that SOUND / BGM_START file names are resolved against,
# and the path the finished episode is exported to.
sound_directory = "/content/drive/MyDrive/easylazyscrap/sounds"
audio_file_path = os.path.join(export_directory, f"{basename}.mp3")

chapters = create_audio(
    podcast_script,
    audio_file_path,
    playback_speed=1.25,
    sound_effects_dir=sound_directory,
)

In [52]:
from IPython.display import Audio

# Show an inline audio player for the file at audio_file_path as the cell's output.
Audio(audio_file_path)
