<a href="https://colab.research.google.com/github/olaviinha/SloppyButchery/blob/main/PoeticButcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#<font face="Trebuchet MS" size="6">Poetic Butcher <font color="#999" size="3">v 0.0.1<font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><font size="4">Sloppy Butchery @</font> <a href="https://github.com/olaviinha/SloppyButchery" target="_blank"><font color="#999" size="4">Github</font></a><font color="#999" size="4">&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;</font><font size="3" color="#999"><a href="https://inha.se" target="_blank"><font color="#999">O. Inha</font></a></font></font>

Poetic Butcher is a speech slicer. It slices speech to individual words using [Mozilla DeepSpeech](https://github.com/mozilla/DeepSpeech) RNN, and exports each word as a separate WAV file.

<font size="5">Howto</font>
1. Input a path to an audio file located in your Google Drive.
2. Hit <i>Runtime > Run all</i>.

DeepSpeech was not designed for this task, so its word timings are not perfectly accurate. You may want to fine-tune the start and end time of each word (in milliseconds) with the `start_ms` and `end_ms` sliders.

WAV files will be saved under the same directory where your input file is located, in subdirectory `<input_audio>_words`

<hr size="1" color="#666"/>

In [None]:
#@title #Setup
#@markdown This cell needs to be run only once. It will mount your Google Drive and setup prerequisities.

pip_packages = 'deepspeech-gpu pysoundfile'
apt_packages = 'sox'

import os
from google.colab import output

# inhagcutils
if not os.path.isfile('/content/inhagcutils.ipynb'):
  %cd /content/
  !apt-get update -qq && apt-get install -qq {apt_packages}
  !pip -q install import-ipynb {pip_packages}
  !curl -s -O https://raw.githubusercontent.com/olaviinha/inhagcutils/master/inhagcutils.ipynb
  import import_ipynb
  from inhagcutils import *

# Mount Drive
if not os.path.isdir('/content/drive'):
  from google.colab import drive
  drive.mount('/content/drive')

# Drive symlink
if not os.path.isdir('/content/mydrive'):
  os.symlink('/content/drive/My Drive', '/content/mydrive')
  drive_root_set = True
drive_root = '/content/mydrive/'

# Download models
if not 'models_downloaded' in globals():
  !curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.8.1/deepspeech-0.8.1-models.pbmm
  !curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.8.1/deepspeech-0.8.1-models.scorer
  models_downloaded = True

dir_tmp = '/content/tmp/'
create_dirs([dir_tmp])

last_input_audio = ''

output.clear()
op(c.ok, 'Setup finished.')

In [None]:
#@title Slice
input = "" #@param {type:"string"}

#@markdown <small>Finetune the start and end time of each exported WAV (in milliseconds).</small>
start_ms = -100 #@param {type:"slider", min:-100, max:100, step:5}
end_ms = -85 #@param {type:"slider", min:-100, max:100, step:5}

#@markdown <small>Create preview players inside this notebook –OR– save WAV files directly to your Drive.</small>
mode = "preview" #@param ["preview", "save_to_drive"]

# Length of the fade applied to both edges of every exported clip, in ms.
fade_ms = 3
# Drive-rooted form of the `input` field. This must be built from `input`:
# `input_audio` itself is not defined before this line.
input_audio = drive_root+input
id = rnd_str(8)
# Sample rate used for loading, slicing and writing audio.
global_sr = 44100
# Millisecond settings converted to seconds for the slicing code.
global_fade = fade_ms/1000
head = start_ms/1000
tail = end_ms/1000

# Scratch directory that copied/downloaded input audio is staged into.
dir_tmp_input = dir_tmp+'input/'
create_dirs([dir_tmp_input])

# Compared against 'file', 'dir' and 'youtube' further down in this cell.
input_type = check_input_type(input)

def clip_audio(audio_data, start, duration, sr=global_sr):
  """Cut a time window out of `audio_data` and fade its edges.

  Args:
    audio_data: NumPy sample array with time on the last axis.
    start: Window start, in seconds.
    duration: Window length, in seconds.
    sr: Sample rate used to convert seconds to sample indices.

  Returns:
    The result of `fade_audio` applied to the selected window.
  """
  first_sample = librosa.time_to_samples(start, sr=sr)
  end_sample = librosa.time_to_samples(start+duration, sr=sr)
  # Index with an ellipsis so the slice is taken along the last axis without
  # requiring a leading channel axis: librosa.load(..., mono=False) yields a
  # 1-D array for single-channel files.
  window = audio_data[..., first_sample:end_sample]
  # Forward sr so the fade lengths use the same rate as the slice.
  return fade_audio(window, sr=sr)

def fade_audio(audio_data, fade_in=global_fade, fade_out=global_fade, sr=global_sr):
  """Apply a linear fade-in and fade-out to `audio_data`.

  The fade-in scales the first samples by 0, 1/n, ..., (n-1)/n and the
  fade-out scales the last samples by 1, (n-1)/n, ..., 1/n, where n is the
  number of samples in the respective fade region.

  Args:
    audio_data: NumPy sample array with time on the last axis (1-D, or
      2-D with one row per channel).
    fade_in: Fade-in length in seconds; values <= 0 disable it.
    fade_out: Fade-out length in seconds; values <= 0 disable it.
    sr: Sample rate used to convert seconds to sample counts.

  Returns:
    An array with the same number of samples as the input. When no fade is
    applied the input is returned unchanged.
  """
  total = audio_data.shape[-1]
  # `y` is passed by keyword: newer librosa releases reject it positionally.
  a_duration = librosa.get_duration(y=audio_data, sr=sr)
  if fade_in > 0:
    # Never fade more samples than the clip contains.
    n_in = min(int(librosa.time_to_samples(fade_in, sr=sr)), total)
    if n_in > 0:
      ramp_up = np.arange(n_in) / n_in
      # The remainder starts exactly at n_in so no sample is lost at the seam.
      audio_data = np.concatenate(
        [audio_data[..., :n_in] * ramp_up, audio_data[..., n_in:]], axis=-1)
  if fade_out > 0:
    # Clamp to 0 for clips shorter than the fade; a negative index would
    # otherwise be interpreted relative to the end of the array.
    out_start = max(int(librosa.time_to_samples(a_duration-fade_out, sr=sr)), 0)
    n_out = total - out_start
    if n_out > 0:
      ramp_down = 1.0 - np.arange(n_out) / n_out
      audio_data = np.concatenate(
        [audio_data[..., :out_start], audio_data[..., out_start:] * ramp_down], axis=-1)
  return audio_data

def save(audio_data, save_as, sr=global_sr):
  """Write `audio_data` to the file `save_as` at sample rate `sr`.

  The array is transposed before being handed to `soundfile.write`
  (presumably because the clips are stored channels-first while soundfile
  expects frames-first -- confirm against the soundfile docs).
  """
  frames = audio_data.T
  soundfile.write(save_as, frames, sr)

if input_type == 'file':
  !cp {input} {dir_tmp_input}
  target = dir_tmp_input
if input_type == 'dir':
  target = input
if input_type == 'youtube':
  !pip -q install youtube-dl ffmpeg
  !youtube-dl --restrict-filenames -x --no-continue --audio-format wav -o "{dir_tmp_input}%(title)s.%(ext)s" {input}
  target = dir_tmp_input

import json, soundfile, librosa

file_list = list_audio(target)
for audiofile in file_list:
  file_id = slug(basename(audiofile))
  dir_output = path_dir(input)+file_id+'_words/'
  !deepspeech --model=deepspeech-0.8.1-models.pbmm --scorer=deepspeech-0.8.1-models.scorer --audio='{audiofile}' --json --candidate_transcripts=1 > {dir_tmp}{file_id}.json
  with open(dir_tmp+file_id'.json', 'r') as j:
    json_data = json.load(j)
  y, sr = librosa.load(audiofile, mono=False, sr=global_sr)
  for i, word in enumerate(json_data['transcripts'][0]['words']):
    start = word['start_time'] + head
    if start < 0:
      start = 0
    duration = word['duration'] + abs(head) + tail
    word_audio = clip_audio(y, start, duration)
    if mode == 'save_to_drive':
      save_as = str(i).zfill(4)+'_'+word['word']+'.wav'
      save(word_audio, dir_output+save_as)
      print('Saved', save_as)
    else:
      print(word['word'])
      audio_player(word_audio)
    