In [21]:
# start by extracting the raw audio with ffmpeg
import subprocess
import os
from pathlib import Path

project_dir = os.path.join(Path.home(), Path("dev/HRI-experiments"))
video_dir = os.path.join(project_dir, "video_files")
print("Searching for video files in: " + video_dir)
audio_dir = os.path.join(project_dir, "audio_files")
print("Placing audio files in: " + audio_dir)

for in_file in os.listdir(video_dir):
    print("Found " + in_file + ", converting...")
    in_file = os.path.join(video_dir, in_file)
    out_file = os.path.join(audio_dir, Path(in_file).stem + ".wav")
    convert_command = "ffmpeg -i " + in_file + " -ab 160k -ac 2 -ar 16000 -vn " + out_file
    print("Running conversion: " + convert_command)
    subprocess.call(convert_command, shell=True)

Searching for video files in: /Users/scottloftin/dev/HRI-experiments/video_files
Placing audio files in: /Users/scottloftin/dev/HRI-experiments/audio_files
Found session_46_clipped.mov, converting...
Running conversion: ffmpeg -i /Users/scottloftin/dev/HRI-experiments/video_files/session_46_clipped.mov -ab 160k -ac 2 -ar 16000 -vn /Users/scottloftin/dev/HRI-experiments/audio_files/session_46_clipped.wav
Found robot_talking.mp4, converting...
Running conversion: ffmpeg -i /Users/scottloftin/dev/HRI-experiments/video_files/robot_talking.mp4 -ab 160k -ac 2 -ar 16000 -vn /Users/scottloftin/dev/HRI-experiments/audio_files/robot_talking.wav


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with clang version 4.0.1 (tags/RELEASE_401/final)
  configuration: --prefix=/opt/concourse/worker/volumes/live/d5b9ea1c-8223-4ff6-7416-83e6b4cd6874/volume/ffmpeg_1587154914508/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehol --cc=x86_64-apple-darwin13.4.0-clang --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilt

In [35]:
# Now lets load a wav file as numpy array
from scipy.io import wavfile
import numpy as np

audio_filename = os.path.join(audio_dir, "Counting_test.wav")
output = wavfile.read(audio_filename)
sampling_rate = output[0]
print(sampling_rate)
# This needs to be set for either stereo or mono
# samples = [a[1] for a in output[1]]
audio_data = np.array(output[1], dtype=float)
audio_data.shape

16000


(152576,)

In [36]:
import torch
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from datasets import load_dataset

model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

In [37]:
# ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
# inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")
inputs = processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"])

In [38]:
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription

['one two three four five six seven h']

In [26]:
import torch
from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
from datasets import load_dataset
import soundfile as sf

model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")


def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch


# ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# ds = ds.map(map_to_array)

# inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
inputs = processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"])

transcription = processor.batch_decode(generated_ids)

Config of the decoder: <class 'transformers.models.speech_to_text_2.modeling_speech_to_text_2.Speech2Text2ForCausalLM'> is overwritten by shared decoder config: Speech2Text2Config {
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_cross_attention": true,
  "architectures": [
    "Speech2TextForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "conv_channels": 1024,
  "conv_kernel_sizes": [
    5,
    5
  ],
  "d_model": 256,
  "decoder_attention_heads": 4,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 7,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 4,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "input_channels": 1,
  "input_feat_per_channel": 80,
  "is_decoder": true,
  "is_encoder_decoder": true,
  "max_length"

In [27]:
transcription

['</s> Ich habe mir die Frage gestellt, dass es in diesem Raum zu sehen ist, die man in der Lage ist, in der Nähe zu lernen, die man mit einem neuen Gespräch zu lernen. </s>']