In [338]:
import imageio  # for reading in video files

# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types

import io
import os
import sys

from __future__ import print_function, division, unicode_literals
import json
import csv
import numpy as np

import json

sys.path.insert(0, "/Users/anuj/coursework_cuboulder/spring_2018/algo_hri/torchMoji-master")

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_emojis
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

import subprocess

from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.silence import detect_nonsilent

import pickle

In [235]:
def convert_audio_to_mono(data_folder):
    """Down-mix ``myrecording.wav`` inside ``data_folder`` to one channel using sox.

    Args:
        data_folder: Directory that contains ``myrecording.wav``.  A trailing
            path separator is accepted but not required.

    Returns:
        Path of the mono file that was written
        (``mono_myrecording.wav`` inside ``data_folder``).

    Raises:
        subprocess.CalledProcessError: if sox exits with a non-zero status.
        OSError: if the ``sox`` executable cannot be started.
    """
    input_filename = os.path.join(data_folder, 'myrecording.wav')
    output_filename = os.path.join(data_folder, 'mono_myrecording.wav')
    command = ['sox', input_filename, '-c', '1', output_filename]
    # Block until sox has finished: a fire-and-forget Popen would let callers
    # open the output file while it is still missing or only partly written.
    subprocess.check_call(command)

    print('mono audio saved to', output_filename)
    return output_filename

In [None]:
def get_relative_joint_positions(data_folder, real_world_coords=True):
    """Load skeleton frames and express every joint relative to the torso.

    Args:
        data_folder: Directory holding the skeleton JSON export.  A trailing
            path separator is accepted but not required.
        real_world_coords: When true (the default) read
            ``realWorldCoords.json``; otherwise read ``projSpaceCoords.json``.

    Returns:
        A list with one dict per frame.  ``'time_ms'`` is copied through
        unchanged; every other key maps to ``[dx, dy, dz]``, the joint
        position minus the torso position of the same frame (so ``'torso'``
        itself is ``[0.0, 0.0, 0.0]``).

    Raises:
        KeyError: if a frame has no ``'torso'`` entry.
        ValueError: if a position string cannot be parsed.
    """
    def parse_position(position):
        # Positions are strings of three comma-separated numbers wrapped in
        # one leading and one trailing delimiter character, e.g. "(x,y,z)".
        x, y, z = position.split(',')
        return float(x[1:]), float(y), float(z[:-1])

    basename = 'realWorldCoords.json' if real_world_coords else 'projSpaceCoords.json'
    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(data_folder, basename)) as f:
        skeletal_data = json.load(f)

    output = []
    for frame in skeletal_data:
        torso_x, torso_y, torso_z = parse_position(frame['torso'])
        relative = {}
        for joint, position in frame.items():
            if joint == 'time_ms':
                # The timestamp is not a coordinate; keep it as-is.
                relative[joint] = position
                continue
            x, y, z = parse_position(position)
            relative[joint] = [x - torso_x, y - torso_y, z - torso_z]
        output.append(relative)
    return output

In [276]:
def get_prosody_features_from_audio(
        audio_filename,
        prosody_script='/Users/anuj/coursework_cuboulder/spring_2018/algo_hri/DisVoice-master/prosody/prosody.py'):
    """Run the DisVoice prosody script on one wav file.

    The features are written next to the audio file as
    ``prosody_<name>.txt``, where ``<name>`` is the audio file name up to its
    first ``.wav``.

    Args:
        audio_filename: Path of the wav file to analyse.
        prosody_script: Path of ``prosody.py``; defaults to the location this
            notebook was developed against.

    Returns:
        Path of the text file the script was asked to write.
    """
    directory, basename = os.path.split(audio_filename)
    name = basename.split('.wav')[0]
    output_filename = os.path.join(directory, 'prosody_' + name + '.txt')

    # Pass the arguments as a list (no shell) so paths containing spaces or
    # quotes reach the script intact and cannot be interpreted by a shell.
    command = ['python', prosody_script, audio_filename, output_filename, 'static', 'false']
    return_code = subprocess.call(command)
    if return_code != 0:
        print('WARNING: prosody extraction exited with status {0} for {1}'.format(
            return_code, audio_filename))
    else:
        print('prosody features saved to '+output_filename)
    return output_filename

In [318]:
def get_text_from_speech(audio_filename):
    """Transcribe a wav file with Google Cloud Speech and save the transcript.

    The transcript is written next to the audio file as ``text_<name>.txt``,
    where ``<name>`` is the audio file name up to its first ``.wav``.

    Args:
        audio_filename: Path of the wav file to transcribe (absolute or
            relative).

    Returns:
        The transcript as a string; ``''`` when the service returned no
        results.
    """
    # NOTE: this overrides any credentials already configured in the
    # environment and expects google_creds.json in the working directory.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "google_creds.json"

    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(audio_filename, 'rb') as audio_file:
        audio = types.RecognitionAudio(content=audio_file.read())

    config = types.RecognitionConfig(language_code='en-US')

    # Detects speech in the audio file.  Each result covers a consecutive
    # portion of the audio, so collect the top alternative of every result
    # instead of only the first one.
    response = client.recognize(config, audio)
    transcripts = [result.alternatives[0].transcript.strip()
                   for result in response.results
                   if result.alternatives]
    text = ' '.join(transcripts)

    # Build the output path from the audio file's own directory so relative
    # paths are not turned into (wrong) absolute ones.
    directory, basename = os.path.split(audio_filename)
    filename = os.path.join(directory, 'text_' + basename.split('.wav')[0] + '.txt')
    with open(filename, 'w') as f:
        f.write(text)
    print('converted text saved to '+filename)

    return text

In [333]:
# (tokenizer, model) pairs keyed by maxlen, so the vocabulary and the
# pretrained weights are loaded once instead of once per audio chunk.
_TORCHMOJI_CACHE = {}


def _load_torchmoji(maxlen):
    """Return a cached ``(tokenizer, model)`` pair for the given ``maxlen``."""
    if maxlen not in _TORCHMOJI_CACHE:
        with open(VOCAB_PATH, 'r') as f:
            vocabulary = json.load(f)
        _TORCHMOJI_CACHE[maxlen] = (SentenceTokenizer(vocabulary, maxlen),
                                    torchmoji_emojis(PRETRAINED_PATH))
    return _TORCHMOJI_CACHE[maxlen]


def _top_elements(array, k):
    """Return the indices of the ``k`` largest entries, largest first."""
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


def get_emotion_features_from_text(text, audio_filename):
    """Predict the top-5 torchMoji emojis for ``text`` and pickle the result.

    Two pickle files are written next to the audio file, where ``<name>`` is
    the audio file name up to its first ``.wav``:
    ``onehot_emotion_<name>.pkl`` and ``emoji_ids_<name>.pkl``.

    Args:
        text: Sentence to classify.  An empty (or ``None``) text yields empty
            outputs without running the model.
        audio_filename: Path of the audio chunk the text came from; only used
            to derive the output file names.

    Returns:
        ``(emoji_ids, one_hot_encodings)``: the five emoji ids (0-63, see
        emoji_overview.png at the root of the torchMoji repo) ordered from
        most to least probable, and one 64-long one-hot list per id.
    """
    num_emojis = 64
    top_k = 5

    if not text:
        emoji_ids = []
        one_hot_encodings = []
    else:
        tokenizer, model = _load_torchmoji(maxlen=30)
        tokenized, _, _ = tokenizer.tokenize_sentences([text])
        prob = model(tokenized)
        # A single sentence was submitted, so only row 0 is relevant.
        emoji_ids = list(_top_elements(prob[0], top_k))
        one_hot_encodings = [
            [1 if i == emoji_idx else 0 for i in range(num_emojis)]
            for emoji_idx in emoji_ids
        ]

    # Derive the output paths from the audio file's own directory so relative
    # paths are not turned into (wrong) absolute ones.
    directory, basename = os.path.split(audio_filename)
    name = basename.split('.wav')[0]

    filename = os.path.join(directory, 'onehot_emotion_' + name + '.pkl')
    with open(filename, 'wb') as f:
        pickle.dump(one_hot_encodings, f)
    print('one hot encoded emotion vector saved to '+filename)

    filename = os.path.join(directory, 'emoji_ids_' + name + '.pkl')
    with open(filename, 'wb') as f:
        pickle.dump(emoji_ids, f)
    print('emoji ids saved to '+filename)

    return emoji_ids, one_hot_encodings

# MAIN

In [339]:
data_folder = '/Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/'

# Silence-detection settings, shared by detect_nonsilent and split_on_silence
# so both calls use the same values.
MIN_SILENCE_LEN_MS = 500    # must be silent for at least half a second
SILENCE_THRESH_DBFS = -50   # consider it silent if quieter than -50 dBFS

mono_audio_filename = convert_audio_to_mono(data_folder)

###### get skeleton frames for each non-silent audio portion ######

skeleton_dict = get_relative_joint_positions(data_folder)

# Load the audio BEFORE it is first used: detect_nonsilent needs sound_file,
# which previously was only defined further down (NameError on a fresh kernel).
sound_file = AudioSegment.from_wav(mono_audio_filename)

audio_portions = detect_nonsilent(sound_file,
                                  min_silence_len=MIN_SILENCE_LEN_MS,
                                  silence_thresh=SILENCE_THRESH_DBFS,
                                  seek_step=1)
skeleton_frames = []
for pair in audio_portions:
    start = pair[0]
    end = pair[1]
    # Keep the frames whose timestamp falls inside this non-silent portion,
    # dropping the timestamp itself from the stored copy.
    fr = [
        {joint: position for joint, position in d.items() if joint != 'time_ms'}
        for d in skeleton_dict
        if start <= d['time_ms'] <= end
    ]
    skeleton_frames.append(fr)

with open(data_folder+'relative_joint_positions.json', 'w') as f:
    json.dump(skeleton_frames, f)

audio_chunks = split_on_silence(sound_file,
    min_silence_len=MIN_SILENCE_LEN_MS,
    silence_thresh=SILENCE_THRESH_DBFS
)

###### get feature vectors for each non-silent audio portion #######

all_prosody_vectors = []
all_emoji_ids = []  # collected per chunk, but not consumed later in this cell
all_emotion_vectors = []

if not os.path.isdir(data_folder+'chunks/'):
    os.mkdir(data_folder+'chunks/')

for i, chunk in enumerate(audio_chunks):
    print('#############')
    out_file = data_folder+"chunks/chunk{0}.wav".format(i)
    print("exporting", out_file)
    chunk.export(out_file, format="wav")

    ###### prosody features #######

    prosody_features_filename = get_prosody_features_from_audio(out_file)
    with open(prosody_features_filename, 'r') as f:
        # split() tolerates a trailing newline or repeated spaces, which the
        # former "[:-1].split(' ')" did not.
        prosody_features = list(map(float, f.read().split()))
    all_prosody_vectors.append(prosody_features)

    ###### emotion features #######

    converted_text = get_text_from_speech(out_file)
    emoji_ids, emotion_features = get_emotion_features_from_text(converted_text, out_file)
    all_emoji_ids.append(emoji_ids)
    all_emotion_vectors.append(emotion_features)

mono audio saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/mono_myrecording.wav
#############
exporting /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/chunk0.wav
prosody features saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/prosody_chunk0.txt
converted text saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/text_chunk0.txt


  input = module(input)


one hot encoded emotion vector saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/onehot_emotion_chunk0.pkl
emoji ids saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/emoji_ids_chunk0.pkl
#############
exporting /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/chunk1.wav
prosody features saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/prosody_chunk1.txt
converted text saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/text_chunk1.txt
one hot encoded emotion vector saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/onehot_emotion_chunk1.pkl
emoji ids saved to /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/emoji_ids_chunk1.pkl
#############
exporting /Users/anuj/coursework_cuboulder/spring_2018/algo_hri/data/data3/chunks/chunk2.wav
prosody features saved to /Users/anuj/coursework_cuboulde

In [323]:
# Sanity check: one entry is appended to each of these lists per audio chunk,
# so the three lengths are expected to be equal.
len(all_prosody_vectors), len(all_emotion_vectors), len(all_emoji_ids)

(14, 14, 14)

# JUNK

In [None]:
def read_video_file(filename):
    """Open ``filename`` through imageio's ``'ffmpeg'`` format and return the reader."""
    return imageio.get_reader(filename, 'ffmpeg')

def extract_audio_from_video(video_filename):
    """Extract the audio track of a video into a wav file using ffmpeg.

    The output path is ``video_filename`` with a trailing ``.mp4`` (if any)
    replaced by ``.wav``.

    Args:
        video_filename: Path of the input video.  It may contain spaces or
            other shell metacharacters; no manual escaping is needed.

    Returns:
        Path of the wav file ffmpeg was asked to write.
    """
    stem = video_filename
    if stem.endswith('.mp4'):
        stem = stem[:-len('.mp4')]
    output_audio_filename = stem + '.wav'

    # Pass the arguments as a list (no shell) so the file name reaches ffmpeg
    # verbatim instead of relying on backslash-escaping of spaces only.
    command = ['ffmpeg', '-i', video_filename,
               '-ab', '160k', '-ac', '2', '-ar', '44100', '-vn',
               output_audio_filename]
    subprocess.call(command)

    print('audio saved to', output_audio_filename)
    return output_audio_filename

import os
import json

def get_skeleton_frames_consolidated_dict(directory):
    """Collect the per-frame skeleton JSON files of ``directory`` into one dict.

    Files are visited in sorted file-name order and numbered from 0.

    Args:
        directory: Folder containing one JSON file per frame.  A trailing
            path separator is accepted but not required.

    Returns:
        Dict mapping frame index to that file's first ``'part_candidates'``
        entry (part number: [x,y,z,c] -- pixel coordinates, per the original
        author's note).
    """
    # key = frame index
    # value = parts position dictionary (part number: [x,y,z,c] -- pixel coordinates)
    skeleton_frames_dict = {}

    # TODO: why are some part positions empty?
    # TODO: get 3D coordinates instead of 2D
    # NOTE(review): frame order relies on file names sorting lexicographically
    # (e.g. zero-padded frame numbers) -- confirm against the exporter.
    for idx, skeleton_frame_filename in enumerate(sorted(os.listdir(directory))):
        # Context manager closes each file instead of leaking one handle per frame.
        with open(os.path.join(directory, skeleton_frame_filename)) as f:
            json_contents = json.load(f)
        skeleton_frames_dict[idx] = json_contents['part_candidates'][0]

    return skeleton_frames_dict

"""
Check if number of frames extracted from original video == number of frames from OpenPose
"""

# Inputs for the comparison: the recorded video and the folder of per-frame
# skeleton JSON files (relative to the notebook's working directory).
video_filename = 'WhatsApp Video 2018-04-12 at 10.16.35 PM.mp4'
skeleton_json_folder = 'json/'

# Open the video through imageio's ffmpeg reader.
vid = read_video_file(video_filename)

# Load every skeleton frame into a dict keyed by frame index.
skeleton_frames_dict = get_skeleton_frames_consolidated_dict(skeleton_json_folder)

# Evaluates to a tuple: the 'nframes' value from the reader's metadata
# (presumably the video's frame count -- confirm against imageio) and the
# number of skeleton frames loaded.
vid.get_meta_data()['nframes'], len(skeleton_frames_dict.keys())

In [None]:
# Sample from the recording to get some audio?
# Convert speech to text frame by frame...
# Assuming the text is perfect, how should the frames be segmented by sentence?

In [None]:
def get_ms(s, frame_rate):
    """Return the timestamp ``s`` unchanged.

    ``frame_rate`` is accepted but ignored: the scaling ``(s/1000)*frame_rate``
    is disabled, so the value passes straight through.
    """
    timestamp = s
    return timestamp

# Turn every [start, end] pair of non-silent audio into a tuple of two ints,
# passing each bound through get_ms together with the audio's frame rate.
ls = []
for pair in audio_portions:
    start, end = pair[0], pair[1]
    ls.append(tuple(int(get_ms(bound, sound_file.frame_rate)) for bound in (start, end)))

In [None]:
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri.

    Starts a long-running recognition operation, blocks until it completes
    and returns the transcript of the whole file.

    Args:
        gcs_uri: URI of the audio file, e.g. ``gs://bucket/file.wav``.

    Returns:
        The most likely transcript of every recognised portion, joined with
        single spaces; ``''`` when nothing was recognised.
    """
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result()

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file; the first
    # alternative is the most likely one for each portion.  Returning only
    # results[0] dropped the rest and raised IndexError on an empty response.
    transcripts = [result.alternatives[0].transcript.strip()
                   for result in response.results
                   if result.alternatives]
    return ' '.join(transcripts)
# Transcribe the recording stored at this Cloud Storage URI; as the cell's
# last expression, the returned transcript is displayed.
transcribe_gcs('gs://tttanuj/mono_myrecording.wav')

In [None]:
#first segment audio from text?
# find pauses instead of ends of sentences?

#len(sound_file) is in milliseconds, frame_rate is in hertz

from scipy.io import wavfile

# wavfile.read returns (sample_rate, samples); `audio` keeps the original
# tuple so audio[0] / audio[1] stay available.
audio = wavfile.read(data_folder+'mono_myrecording.wav')
sample_rate, samples = audio

#sampling 44100 times per second, 95 seconds

# wavfile.write does not create missing directories.
snippet_folder = data_folder+'audio/'
if not os.path.isdir(snippet_folder):
    os.makedirs(snippet_folder)

# Cut the recording into one snippet per skeleton frame: snippet i holds the
# samples between the previous frame's timestamp and frame i's timestamp.
# `skeleton_dict` (built in the MAIN cell) is used here because
# `skeletal_data` only exists as a local inside get_relative_joint_positions.
prev_time_idx = 0
snippets = []
for idx, d in enumerate(skeleton_dict):
    curr_time_idx = int(sample_rate*(d['time_ms']/1000.0))
    snippet = samples[prev_time_idx:curr_time_idx]
    prev_time_idx = curr_time_idx
    wavfile.write(snippet_folder+str(idx)+'.wav', sample_rate, snippet)
    snippets.append(snippet)

In [None]:
# Group skeleton frames by non-silent audio portion: for every [start, end]
# pair, collect the frames whose 'time_ms' lies inside it (bounds inclusive).
# NOTE(review): `sk` is not defined anywhere in the visible notebook --
# presumably the list of skeleton frame dicts (cf. `skeleton_dict` in the
# MAIN cell); confirm before running on a fresh kernel.
frames = []
for pair in audio_portions:
    start = pair[0]
    end = pair[1]
    fr = []
    for d in sk:
        if d['time_ms'] >= start and d['time_ms'] <= end:
            fr.append(d)
    frames.append(fr)

# Print the frame count of each portion next to the running total.
s=0
for x in frames:
    s+=len(x)
    print(len(x),s)