In [34]:
import imageio  # for reading in video files

# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types

import io
import os
import sys

from __future__ import print_function, division, unicode_literals
import json
import csv
import numpy as np

sys.path.insert(0, "/Users/anuj/coursework_cuboulder/spring_2018/algo_hri/torchMoji-master")

from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.model_def import torchmoji_emojis
from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

import subprocess

In [3]:
def read_video_file(filename):
    """Open a video file for reading through imageio's ffmpeg backend.

    Args:
        filename: Path of the video file to open.

    Returns:
        The reader object returned by ``imageio.get_reader``.
    """
    return imageio.get_reader(filename, 'ffmpeg')

In [4]:
def extract_audio_from_video(video_filename):
    """Extract the audio track of a video into a WAV file using ffmpeg.

    The output is written next to the input: a trailing ``.mp4`` extension
    is replaced by ``.wav``; for any other file name ``.wav`` is appended.

    Args:
        video_filename: Path of the input video. It is handed to ffmpeg as a
            single argument, so spaces and shell metacharacters are safe.

    Returns:
        Path of the WAV file ffmpeg was asked to write.
    """
    root, ext = os.path.splitext(video_filename)
    stem = root if ext.lower() == '.mp4' else video_filename
    output_audio_filename = stem + '.wav'

    # An argument list (no shell) keeps file names containing spaces, quotes,
    # ';', '&', '$' etc. intact and rules out shell injection.
    command = [
        'ffmpeg', '-i', video_filename,
        '-ab', '160k',
        '-ac', '2',
        '-ar', '44100',
        '-vn',
        output_audio_filename,
    ]
    return_code = subprocess.call(command)

    # Only claim success when ffmpeg actually reported success.
    if return_code == 0:
        print('audio saved to', output_audio_filename)
    else:
        print('ffmpeg failed with exit status', return_code)

    return output_audio_filename

In [5]:
import os
import json

def get_skeleton_frames_consolidated_dict(directory):
    """Merge per-frame skeleton JSON files into a single dictionary.

    Files whose name ends in ``.json`` are read in sorted file-name order;
    a file's position in that order is used as its frame index. Other
    directory entries (e.g. ``.DS_Store``) are ignored.

    Args:
        directory: Folder holding one JSON file per frame. A trailing path
            separator is optional.

    Returns:
        dict: frame index (int, starting at 0) -> first element of that
        file's ``part_candidates`` list, i.e. the parts position dictionary
        (part number: [x, y, z, c] -- pixel coordinates).

    Raises:
        KeyError / IndexError: if a file has no ``part_candidates`` entry or
            the list is empty.
    """
    # TODO: why are some part positions empty?
    # TODO: get 3D coordinates instead of 2D
    frame_filenames = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith('.json')
    )

    skeleton_frames_dict = {}
    for idx, frame_filename in enumerate(frame_filenames):
        # os.path.join works with or without a trailing separator, and the
        # context manager closes each file as soon as it has been parsed.
        with open(os.path.join(directory, frame_filename)) as frame_file:
            json_contents = json.load(frame_file)
        skeleton_frames_dict[idx] = json_contents['part_candidates'][0]

    return skeleton_frames_dict

In [7]:
# Sanity check: does the number of frames extracted from the original video
# equal the number of frames produced by OpenPose?

skeleton_json_folder = 'json/'
video_filename = 'WhatsApp Video 2018-04-12 at 10.16.35 PM.mp4'

vid = read_video_file(video_filename)
skeleton_frames_dict = get_skeleton_frames_consolidated_dict(skeleton_json_folder)

# Displayed as (video frame count, skeleton frame count).
(vid.get_meta_data()['nframes'], len(skeleton_frames_dict))



(1146, 90)

In [8]:
def convert_audio_to_mono(audio_filename):
    """Down-mix an audio file to a single channel using sox.

    The result is written next to the input, with ``mono_`` prefixed to the
    file name (not to the whole path). The call blocks until sox has exited,
    so the returned file is complete when this function returns.

    Args:
        audio_filename: Path of the audio file to convert.

    Returns:
        Path of the mono audio file.

    Raises:
        subprocess.CalledProcessError: if sox exits with a non-zero status.
        OSError: if the sox executable cannot be launched.
    """
    directory, basename = os.path.split(audio_filename)
    output_filename = os.path.join(directory, 'mono_' + basename)

    command = ['sox', audio_filename, '-c', '1', output_filename]
    # check_call waits for sox to finish; a fire-and-forget Popen would let
    # callers open the output before it has been (fully) written.
    subprocess.check_call(command)

    print('mono audio saved to', output_filename)
    return output_filename

In [16]:
def get_text_from_speech(audio_filename):
    """Transcribe an audio file with the Google Cloud Speech client.

    The audio is first converted to mono via ``convert_audio_to_mono`` and
    then sent in a single synchronous ``recognize`` request with language
    code ``en-US``. Credentials are read from ``google_creds.json`` in the
    working directory.

    Args:
        audio_filename: Path of the audio file to transcribe.

    Returns:
        The transcript: the top alternative of every result in the response,
        joined with single spaces. An empty string if the service returned
        no results (e.g. no speech was recognized).
    """
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "google_creds.json"

    audio_filename = convert_audio_to_mono(audio_filename)

    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(audio_filename, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    # The response may hold zero, one or several results; keep the top
    # alternative of each instead of indexing results[0] unconditionally.
    transcripts = [
        result.alternatives[0].transcript.strip()
        for result in response.results
        if result.alternatives
    ]
    return ' '.join(transcripts)

In [45]:
def get_emotion_features_from_text(text, top_k=5):
    """Predict the most likely torchMoji emojis for a piece of text.

    The vocabulary at ``VOCAB_PATH`` and the pretrained model at
    ``PRETRAINED_PATH`` are loaded on every call.

    Args:
        text: The sentence to analyse (a single string).
        top_k: How many of the highest-scoring emojis to return. Defaults
            to 5.

    Returns:
        A tuple ``(emoji_ids, one_hot_encodings)``:
        emoji_ids -- list of the ``top_k`` emoji ids, highest score first.
            Emoji ids (0-63) correspond to the mapping in emoji_overview.png
            at the root of the torchMoji repo.
        one_hot_encodings -- one list of 64 ints per emoji id, holding 1 at
            the emoji's index and 0 everywhere else.
    """
    def top_elements(array, k):
        # Indices of the k largest entries of `array`, largest first.
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    maxlen = 30
    num_emojis = 64  # emoji ids run from 0 to 63

    print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_emojis(PRETRAINED_PATH)
    print(model)
    print('Running predictions.')
    # The tokenizer takes a list of sentences; we pass exactly one.
    tokenized, _, _ = st.tokenize_sentences([text])
    prob = model(tokenized)

    # prob[0] holds the per-emoji scores of our single sentence.
    emoji_ids = list(top_elements(prob[0], top_k))
    one_hot_encodings = [
        [1 if i == emoji_idx else 0 for i in range(num_emojis)]
        for emoji_idx in emoji_ids
    ]
    return emoji_ids, one_hot_encodings

In [50]:
def get_prosody_features_from_audio(
        audio_filename,
        prosody_script='/Users/anuj/coursework_cuboulder/spring_2018/algo_hri/DisVoice-master/prosody/prosody.py'):
    """Run DisVoice's prosody.py on an audio file and save its features.

    The audio is first converted to mono via ``convert_audio_to_mono``. The
    features file is written next to the mono file, named
    ``prosody_<mono file name>.txt``.

    Args:
        audio_filename: Path of the audio file to analyse.
        prosody_script: Path of DisVoice's ``prosody.py``. Defaults to the
            location used on the original author's machine.

    Returns:
        Path of the features text file prosody.py was asked to write.
    """
    mono_filename = convert_audio_to_mono(audio_filename)

    # Prefix the file name only, so inputs with a directory part still work.
    directory, basename = os.path.split(mono_filename)
    output_filename = os.path.join(directory, 'prosody_' + basename + '.txt')

    # Argument list instead of a shell string: no quoting problems and no
    # shell injection through the file name. "static" and "false" are passed
    # through to prosody.py unchanged.
    command = ['python', prosody_script, mono_filename, output_filename,
               'static', 'false']
    subprocess.call(command)

    # Report the real output path, including the '.txt' suffix.
    print('prosody features saved to', output_filename)
    return output_filename

In [None]:
# TODO: convert speech to text frame by frame