# Machine Learning Engineer Capstone

## Preprocessing

In [14]:
import glob
import subprocess
import json
import os
import csv
from tqdm import tnrange, tqdm_notebook

# returns the number of secs in video
def video_length(path):
    cmd = "ffprobe -i " + path + " -show_entries format=duration -v quiet -of json"
    pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout
    output = pipe.read()
    d = json.loads(output)
    s = d["format"]["duration"]
    return int(float(s))

# returns the id of a video in the ./data/videos dir
def video_id(path):
    return path.split("/")[3].split(".")[0]

def clip_dir_path(path):
    vid_id = video_id(path)
    return "./data/clips/" + vid_id

# creates a folder with one sec clips from the source video
# takes about 30 mins for a 20 min video
def create_clips(path):
    # create clip dir
    dir_path = clip_dir_path(path)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create one sec clips from src
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Clips for " + video_id(path)):
        clip_path = dir_path + "/" + '%05d' % i + ".mp4"    
        if not os.path.exists(clip_path):
            cmd = "ffmpeg -v error -y -i " + path + " -ss " + str(i) + " -t 1 " + clip_path
            os.system(cmd)

# creates folders with frames for each clip of source video
def create_frames(path):
    # create frame dir
    vid_id = video_id(path)
    dir_path = "./data/frames/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create frames from clip
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Frames for " + vid_id):
        clip_path = clip_dir_path(path) + "/" + '%05d' % i + ".mp4"
        frame_dir_path = dir_path + "/" + '%05d' % i
        if not os.path.exists(frame_dir_path):
            os.makedirs(frame_dir_path)
            cmd = "ffmpeg -v error -y -i " + clip_path + " -r 5.0 " + frame_dir_path + "/%5d.jpg"
            os.system(cmd)

def create_spectrograms(path):
    # create audio dir
    vid_id = video_id(path)
    dir_path = "./data/audio/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create spectrogram from clip
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Spectrograms for " + vid_id):
        clip_path = clip_dir_path(path) + "/" + '%05d' % i + ".mp4"
        spec_path = dir_path + "/" + '%05d' % i + ".png"
        if not os.path.exists(spec_path):
            cmd = "ffmpeg -v error -y -i " + clip_path + " -lavfi showspectrumpic=s=32x32:legend=false " + spec_path
            os.system(cmd)
            
        
video_paths = glob.glob("./data/videos/*.mp4")
videos_len = len(video_paths)
for i in tqdm_notebook(xrange(videos_len), desc="Preprocessing Videos"):
    path = video_paths[i]
    create_clips(path)
    create_frames(path)
    create_spectrograms(path)




## Read in Labels

In [1]:
import pandas as pd
import glob

labels = pd.read_csv("./data/labels.csv").as_matrix()
print "Labels Shape: {}".format(labels.shape)

Labels Shape: (1872, 3)


In [None]:
# write label to image mapping file
with open('data/audio_labels.txt', 'a') as f:
    for label in labels:
        vid_id = label[0]
        clip_id = label[1]
        value = 0 if label[2] == 0 else 1 
        img_path = "./data/audio/" + vid_id + "/" + '%05d' % clip_id + ".png"
        line = img_path + " " + '%d' % value + "\n"
        f.write(line)

## Create Audio Input

In [2]:
import numpy as np
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_utils import image_preloader

# Load path/class_id image file:
dataset_file = 'data/audio_labels.txt'
dataset_sample = 'data/audio_labels_sample.txt'

# Build the preloader array, resize images to 300x300
from tflearn.data_utils import image_preloader
X, Y = image_preloader(dataset_file, 
                       image_shape=(32, 32),
                       mode='file', 
                       categorical_labels=True,   
                       normalize=True)

# Real-time data preprocessing
img_prep = ImagePreprocessing()
img_prep.add_featurewise_zero_center()
img_prep.add_featurewise_stdnorm()

# Building convolutional network
network = input_data(shape=[None, 32, 32, 3],
                     data_preprocessing=img_prep)
network = conv_2d(network, 32, 3, activation='relu')
network = max_pool_2d(network, 2)
network = conv_2d(network, 64, 3, activation='relu')
network = conv_2d(network, 64, 3, activation='relu')
network = max_pool_2d(network, 2)
network = fully_connected(network, 512, activation='relu')
network = dropout(network, 0.5)
network = fully_connected(network, 2, activation='softmax')
network = regression(network,
                     optimizer='adam',
                     loss='categorical_crossentropy',
                     learning_rate=0.001)

# Training
model = tflearn.DNN(network,tensorboard_verbose=3)
model.fit(X, Y, 
          n_epoch=4,
          snapshot_step=5, 
          show_metric=True,
          validation_set=0.2,
          batch_size=50,
          shuffle=True,
          run_id="audio-1")

model.save("shot_audio.model")

Training Step: 239  | total loss: [1m[32m0.36329[0m[0m | time: 36.637s
| Adam | epoch: 004 | loss: 0.36329 - acc: 0.8292 -- iter: 2950/2995
Training Step: 240  | total loss: [1m[32m0.35939[0m[0m | time: 38.083s
| Adam | epoch: 004 | loss: 0.35939 - acc: 0.8303 | val_loss: 0.37591 - val_acc: 0.8171 -- iter: 2995/2995
--
INFO:tensorflow:/Users/logan/Documents/udacity/capstone/shot_audio.model is not in all_model_checkpoint_paths. Manually adding it.


## Prediction

In [12]:
X_test, Y_test = image_preloader(dataset_sample, 
                       image_shape=(32, 32),
                       mode='file', 
                       categorical_labels=True,   
                       normalize=True)

Y_predict = model.predict(X_test[:])

m = {}
for i in range(len(Y_predict)):
    pred_0, pred_1 = int(round(Y_predict[i][0])), int(round(Y_predict[i][1]))
    test_0, test_1 = int(round(Y_test[i][0])), int(round(Y_test[i][1]))
    key = "{}-{}-{}-{}".format(pred_0, pred_1, test_0, test_1)
    if key == '1-0-1-0':
        key = "tn"
    elif key == '0-1-0-1':
        key = "tp"
    elif key == '0-1-1-0':
        key = "fp"
    elif key == '1-0-0-1':
        key = "fn"
    if key in m:
        m[key] += 1
    else:
        m[key] = 0

print m

{'tn': 123, 'fp': 41, 'fn': 0, 'tp': 32}
