# Machine Learning Engineer Capstone

## Preprocessing

In [None]:
import glob
import subprocess
import json
import os
import csv
from tqdm import tnrange, tqdm_notebook
from keras.preprocessing import image
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model, load_model
from keras.layers import Input
import numpy as np

# taken from https://github.com/harvitronix/five-video-classification-methods
class Extractor():
    def __init__(self, weights=None):
        """Either load pretrained from imagenet, or load our saved
        weights from our own training."""

        self.weights = weights  # so we can check elsewhere which model

        if weights is None:
            # Get model with pretrained weights.
            base_model = InceptionV3(
                weights='imagenet',
                include_top=True
            )

            # We'll extract features at the final pool layer.
            self.model = Model(
                inputs=base_model.input,
                outputs=base_model.get_layer('avg_pool').output
            )

        else:
            # Load the model first.
            self.model = load_model(weights)

            # Then remove the top so we get features not predictions.
            # From: https://github.com/fchollet/keras/issues/2371
            self.model.layers.pop()
            self.model.layers.pop()  # two pops to get to pool layer
            self.model.outputs = [self.model.layers[-1].output]
            self.model.output_layers = [self.model.layers[-1]]
            self.model.layers[-1].outbound_nodes = []

    def extract(self, image_path):
        img = image.load_img(image_path, target_size=(299, 299))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)

        # Get the prediction.
        features = self.model.predict(x)

        if self.weights is None:
            # For imagenet/default network:
            features = features[0]
        else:
            # For loaded network:
            features = features[0]

        return features

# returns the number of secs in video
def video_length(path):
    cmd = "ffprobe -i " + path + " -show_entries format=duration -v quiet -of json"
    pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout
    output = pipe.read()
    d = json.loads(output)
    s = d["format"]["duration"]
    return int(float(s))

# returns the id of a video in the ./data/videos dir
def video_id(path):
    return path.split("/")[3].split(".")[0]

def clip_dir_path(path):
    vid_id = video_id(path)
    return "./data/clips/" + vid_id

# creates a folder with one sec clips from the source video
# takes about 30 mins for a 20 min video
def create_clips(path):
    # create clip dir
    dir_path = clip_dir_path(path)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create one sec clips from src
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Clips for " + video_id(path)):
        clip_path = dir_path + "/" + '%05d' % i + ".mp4"    
        if not os.path.exists(clip_path):
            cmd = "ffmpeg -v error -y -i " + path + " -ss " + str(i) + " -t 1 " + clip_path
            os.system(cmd)

# creates folders with frames for each clip of source video
def create_frames(path):
    # create frame dir
    vid_id = video_id(path)
    dir_path = "./data/frames/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create frames from clip
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Frames for " + vid_id):
        clip_path = clip_dir_path(path) + "/" + '%05d' % i + ".mp4"
        frame_dir_path = dir_path + "/" + '%05d' % i
        if not os.path.exists(frame_dir_path):
            os.makedirs(frame_dir_path)
            cmd = "ffmpeg -v error -y -i " + clip_path + " -r 5.0 " + frame_dir_path + "/%5d.jpg"
            os.system(cmd)
            
            # resize frames to 299x299 for InceptionV3
            frame_paths = glob.glob(frame_dir_path + "/*.jpg")
            for fi in xrange(len(frame_paths)):
                path = frame_paths[fi]
                # resize first
                cmd = "convert " + path + " -resize 299x299 " + path
                os.system(cmd)
                # add black background
                cmd = "convert " + path + " -gravity center -background black -extent 299x299 " + path
                os.system(cmd)

def create_spectrograms(path):
    # create audio dir
    vid_id = video_id(path)
    dir_path = "./data/audio/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # create spectrogram from clip
    video_len = video_length(path)
    for i in tqdm_notebook(xrange(video_len), desc="Spectrograms for " + vid_id):
        clip_path = clip_dir_path(path) + "/" + '%05d' % i + ".mp4"
        spec_path = dir_path + "/" + '%05d' % i + ".png"
        if not os.path.exists(spec_path):
            cmd = "ffmpeg -v error -y -i " + clip_path + " -lavfi showspectrumpic=s=32x32:legend=false " + spec_path
            os.system(cmd)


extractor = Extractor()

def create_features(path):
    # create feature dir
    vid_id = video_id(path)
    dir_path = "./data/features/" + vid_id
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)    
    
    video_len = video_length(path)    
    with tqdm_notebook(total=video_len, desc="Features for " + vid_id) as pbar:
        for root, dirs, files in os.walk('./data/frames/'+ vid_id):
            for f in files:
                if f.endswith(".jpg"):
                    frame_path = root + "/" + f
                    feature_path = frame_path.replace("frames", "features").replace("jpg", "txt.gz")
                    feature_dir = root.replace("frames", "features")
                    features = extractor.extract(frame_path)
                    if not os.path.exists(feature_dir):
                        os.makedirs(feature_dir)
                    np.savetxt(feature_path, features)
            pbar.update(1)

video_paths = glob.glob("./data/videos/*.mp4")
videos_len = len(video_paths)
for i in tqdm_notebook(xrange(videos_len), desc="Preprocessing Videos"):
    path = video_paths[i]
    create_clips(path)
    create_frames(path)
    create_spectrograms(path)
    create_features(path)

In [1]:
import pandas as pd
import glob
import numpy as np

# read in and shuffle data
labels = pd.read_csv("./labelmaker/labels.csv").as_matrix()
print "Labels Shape: {}".format(labels.shape)
np.random.seed(0)
np.random.shuffle(labels)

# split labels into train, validation, and test sets
div = len(labels) / 5
train_labels = labels[0:div*3,:]
val_labels = labels[div*3:div*4,:]
test_labels = labels[div*4:,:]

print "Trainging Labels Shape: {}".format(train_labels.shape)
print "Validation Labels Shape: {}".format(val_labels.shape)
print "Test Labels Shape: {}".format(test_labels.shape)

# split labels into sample train, validation, and test sets
smpl_div = div / 10
smpl_train_labels = labels[0:smpl_div*3,:]
smpl_val_labels = labels[smpl_div*3:smpl_div*4,:]
smpl_test_labels = labels[smpl_div*4:smpl_div*5,:]

print "Sample Trainging Labels Shape: {}".format(smpl_train_labels.shape)
print "Sample Validation Labels Shape: {}".format(smpl_val_labels.shape)
print "Sample Test Labels Shape: {}".format(smpl_test_labels.shape)

Labels Shape: (641, 3)
Trainging Labels Shape: (384, 3)
Validation Labels Shape: (128, 3)
Test Labels Shape: (129, 3)
Sample Trainging Labels Shape: (36, 3)
Sample Validation Labels Shape: (12, 3)
Sample Test Labels Shape: (12, 3)


In [9]:
from keras.applications import InceptionV3
from keras.applications import Xception
from keras.preprocessing import image
from keras.models import Model
from keras.layers import concatenate
from keras.layers import Dense, GlobalAveragePooling2D, Input, Embedding, LSTM, Dropout
from keras import backend as K
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img

dropout = 0.5

f1_base = InceptionV3(weights='imagenet', include_top=False)
for layer in f1_base.layers:
    layer.trainable = False    
f1_x = f1_base.output
f1_x = GlobalAveragePooling2D()(f1_x)
f1_x = Dense(1024, activation='relu')(f1_x)
f1_x = Dropout(dropout)(f1_x)

# because of https://github.com/fchollet/keras/issues/7412
f2_base = Xception(weights='imagenet', include_top=False)
for layer in f2_base.layers:
    layer.trainable = False    
f2_x = f2_base.output
f2_x = GlobalAveragePooling2D()(f2_x)
f2_x = Dense(1024, activation='relu')(f2_x)
f2_x = Dropout(dropout)(f2_x)

lstm_input_shape = (2, 1024)

x = concatenate([f1_x, f2_x])
x = Reshape(lstm_input_shape)(x)
x.add(LSTM(512,return_sequences=True,dropout=0.5))
x.add(Flatten())
x.add(Dense(512, activation='relu'))
x.add(Dropout(0.5))
x.add(Dense(4, activation='softmax'))
        
# x = concatenate([f1_x, f2_x])
# x = Dense(1024, activation='relu')(x)
# x = Dropout(dropout)(x)
# x = Dense(1024, activation='relu')(x)
# x = Dropout(dropout)(x)
# x = Dense(4, activation='sigmoid', name="main_output")(x)
model = Model(inputs=[f1_base.input, f2_base.input],outputs=[x])
model.compile(loss="categorical_crossentropy", optimizer="sgd", metrics=['accuracy'])

print "Model Compiled"

NameError: name 'Reshape' is not defined

In [7]:
from keras.callbacks import TensorBoard
import time

preprocess = preprocess_input

def one_hot(i):
    a = np.array([int(i==0),int(i==1),int(i==2),int(i==3)])
    return a[None,:]

def generate_images(labels):
    while 1:
        for i in xrange(len(labels)):
            video_id = labels[i][0]
            clip_id = labels[i][1]
            label = labels[i][2]

            # image 1 
            img_1_path = "./data/frames/" + video_id + "/" + '%05d' % clip_id + "/00001.jpg"
            img_1 = load_img(img_1_path, target_size=(299,299))
            img_1 = img_to_array(img_1)
            img_1 = np.expand_dims(img_1, axis=0)
            img_1 = preprocess(img_1)

            # image 2 
            img_2_path = "./data/frames/" + video_id + "/" + '%05d' % clip_id + "/00007.jpg"
            img_2 = load_img(img_2_path, target_size=(299,299))
            img_2 = img_to_array(img_2)
            img_2 = np.expand_dims(img_2, axis=0)
            img_2 = preprocess(img_2)

            yield ([img_1, img_2], one_hot(label))

print "Fitting Model"
tensorboard = TensorBoard(log_dir='./logs', 
                          histogram_freq=0,
                          write_graph=True, 
                          write_images=True)

model.fit_generator(generate_images(train_labels),
                    100,
                    epochs=30,
                    verbose=2,
                    validation_data=generate_images(val_labels),
                    validation_steps=30,
                    callbacks=[tensorboard])

file_name = "shot_classifier_" + str(int(time.time())) + ".h5"
model.save(file_name)
print "Model Saved"

Fitting Model
Epoch 1/30
210s - loss: 1.4025 - acc: 0.2600 - val_loss: 1.2795 - val_acc: 0.4667
Epoch 2/30
191s - loss: 1.3820 - acc: 0.3000 - val_loss: 1.3243 - val_acc: 0.3667
Epoch 3/30
189s - loss: 1.3857 - acc: 0.3500 - val_loss: 1.3530 - val_acc: 0.3667
Epoch 4/30
189s - loss: 1.4180 - acc: 0.2700 - val_loss: 1.3432 - val_acc: 0.3667
Epoch 5/30
198s - loss: 1.3876 - acc: 0.3200 - val_loss: 1.3591 - val_acc: 0.3000
Epoch 6/30
195s - loss: 1.3825 - acc: 0.3400 - val_loss: 1.3694 - val_acc: 0.4000
Epoch 7/30
200s - loss: 1.3264 - acc: 0.2900 - val_loss: 1.3377 - val_acc: 0.3333
Epoch 8/30
195s - loss: 1.3844 - acc: 0.3300 - val_loss: 1.3801 - val_acc: 0.3333
Epoch 9/30
192s - loss: 1.3494 - acc: 0.3300 - val_loss: 1.3582 - val_acc: 0.3667
Epoch 10/30
193s - loss: 1.3724 - acc: 0.3400 - val_loss: 1.3342 - val_acc: 0.2667
Epoch 11/30
194s - loss: 1.3671 - acc: 0.2600 - val_loss: 1.3160 - val_acc: 0.3667
Epoch 12/30
192s - loss: 1.3875 - acc: 0.3500 - val_loss: 1.3645 - val_acc: 0.4000

KeyboardInterrupt: 

In [None]:
from keras.models import load_model

def predict_generate_images(labels):
    while 1:
        for i in xrange(len(labels)):
            video_id = labels[i][0]
            clip_id = labels[i][1]
            label = labels[i][2]

            # image 1 
            img_1_path = "./data/frames/" + video_id + "/" + '%05d' % clip_id + "/00001.jpg"
            img_1 = load_img(img_1_path, target_size=(299,299))
            img_1 = img_to_array(img_1)
            img_1 = np.expand_dims(img_1, axis=0)
            img_1 = preprocess(img_1)

            # image 2 
            img_2_path = "./data/frames/" + video_id + "/" + '%05d' % clip_id + "/00007.jpg"
            img_2 = load_img(img_2_path, target_size=(299,299))
            img_2 = img_to_array(img_2)
            img_2 = np.expand_dims(img_2, axis=0)
            img_2 = preprocess(img_2)

            yield [img_1, img_2]

model = load_model('shot_classifier.h5')
predictions = model.predict_generator(predict_generate_images(smpl_test_labels), 10)
for i in xrange(len(predictions)):
    p = predictions[i]
    print p

## Read in Labels

In [None]:
import pandas as pd
import glob

labels = pd.read_csv("./data/labels.csv").as_matrix()
print "Labels Shape: {}".format(labels.shape)

In [None]:
# write label to image mapping file
with open('data/audio_labels.txt', 'a') as f:
    for label in labels:
        vid_id = label[0]
        clip_id = label[1]
        value = 0 if label[2] == 0 else 1 
        img_path = "./data/audio/" + vid_id + "/" + '%05d' % clip_id + ".png"
        line = img_path + " " + '%d' % value + "\n"
        f.write(line)

## Create Audio Input

In [None]:
import numpy as np
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_utils import image_preloader

# Load path/class_id image file:
dataset_file = 'data/audio_labels.txt'
dataset_sample = 'data/audio_labels_sample.txt'

# Build the preloader array, resize images to 300x300
from tflearn.data_utils import image_preloader
X, Y = image_preloader(dataset_file, 
                       image_shape=(32, 32),
                       mode='file', 
                       categorical_labels=True,   
                       normalize=True)

# Real-time data preprocessing
img_prep = ImagePreprocessing()
img_prep.add_featurewise_zero_center()
img_prep.add_featurewise_stdnorm()

# Building convolutional network
network = input_data(shape=[None, 32, 32, 3],
                     data_preprocessing=img_prep)
network = conv_2d(network, 32, 3, activation='relu')
network = max_pool_2d(network, 2)
network = conv_2d(network, 64, 3, activation='relu')
network = conv_2d(network, 64, 3, activation='relu')
network = max_pool_2d(network, 2)
network = fully_connected(network, 512, activation='relu')
network = dropout(network, 0.5)
network = fully_connected(network, 2, activation='softmax')
network = regression(network,
                     optimizer='adam',
                     loss='categorical_crossentropy',
                     learning_rate=0.001)

# Training
model = tflearn.DNN(network,tensorboard_verbose=3)
model.fit(X, Y, 
          n_epoch=4,
          snapshot_step=5, 
          show_metric=True,
          validation_set=0.2,
          batch_size=50,
          shuffle=True,
          run_id="audio-1")

model.save("shot_audio.model")

## Prediction

In [None]:
X_test, Y_test = image_preloader(dataset_sample, 
                       image_shape=(32, 32),
                       mode='file', 
                       categorical_labels=True,   
                       normalize=True)

Y_predict = model.predict(X_test[:])

m = {}
for i in range(len(Y_predict)):
    pred_0, pred_1 = int(round(Y_predict[i][0])), int(round(Y_predict[i][1]))
    test_0, test_1 = int(round(Y_test[i][0])), int(round(Y_test[i][1]))
    key = "{}-{}-{}-{}".format(pred_0, pred_1, test_0, test_1)
    if key == '1-0-1-0':
        key = "tn"
    elif key == '0-1-0-1':
        key = "tp"
    elif key == '0-1-1-0':
        key = "fp"
    elif key == '1-0-0-1':
        key = "fn"
    if key in m:
        m[key] += 1
    else:
        m[key] = 0

print m