In [177]:
import os
import random
import numpy as np

# Set global variables to run notebook
data_root = 'C:/_data/'  # set to wherever the data is being stored
use_true_labels = True  # selects which of the two feature files below is loaded

# Pass each path component separately so os.path.join inserts the separator
# of the current platform; hard-coded backslashes only form a valid path on
# Windows.
if use_true_labels:
    feature_file = os.path.join(data_root, 'commercials', 'data', 'true_labels_images_features.txt')
else:
    feature_file = os.path.join(data_root, 'commercials', 'data', 'rekall_train_images_features.txt')

percent_for_test = 0.10  # fraction of the data points held out as the test set

In [178]:
def load_data(filename):
    """Read a comma-separated feature file into five parallel lists.

    Every non-blank line must contain exactly five comma-separated fields,
    in the order ``video, image, label, avg, stddev``. The first three are
    parsed as ``int`` and the last two as ``float``. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(videos, images, labels, avgs, stddevs)`` of lists that all
        have one entry per non-blank line of the file.

    Raises:
        ValueError: If a non-blank line does not have exactly five fields or
            a field cannot be converted to the expected numeric type.
    """
    videos, images, labels, avgs, stddevs = [], [], [], [], []

    with open(filename, 'r') as f:
        # Iterate the file object directly so lines are streamed instead of
        # being loaded into memory all at once by readlines().
        for line in f:
            if not line.strip():
                # A blank line (e.g. an extra newline at the end of the
                # file) would otherwise break the five-way unpack below.
                continue
            video, image, label, avg, stddev = line.split(',')
            videos.append(int(video))
            images.append(int(image))
            labels.append(int(label))
            avgs.append(float(avg))
            stddevs.append(float(stddev))

    return videos, images, labels, avgs, stddevs

In [179]:
# Parse the feature file once, then unpack the five parallel columns.
loaded_columns = load_data(feature_file)
videos, images, labels, avgs, stddevs = loaded_columns
print('Loaded %d data points' % len(images))

Loaded 81105 data points


In [180]:
# shuffle data
# A dedicated, seeded generator makes the shuffle (and therefore the split
# and every number computed from it) identical on each run, without touching
# the state of the global `random` module.
shuffle_seed = 0
data = list(zip(avgs, stddevs, labels))
random.Random(shuffle_seed).shuffle(data)
x1, x2, y = zip(*data)
x = list(zip(x1, x2))  # one (avg, stddev) feature pair per data point

# remove test from dataset
# The first `split` shuffled points form the test set, the rest the training
# set; labels are reshaped to column vectors so they broadcast against the
# two-column feature arrays.
split = int(len(y) * percent_for_test)
x_test = np.array(x[:split])
y_test = np.array(y[:split]).reshape(split,1)
x_train = np.array(x[split:])
y_train = np.array(y[split:]).reshape(len(y)-split,1)

In [181]:
# compute parameters
# The expressions below use y_train as the class-1 indicator and
# (1 - y_train) as the class-0 indicator, i.e. labels are assumed to be 0/1.
n = len(y_train)
phi = np.sum(y_train) / n  # share of training points labelled 1
print('phi = %f' % phi)
# Per-class feature means: mask the rows of the other class to zero, sum the
# columns, and divide by the number of rows in the class.
mu0 = np.sum(x_train * (1 - y_train), axis=0) / np.sum(1 - y_train)
print('mu0 = %a' % mu0)
mu1 = np.sum(x_train * y_train, axis=0) / np.sum(y_train)
print('mu1 = %a' % mu1)

d = mu0.shape[0]
# Shared covariance: subtract from every row the mean of its own class, then
# average the outer products of the centered rows. The matrix product
# centered.T @ centered is exactly the sum of those outer products, computed
# in one vectorized call instead of a Python loop over all n rows.
centered = x_train - (mu0 * (1 - y_train) + mu1 * y_train)
sigma = np.dot(centered.T, centered) / n
print('sigma = %a' % sigma)
# Precompute the quantities the Gaussian density needs for every evaluation.
inv_sigma = np.linalg.inv(sigma)
sqrt_det_sigma = np.sqrt(np.linalg.det(sigma))

phi = 0.285609
mu0 = array([93.07238261, 66.18311846])
mu1 = array([108.79154688,  55.87230749])
sigma = array([[1384.76182814,  142.72332485],
       [ 142.72332485,  167.5220883 ]])


In [182]:
def gaussian_prob(x, mu):
    """Evaluate a multivariate Gaussian density at ``x`` for mean ``mu``.

    The covariance is not a parameter: the function reads the module-level
    ``inv_sigma`` (inverse covariance), ``sqrt_det_sigma`` (square root of
    the covariance determinant) and ``d`` (dimension), so those must be
    defined before it is called.

    Args:
        x: Point at which to evaluate the density.
        mu: Mean vector of the Gaussian.

    Returns:
        The density value at ``x``.
    """
    offset = x - mu
    # Quadratic form (x - mu)^T inv_sigma (x - mu).
    quad_form = np.dot(offset, np.dot(inv_sigma, offset))
    normalizer = (2 * np.pi) ** (d / 2) * sqrt_det_sigma
    return np.exp(-quad_form / 2) / normalizer

In [183]:
# Classify each test point by comparing the two class scores
# (class density times class prior); True means class 1 scored higher.
predictions = []
# Iterate the rows directly under a fresh name: the original loop variable
# `x` overwrote the shuffled feature list of the same name.
for sample in x_test:
    neg = gaussian_prob(sample, mu0) * (1 - phi)
    pos = gaussian_prob(sample, mu1) * phi
    predictions.append(pos > neg)

In [184]:
# Shape the predictions as a column so they compare element-wise with
# y_test, count the matches, and divide by the number of predictions.
predicted_column = np.array(predictions).reshape(len(predictions), 1)
n_correct = np.sum((predicted_column == y_test) * 1)
accuracy = n_correct / len(predictions)
print('Accuracy = %f' % accuracy)

Accuracy = 0.790506
