In [293]:
import os
import random
import numpy as np

# Set global variables to run notebook
data_root = 'C:/_data/' # set to whever the data is being stored

train_with_true_labels = False
if train_with_true_labels:
    feature_file = os.path.join(data_root, 'commercials\\data\\true_labels_images_features.txt')
else:
    feature_file = os.path.join(data_root, 'commercials\\data\\rekall_train_images_features.txt')

test_with_true_labels = True
test_file = os.path.join(data_root, 'commercials\\data\\true_labels_images_features.txt')
percent_for_test = 0.10 # used for splitting data when not testing with true labels

In [294]:
def load_data(filename):
    
    videos = []
    images = []
    y = []
    x = []
    
    with open(filename, 'r') as f:
        for line in f.readlines():
            video, image, label, data = line.split(',', 3)
            videos.append(int(video))
            images.append(int(image))
            y.append(int(label))
            x.append(eval(data))
    
    y = np.array(y).reshape(len(y), 1)
    x = np.array(x)
    
    return videos, images, y, x

In [295]:
# load training data
videos, images, y, x = load_data(feature_file)
print('Loaded %d data points' % len(images))

# shuffle data
data = list(zip(y, x))
random.shuffle(data)
y, x = zip(*data)
y_train = np.array(y).reshape(len(y), 1)
x_train = np.array(x)
print('y_train is shape ', y_train.shape)
print('x_train is shape ', x_train.shape)

Loaded 64130 data points
y_train is shape  (64130, 1)
x_train is shape  (64130, 8)


In [296]:
# set up the test data
if test_with_true_labels:
    videos, images, y, x = load_data(test_file) # load true labes for testing
    x_test = x #np.array(list(zip(avgs, stddevs)))
    y_test = y #np.array(labels).reshape(len(labels), 1)
else: # remove test from dataset
    split = int(len(y) * percent_for_test)
    x_train = np.array(x[split:])
    y_train = np.array(y[split:]).reshape(len(y)-split, 1)
    x_test = np.array(x[:split])
    y_test = np.array(y[:split]).reshape(split,1)

print('Loaded %d test points' % len(y))
print('y_test is shape ', y_test.shape)
print('x_test is shape ', x_test.shape)

Loaded 81105 test points
y_test is shape  (81105, 1)
x_test is shape  (81105, 8)


In [297]:
def compute_parameters(x_train, y_train):
    
    n = len(y_train)
    phi = np.sum(y_train) / n
    mu0 = np.sum(x_train * (1 - y_train), axis=0) / np.sum(1 - y_train)
    mu1 = np.sum(x_train * y_train, axis=0) / np.sum(y_train)

    d = mu0.shape[0]
    sigma0 = np.zeros((d, d))
    sigma1 = np.zeros((d, d))
    
    for i in range(n):
        if y_train[i]:
            vec = x_train[i] - mu1
            sigma1 += np.outer(vec, vec)
        else:
            vec = x_train[i] - mu0
            sigma0 += np.outer(vec, vec)
            
    sigma0 = sigma0 / np.sum(1 - y_train)
    sigma1 = sigma1 / np.sum(y_train)
    
#     print('phi = %f' % phi)
#     print('mu0 = %a' % mu0)
#     print('mu1 = %a' % mu1)
#     print('sigma0 = %a' % sigma0)
#     print('sigma1 = %a' % sigma1)
    
    return phi, mu0, mu1, sigma0, sigma1

In [298]:
def gaussian_prob(x, mu, inv_sigma, sqrt_det_sigma):
    return np.exp(-np.dot(x - mu, np.dot(inv_sigma, x - mu))/2) / ((2*np.pi)**(mu.shape[0]/2) * sqrt_det_sigma)

In [299]:
def make_predictions(x_test, y_test, phi, mu0, mu1, sigma0, sigma1):
    
    inv_sigma0 = np.linalg.inv(sigma0)
    sqrt_det_sigma0 = np.sqrt(np.linalg.det(sigma0))
    inv_sigma1 = np.linalg.inv(sigma1)
    sqrt_det_sigma1 = np.sqrt(np.linalg.det(sigma1))
    
    predictions = []
    for i in range(len(y_test)):
        x = x_test[i]
        neg = gaussian_prob(x, mu0, inv_sigma0, sqrt_det_sigma0) * (1 - phi)
        pos = gaussian_prob(x, mu1, inv_sigma1, sqrt_det_sigma1) * phi
        predictions.append(pos > neg)
        
    accuracy = np.sum((np.array(predictions).reshape(len(predictions), 1) == y_test) * 1) / len(predictions)
#     print('Accuracy = %f' % accuracy)
    
    return accuracy

In [300]:
phi, mu0, mu1, sigma0, sigma1 = compute_parameters(x_train[:10000], y_train[:10000])
accuracy = make_predictions(x_test, y_test, phi, mu0, mu1, sigma0, sigma1)
print('Accuracy = %f' % accuracy)

Accuracy = 0.820134


In [286]:
for i in range(n):
    if y_train[i]:
        vec = x_train[i] - mu1
        sigma1 += np.outer(vec, vec)
    else:
        vec = x_train[i] - mu0
        sigma0 += np.outer(vec, vec)

sigma1 = sigma1 / np.sum(y_train)
print(sigma1)
print(np.linalg.det(sigma1))

[[2914.4440629   231.74690785 2874.99449933 2948.66602712 2919.67166225
    50.42998947  102.52971024  125.31580463]
 [ 231.74690785  338.54544081  243.39278542  223.89529934  227.95263878
   144.13947183  155.5105204   154.40624829]
 [2874.99449933  243.39278542 3125.00644798 2856.06926321 2643.90778681
    68.44262588  120.04493603  131.6696048 ]
 [2948.66602712  223.89529934 2856.06926321 3064.06679731 2925.86202085
    41.86218696   92.46080776  114.83412728]
 [2919.67166225  227.95263878 2643.90778681 2925.86202085 3189.24517909
    40.98515557   95.08338692  129.4436818 ]
 [  50.42998947  144.13947183   68.44262588   41.86218696   40.98515557
   200.96985543  178.05912691  164.89280694]
 [ 102.52971024  155.5105204   120.04493603   92.46080776   95.08338692
   178.05912691  196.81199555  185.16343096]
 [ 125.31580463  154.40624829  131.6696048   114.83412728  129.4436818
   164.89280694  185.16343096  202.04480368]]
-622637.7771828806


In [301]:
# compute accuracy for increasing sizes of training data
step_size = len(y_train) // 10
for i in range(step_size, len(y_train), step_size):
    phi, mu0, mu1, sigma0, sigma1 = compute_parameters(x_train[0:i], y_train[0:i])
    accuracy = make_predictions(x_test, y_test, phi, mu0, mu1, sigma0, sigma1)
    sig1_det = np.linalg.det(sigma1)
    print('%d training examples = %f accuracy. Sigma1 determinant = %f.' % (i, accuracy, sig1_det))

  


6413 training examples = 0.713717 accuracy. Sigma1 determinant = -4745.813860.


  after removing the cwd from sys.path.


12826 training examples = 0.713717 accuracy. Sigma1 determinant = -164474.145573.
19239 training examples = 0.820455 accuracy. Sigma1 determinant = 229823.231452.
25652 training examples = 0.713717 accuracy. Sigma1 determinant = 478757.293005.
32065 training examples = 0.713717 accuracy. Sigma1 determinant = 360334.565072.
38478 training examples = 0.713717 accuracy. Sigma1 determinant = -217844.915893.
44891 training examples = 0.713717 accuracy. Sigma1 determinant = -49812.514589.
51304 training examples = 0.821392 accuracy. Sigma1 determinant = 16471.977659.
57717 training examples = 0.713717 accuracy. Sigma1 determinant = 293587.147777.
