# In [86]:
# -*- coding: utf-8 -*-
from collections import defaultdict
import numpy as np
from scipy.stats import norm
import json

class GNB(object):
    """Gaussian Naive Bayes classifier over a fixed set of behaviour labels.

    Each feature is modelled, per label, as an independent normal
    distribution whose mean and standard deviation are estimated from the
    training data. Prediction picks the label with the highest
    (unnormalised) log-posterior.
    """

    # Lower bound applied to per-feature standard deviations at prediction
    # time. A feature that is constant within a class has stddev 0, which
    # would make the normal density undefined (NaN).
    MIN_STDDEV = 1e-9

    def __init__(self):
        self.possible_labels = ['left', 'keep', 'right']

    def train(self, data, labels):
        """
        Trains the classifier with N data points and labels.

        INPUTS
        data - array of N observations
          - Each observation is a tuple with 4 values: s, d,
            s_dot and d_dot.
          - Example : [
                        [3.5, 0.1, 5.9, -0.02],
                        [8.0, -0.3, 3.0, 2.2],
                        ...
                ]

        labels - array of N labels
          - Each label is one of "left", "keep", or "right".

        After training, ``self.statistics[label]`` holds the per-feature
        'mean' and 'stddev' arrays and the sample count 'total' for every
        label in ``self.possible_labels``. A label that never occurs in
        ``labels`` gets 'total' == 0 and zero-filled placeholder arrays.

        A label outside ``self.possible_labels`` raises KeyError.
        """
        self.num_features = len(data[0])
        groups = {label: [] for label in self.possible_labels}
        for label, datum in zip(labels, data):
            groups[label].append(datum)

        self.statistics = {}
        # .items() (rather than the Python-2-only .iteritems()) keeps this
        # working on both Python 2 and Python 3.
        for label, group_data in groups.items():
            total = len(group_data)
            if total == 0:
                # Taking the mean/std of an empty array yields NaN; store
                # placeholders instead and let predict() skip this label.
                self.statistics[label] = {
                    'mean': np.zeros(self.num_features),
                    'stddev': np.zeros(self.num_features),
                    'total': 0,
                }
                continue
            samples = np.asarray(group_data, dtype=float)
            self.statistics[label] = {
                'mean': samples.mean(axis=0),
                'stddev': samples.std(axis=0),
                'total': total,
            }

    def predict(self, observation):
        """
        Once trained, this method is called and expected to return
        a predicted behavior for the given observation.

        INPUTS

        observation - a 4 tuple with s, d, s_dot, d_dot.
          - Example: [3.5, 0.1, 8.5, -0.2]

        OUTPUT

        A label representing the best guess of the classifier. Can
        be one of "left", "keep" or "right". Ties go to the label that
        comes first in ``self.possible_labels``.
        """
        # Only the first num_features entries are used; indexing one by one
        # raises IndexError for an observation that is too short.
        features = np.asarray(
            [observation[idx] for idx in range(self.num_features)],
            dtype=float)

        best_label = self.possible_labels[0]
        best_score = float('-inf')
        for label in self.possible_labels:
            stats = self.statistics[label]
            if stats['total'] == 0:
                # No training samples for this label: it cannot be predicted.
                continue
            scale = np.maximum(stats['stddev'], self.MIN_STDDEV)
            # Work in log space so that a product of small densities cannot
            # underflow to zero. log(total) differs from the log-prior only
            # by a constant shared by all labels, so the argmax is the same.
            score = np.log(stats['total']) + norm.logpdf(
                features, loc=stats['mean'], scale=scale).sum()
            if score > best_score:
                best_label = label
                best_score = score
        return best_label


# Fit the classifier on train.json, then report its accuracy on test.json.
# Both files are read as JSON objects with 'states' and 'labels' entries.
gnb = GNB()
with open('train.json', 'rb') as f:
    j = json.load(f)
# print(...) with a single argument behaves the same on Python 2 and 3;
# list() keeps the Python 2 output format for the keys.
print(list(j.keys()))
X = j['states']
Y = j['labels']
gnb.train(X, Y)

with open('test.json', 'rb') as f:
    j = json.load(f)

X = j['states']
Y = j['labels']
# Count the test observations whose predicted label matches the true label.
score = 0
for coords, label in zip(X, Y):
    predicted = gnb.predict(coords)
    if predicted == label:
        score += 1

# An empty test set is reported as 0 percent instead of dividing by zero.
fraction_correct = float(score) / len(X) if X else 0.0
print("You got {} percent correct".format(100 * fraction_correct))


# Recorded output of the cell above:
#   [u'states', u'labels']
#   You got 84.4 percent correct
