# Module 12 - Programming Assignment

In [1]:
import csv
import random

## Naive Bayes Classifier

In this assignment you will be using the mushroom data from the Decision Tree module:

http://archive.ics.uci.edu/ml/datasets/Mushroom

The assignment is to write a program that will learn and apply a Naive Bayes Classifier for this problem. You'll first need to calculate all of the necessary probabilities (don't forget to use +1 smoothing) using a `learn` function. You'll then need to have a `classify` function that takes your probabilities, a List of instances (possibly a list of 1) and returns a List of Tuples. Each Tuple is a class and the *normalized* probability of that class. The List should be sorted so that the probabilities are in descending order. For example,

```
[("e", 0.98), ("p", 0.02)]
```

When calculating the error rate of your classifier, you should pick the class with the highest probability (the first one in the list).

As a reminder, the Naive Bayes Classifier generates the un-normalized probabilities from the numerator of Bayes Rule:

$$P(C|A) \propto P(A|C)P(C)$$

where C is the class and A are the attributes (data). Since the normalizer of Bayes Rule is the *sum* of all possible numerators and you have to calculate them all anyway, the normalizer is just the sum of the un-normalized probabilities over all classes.

You'll also need an `evaluate` function as before. You should use the $error\_rate$ again.

Use the same testing procedure as last time, on two randomized subsets of the data:

1. learn the probabilities for set 1
2. classify set 2
3. evaluate the predictions
4. learn the probabilities for set 2
5. classify set 1
6. evaluate the predictions
7. average the classification error.

-----

&nbsp;

**x**

x

In [2]:
def read_csv(file_name):
    """Read a CSV file and return its rows as a list of lists of strings.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is the list of
        field values of that record.
    """
    import sys  # local import: the notebook header only imports csv and random

    # The Python 2 csv module expects a binary-mode file, whereas the
    # Python 3 csv module expects a text-mode file opened with newline=''.
    # Choosing the mode by interpreter keeps the reader working on both.
    if sys.version_info[0] < 3:
        open_kwargs = {'mode': 'rb'}
    else:
        open_kwargs = {'mode': 'r', 'newline': ''}

    with open(file_name, **open_kwargs) as f:
        return list(csv.reader(f))

&nbsp;

**x**

x

In [3]:
def create_train_test_sets(data):
    """Shuffle the rows and split them into two halves.

    Note that ``data`` is shuffled in place, so the caller's list is
    reordered as a side effect.

    Args:
        data: List of rows to split.

    Returns:
        A ``(train_set, test_set)`` tuple. ``test_set`` holds the first
        ``len(data) // 2`` shuffled rows and ``train_set`` the remainder,
        so the training half gets the extra row when the length is odd.
    """
    random.shuffle(data)
    # Floor division keeps the split point an integer regardless of whether
    # "/" performs integer or true division; slice bounds must be integers.
    split_point = len(data) // 2
    test_set = data[:split_point]
    train_set = data[split_point:]

    return train_set, test_set

&nbsp;

**x**

x

In [4]:
def get_class_label_counts(data):
    """Count the rows of each class, starting every tally at 1.0.

    The class label is read from the first field of each row. Rows whose
    label is neither 'p' nor 'e' are ignored.

    Returns:
        A ``(p_count, e_count)`` tuple of floats, each equal to the number
        of matching rows plus one.
    """
    tallies = {'p': 1.0, 'e': 1.0}
    for record in data:
        label = record[0]
        if label in tallies:
            tallies[label] += 1.0

    return tallies['p'], tallies['e']

&nbsp;

**x**

x

In [5]:
def get_probability_counts(data):
    """Count, per attribute column and value, how often each class occurs.

    Column 0 of every row is the class label ('p' or 'e'); columns 1..n are
    attributes. Every counter starts at 1.0 (+1 smoothing) and is then
    incremented once for each row that has that value and that class,
    including the row in which the value is seen for the first time.

    Args:
        data: List of rows, class label first.

    Returns:
        A dict ``{column_index: {attribute_value: {'p': count, 'e': count}}}``
        with float counts. An empty ``data`` yields an empty dict.
    """
    probability_counts = {}
    if not data:
        return probability_counts

    # Create a table for every attribute column of the first row up front so
    # each column has an entry even before any value is recorded for it.
    for attribute_index in range(1, len(data[0])):
        probability_counts[attribute_index] = {}

    for row in data:
        if not row:
            continue  # e.g. a blank line in the CSV file; nothing to count
        label = row[0]
        for attribute_index in range(1, len(row)):
            value_counts = probability_counts.setdefault(attribute_index, {})
            counts = value_counts.setdefault(
                row[attribute_index], {'p': 1.0, 'e': 1.0})  # +1 smoothing
            # Count the current row too, even when the value is new; rows
            # with an unknown label only register the value.
            if label in counts:
                counts[label] += 1.0

    return probability_counts

&nbsp;

**x**

x

In [6]:
def get_class_probabilities(p_count, e_count):
    """Turn the two class tallies into per-class probability and count.

    One is subtracted from each tally before use; the probability of a
    class is its reduced tally divided by the sum of both reduced tallies.

    Returns:
        ``{'p': {'probability': ..., 'count': ...},
           'e': {'probability': ..., 'count': ...}}``
    """
    raw_counts = {'p': p_count - 1.0, 'e': e_count - 1.0}
    observed = raw_counts['p'] + raw_counts['e']

    summary = {}
    for label in ('p', 'e'):
        summary[label] = {
            'probability': raw_counts[label] / observed,
            'count': raw_counts[label],
        }

    return summary

&nbsp;

**x**

x

In [7]:
def calculate_probability(probabilities, instance, label, class_probabilities):
    """Return the un-normalized Naive Bayes score of ``label`` for ``instance``.

    The score is the product of the per-attribute conditional probabilities
    of the instance's values given ``label``, multiplied by the class
    probability of ``label``.

    Args:
        probabilities: ``{column_index: {value: {'p': prob, 'e': prob}}}``.
        instance: A row whose first field is the class label (skipped) and
            whose remaining fields are attribute values.
        label: The class to score, 'p' or 'e'.
        class_probabilities: ``{label: {'probability': ..., 'count': ...}}``
            where 'count' is the number of rows of that class.

    Returns:
        The un-normalized probability as a float.
    """
    # A value with no recorded probability gets the same estimate a value
    # with zero observations would get: the +1 smoothing numerator over
    # (class count + 1). Adding one to the denominator also avoids a
    # division by zero for a class that has no rows at all.
    unseen_likelihood = 1.0 / (class_probabilities[label]['count'] + 1.0)

    probability = 1.0
    for attribute_index in range(1, len(instance)):  # index 0 is the class label
        attribute_value = instance[attribute_index]
        value_table = probabilities.get(attribute_index, {})

        if attribute_value in value_table:
            probability *= value_table[attribute_value][label]
        else:
            probability *= unseen_likelihood

    probability *= class_probabilities[label]['probability']

    return probability


&nbsp;

**x**

x

In [8]:
def normalize(results):
    """Scale the 'p' and 'e' entries of ``results`` so that they sum to one.

    The dict is modified in place and also returned.
    """
    total = results['p'] + results['e']

    for label in ('p', 'e'):
        results[label] = results[label] / total

    return results

&nbsp;

**x**

x

In [9]:
def classify_instance(probabilities, instance, class_probabilities):
    """Score one instance for both classes and rank the classes.

    Returns:
        A two-element list of ``(label, normalized_probability)`` tuples
        with the more probable class first; on a tie 'e' comes first.
    """
    scores = normalize({
        label: calculate_probability(probabilities, instance, label, class_probabilities)
        for label in ('p', 'e')
    })

    poisonous = ('p', scores['p'])
    edible = ('e', scores['e'])

    return [poisonous, edible] if scores['p'] > scores['e'] else [edible, poisonous]
    

&nbsp;

**x**

x

&nbsp;

**x**

x

&nbsp;

**x**

x

&nbsp;

**x**

x

&nbsp;

**x**

x

&nbsp;

**x**

x

---

## Main Functions

&nbsp;

**x**

x

In [10]:
def learn(data):
    """Learn the per-class conditional probabilities of every attribute value.

    The smoothed value counts from ``get_probability_counts`` are divided by
    the matching class tally from ``get_class_label_counts``.

    Returns:
        ``{column_index: {attribute_value: {'p': prob, 'e': prob}}}``
    """
    likelihoods = get_probability_counts(data)
    class_totals = dict(zip(('p', 'e'), get_class_label_counts(data)))

    for value_table in likelihoods.values():
        for per_class in value_table.values():
            for label in ('p', 'e'):
                per_class[label] = per_class[label] / class_totals[label]

    return likelihoods

&nbsp;

**x**

x

In [11]:
#Returns one ranking per instance: each ranking is a list of tuples holding a
#class and the normalized probability of that class, sorted in descending order
#[("e", 0.98), ("p", 0.02)]
def classify(probabilities, instances, training_data=None):
    """Classify every instance with the learned probabilities.

    Args:
        probabilities: The structure returned by ``learn``.
        instances: List of rows to classify (class label in column 0).
        training_data: Optional rows from which the class probabilities are
            estimated; pass the same rows that were given to ``learn`` so
            that the rows being classified do not influence the class
            probabilities. When omitted, the module-level ``data`` variable
            is used, which keeps existing two-argument calls working.

    Returns:
        A list with one entry per instance; each entry is the ranked list of
        ``(label, normalized_probability)`` tuples from ``classify_instance``.
    """
    if training_data is None:
        # Legacy behaviour: fall back to the global data set.
        training_data = data

    p_count, e_count = get_class_label_counts(training_data)
    class_probabilities = get_class_probabilities(p_count, e_count)

    return [
        classify_instance(probabilities, instance, class_probabilities)
        for instance in instances
    ]

&nbsp;

**x**

x

In [12]:
#Uses the error rate
def evaluate(data, classifications):
    """Return the fraction of rows whose top-ranked class is wrong.

    The prediction for row ``i`` is the label of the first tuple in
    ``classifications[i]``; the true label is the first field of the row.
    """
    misses = 0.0
    for index, row in enumerate(data):
        predicted_label = classifications[index][0][0]
        if predicted_label != row[0]:
            misses += 1

    return misses / len(data)

-----

Put your main function calls here.

In [13]:
# Load every row of the mushroom data file into a list.
data = read_csv('agaricus-lepiota.data')

In [14]:
# Shuffle the rows (in place) and split them into two halves.
set1, set2 = create_train_test_sets(data)

In [15]:
# Round 1: learn the conditional probabilities from set 1.
probabilities = learn(set1)

In [16]:
# Round 1: classify set 2 with the probabilities learned from set 1.
# NOTE(review): classify() takes its class probabilities from the global `data`.
classifications = classify(probabilities, set2) 

In [17]:
# Round 1: error rate of the set-1 model on set 2.
error_rate1 = evaluate(set2, classifications)
print 'error rate 1: ', error_rate1

error rate 1:  0.0544066962088


In [18]:
# Round 2: swap the roles and learn the conditional probabilities from set 2.
probabilities = learn(set2)

In [19]:
# Round 2: classify set 1 with the probabilities learned from set 2.
# NOTE(review): classify() takes its class probabilities from the global `data`.
classifications = classify(probabilities, set1) 

In [20]:
# Round 2: error rate of the set-2 model on set 1.
error_rate2 = evaluate(set1, classifications)

In [21]:
# Report the round-2 error rate.
print 'error rate 2: ', error_rate2

error rate 2:  0.0519448547514


In [22]:
# Average classification error over the two rounds.
print (error_rate1 + error_rate2) / 2

0.0531757754801
