# Classifiers

In [None]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Read 'breast-cancer.csv' into a table and discard its 'ID' column.
# The bare name on the last line displays the table as the cell's output.
patients = Table.read_table('breast-cancer.csv').drop('ID')
patients

**Row Operations**

- `t.row(i)`: evaluates to the `i`th row of table `t`
- `t.row(i).item(j)`: evaluates to the value in row `i`, column `j`
- `np.array(t.row(i))`: evaluates to an array of all numbers in the row (if all the values are numerical).

In [None]:
# Row Operations

In [None]:
# Scatter plot of 'Single Epithelial Cell Size' against 'Bland Chromatin', with points grouped by 'Class'.
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', group = 'Class')

In [None]:
def randomize_column(a):
    """Return `a` with independent Gaussian noise (mean 0.0, SD 0.09) added to every entry."""
    noise = np.random.normal(0.0, 0.09, size=len(a))
    return a + noise

# Three-column table: noisy copies of the two plotted attributes plus the
# unchanged 'Class' labels.
# NOTE(review): the second label is spelled '(jitterd)'; the scatter call below
# must use exactly the same spelling.
jittered = Table().with_columns(
    'Bland Chromatin (jittered)',
    randomize_column(patients.column('Bland Chromatin')),
    'Single Epithelial Cell Size (jitterd)',
    randomize_column(patients.column('Single Epithelial Cell Size')),
    'Class',
    patients.column('Class'),
)

jittered.scatter(
    'Bland Chromatin (jittered)',
    'Single Epithelial Cell Size (jitterd)',
    group='Class',
)

# Distance

In [None]:
# Plot the three example points (0, 0), (2, 2) and (3, 4).
Table().with_columns(
    'x', [0, 2, 3],
    'y', [0, 2, 4],
).scatter('x', 'y')

**Question 1.** Write a function to calculate the distance between two points $(x_1,y_1)$ and $(x_2,y_2)$.

In [None]:
# Question 1

**Question 2.** Write a function to calculate the distance between two rows.

In [None]:
# Question 2

In [None]:
# Keep every column of `patients` except the 'Class' label.
attributes = patients.drop('Class')
attributes.show(3)  # display the first 3 rows

**Question 3.** Find the distance between the patient in row 0 and the patients in row 1 and in row 2. 

In [None]:
 # Question 3

# Classification Procedure

In [None]:
def distances(training, example):
    """Compute a table with the training set and distances to the example for each row in the training set.

    training: table with a 'Class' column; that column is dropped before the
        distances are computed, and every remaining column is passed to
        row_distance as part of the row.
    example: the row (or array) of attribute values each training row is
        compared against.

    Returns a copy of `training` with an extra 'Distance' column holding
    row_distance(row, example) for each row, in the table's row order.
    row_distance is expected to return a single number per row.
    """
    features = training.drop('Class')
    # Collect every distance first and build the array once: np.append copies
    # the whole array on each call, so appending inside the loop is quadratic.
    # dtype=float keeps the column a float array even when the table is empty.
    dists = np.array(
        [row_distance(row, example) for row in features.rows],
        dtype=float,
    )
    return training.with_column('Distance', dists)

In [None]:
def closest(training, example, k):
    """Return a table holding the k training rows with the smallest 'Distance' to the example."""
    with_distances = distances(training, example)
    nearest_first = with_distances.sort('Distance')
    return nearest_first.take(np.arange(k))

In [None]:
# Show row 12 of `patients`, including its 'Class' label.
patients.take(12)

In [None]:
# Row 12 of `patients` with the 'Class' column removed: the attribute values
# of the patient to be classified.
example = patients.drop('Class').row(12)
example

**Question 4.** Find the 5 closest patients to the patient in row 12. 

In [None]:
# Question 4

**Question 5.** What do you notice about the output? Are these the 5 closest patients to patient 12? 

In [None]:
# Question 5

# Classify

**Question 6.** Let's classify a patient by the $k$ nearest neighbors.

In [None]:
# Question 6

In [None]:
def majority_class(neighbors):
    """Return the 'Class' value that occurs most often in the neighbors table."""
    class_counts = neighbors.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    return most_common_first.column('Class').item(0)

In [None]:
def classify(training, example, k):
    """Predict the example's class as the most common class among its k closest training rows."""
    return majority_class(closest(training, example, k))

In [None]:
# Classify `example` from its 5 nearest neighbors, using `patients` with
# row 12 excluded as the training table.
classify(patients.exclude(12), example, 5)

# Evaluation

In [None]:
# Number of rows in the `patients` table.
patients.num_rows

In [None]:
# Shuffle all rows (sampling without replacement), then split them into a
# training half and a test half. The split point is derived from the table's
# size rather than hard-coded, so every row lands in exactly one of the two
# sets whatever the row count is; with 683 rows this is rows 0-341 for
# training and rows 342-682 for testing.
shuffled = patients.sample(with_replacement=False)
num_train = (shuffled.num_rows + 1) // 2  # training set gets the extra row when the count is odd
train_set = shuffled.take(range(0, num_train))
test_set = shuffled.take(range(num_train, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of rows in `test` that `classify` labels correctly.

    training: table passed to classify as the training set.
    test: table with a 'Class' column; each row's other columns are classified
        and the prediction is compared with that row's 'Class' value.
    k: number of nearest neighbors passed to classify.

    Raises ZeroDivisionError when `test` has no rows.
    """
    features = test.drop('Class')
    true_classes = test.column('Class')
    # Count the rows whose predicted class matches the recorded class.
    hits = sum(
        1
        for i in range(test.num_rows)
        if classify(training, features.row(i), k) == true_classes.item(i)
    )
    return hits / test.num_rows

In [None]:
# Proportion of test_set rows classified correctly by 5 nearest neighbors trained on train_set.
evaluate_accuracy(train_set, test_set, 5)

# Decision Boundaries

In [None]:
# Read 'ckd.csv' and shorten the 'Blood Glucose Random' column label to 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd

In [None]:
# Keep two attributes plus the label, plot them grouped by 'Class', and
# overlay a red marker at Hemoglobin = 13, Glucose = 250.
kidney = ckd.select('Hemoglobin', 'Glucose', 'Class')
kidney.scatter('Hemoglobin', 'Glucose', group = 'Class')
plots.scatter(13, 250, color = 'r', s = 30);

In [None]:
def show_closest(t, point):
    """Show the closest training example to a point.

    Plots the first two columns of `t` grouped by 'Class', marks `point` in
    red, and draws a black line from `point` to its single nearest row in `t`.
    """
    point_x, point_y = point.item(0), point.item(1)
    nearest_row = closest(t, point, 1).row(0)
    near_x, near_y = nearest_row.item(0), nearest_row.item(1)
    t.scatter(0, 1, group='Class')
    plots.scatter(point_x, point_y, color='r', s=30)
    plots.plot([point_x, near_x], [point_y, near_y], color='k', lw=2)
    
# Nearest row of `kidney` to the point (13, 250), measured in the table's original units.
show_closest(kidney, make_array(13, 250))

In [None]:
def standard_units(any_numbers):
    """Convert an array of numbers to standard units: subtract the mean, then divide by the SD."""
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def standardize(t):
    """Return a new table with every column of t in standard units; each label gains the suffix ' (su)'."""
    converted = Table()
    for name in t.labels:
        su_values = standard_units(t.column(name))
        converted = converted.with_column(name + ' (su)', su_values)
    return converted

In [None]:
# Standardize the two kidney attributes, re-attach the unchanged 'Class'
# column, then show the nearest row to the point (-0.2, 1.8) in standard units.
kidney_su = standardize(kidney.drop('Class')).with_column('Class', kidney.column('Class'))
show_closest(kidney_su, make_array(-0.2, 1.8))

In [None]:
# Nearest row of `kidney_su` to the point (-0.2, 1.3), in standard units.
show_closest(kidney_su, make_array(-0.2, 1.3))

In [None]:
# Nearest row of `kidney_su` to the point (-0.2, 1), in standard units.
show_closest(kidney_su, make_array(-0.2, 1))

In [None]:
# Nearest row of `kidney_su` to the point (-0.2, 0.9), in standard units.
show_closest(kidney_su, make_array(-0.2, 0.9))

In [None]:
def decision_boundary(t, k):
    """Plot the k-nearest-neighbor decision regions of a two-column + Class table.

    The attribute columns of `t` are converted to standard units, every point
    of a grid over [-2, 2] x [-2, 2] (step 0.1) is classified with `classify`,
    and the grid predictions are drawn as translucent points grouped by class.
    The standardized rows of `t` with Class 0 and Class 1 are then drawn on
    top in dark blue and gold respectively.
    """
    standardized = standardize(t.drop('Class')).with_column('Class', t.column('Class'))

    # Classify each grid point and record (x, y, predicted class).
    grid = np.arange(-2, 2.1, 0.1)
    decisions = Table(standardized.labels)
    for x in grid:
        for y in grid:
            decisions.append([x, y, classify(standardized, make_array(x, y), k)])

    decisions.scatter(0, 1, group='Class', alpha=0.4)
    plots.xlim(-2, 2)
    plots.ylim(-2, 2)

    # Overlay the training points, one colour per class.
    for label, colour in ((0, 'darkblue'), (1, 'gold')):
        members = standardized.where('Class', label)
        plots.scatter(members.column(0), members.column(1), c=colour, edgecolor='k')

In [None]:
# Decision regions for the `kidney` table with k = 1.
decision_boundary(kidney, 1)

In [None]:
# Decision regions for the `kidney` table with k = 5.
decision_boundary(kidney, 5)

In [None]:
# Decision regions for the `jittered` table with k = 1.
decision_boundary(jittered, 1)

In [None]:
# Decision regions for the `jittered` table with k = 5.
decision_boundary(jittered, 5)