In [2]:
from IPython.core.display import *
import csv
import operator, random, math

In [3]:
data = []
names = ['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 
         'Superplasticizer', 'Coarse Aggregate', 'Fine Aggregate', 
         'Age', 'Concrete Compressive Strength']
with open('Concrete_Data.csv', 'r') as csvfile:
    datareader = csv.reader(csvfile, delimiter=',', quotechar='|')
    datareader.next()
    for row in datareader:
        inputs = [col for col in row]
        d = {}
        index = 0
        for name in names:
            d[name] = float(inputs[index])
            index += 1
        data.append(d)

In [4]:
def argmin(data, distance_func):
    min_example = data[0]
    min_distance = distance_func(min_example)
    for example in data:
        distance = distance_func(example)
        if distance < min_distance:
            min_example = example
            min_distance = distance
    return min_example

In [5]:
def mean(values):
    return sum(values)/float(len(values))

In [6]:
def distance(example1, example2):
    attributes = ['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 
                  'Superplasticizer', 'Coarse Aggregate', 'Fine Aggregate', 'Age']
    distance = 0
    for attr in attributes:
        distance += (example1[attr]-example2[attr])**2
    return distance**(0.5)

In [7]:
def estimate(k, data, example):
    if k == 1:
        neighbor = argmin(data, lambda e: distance(e, example))
        return neighbor['Concrete Compressive Strength']
    else:
        neighbors = []
        for e in data:
            d = distance(e, example)
            if (len(neighbors) < k):
                neighbors.append((d,e))
            elif d < neighbors[-1][0]:
                neighbors[-1] = (d,e)
                neighbors.sort()
        return mean([e['Concrete Compressive Strength'] for (d,e) in neighbors])

In [8]:
def knn(k, data, query):
    estimates = []
    for q in query:
        e = estimate(k, data, q)
        estimates.append(e)
    return estimates

In [9]:
def test_training(k, data, start, end, view=False):
    train_data = data[:start]+data[end:]
    test_data = data[start:end]
    regression = knn(k, train_data, test_data)
    n = len(test_data)
    error = 0.0
    for i in range(n):
        if view == True:
            print("%.2f     %.2f" %(regression[i],test_data[i]['Concrete Compressive Strength']))
        error += abs(test_data[i]['Concrete Compressive Strength'] - regression[i])
    return error/n

In [10]:
def cross_validation(k, data, k_folds=10, view=False):
    n = len(data)
    random.shuffle(data)
    return mean([test_training(k, data, i*(n/k_folds), (i+1)*(n/k_folds), view) for i in range(k_folds)])

In [11]:
def learning_curve(k, data, test_trials=10):
    test_sizes = range(2, len(data)-2, 300)
    random.shuffle(data)
    return [(size, mean([test_training(k, data, 0, size) for t in range(test_trials)])) for size in test_sizes]

In [12]:
best_k = 1
best_cv = cross_validation(1, data, 10, True)
print "cross validation when k = 1 is", best_cv

47.13     50.77
72.30     74.19
25.72     37.27
51.06     60.95
39.46     39.45
45.30     45.30
48.67     48.15
23.40     31.97
46.68     46.68
24.40     24.39
15.75     33.95
15.52     32.90
33.36     33.94
40.87     40.87
40.57     40.39
41.24     38.89
35.34     38.56
25.57     11.65
42.64     42.64
76.80     72.99
31.03     28.94
15.07     21.97
39.40     29.16
45.94     40.15
25.57     14.60
41.24     36.15
48.97     52.04
41.93     43.70
18.28     18.29
21.91     36.99
56.70     60.20
35.76     37.91
43.89     56.83
31.02     25.46
52.12     53.10
26.74     34.49
23.08     33.01
14.94     19.42
10.39     4.83
33.80     33.80
25.48     13.62
32.24     32.25
12.84     18.42
39.30     32.88
22.53     27.34
54.90     66.90
23.79     23.79
61.89     79.99
36.59     33.19
49.20     49.20
25.48     10.76
36.30     46.80
40.66     40.86
46.24     46.25
19.54     9.56
45.37     33.54
7.32     21.50
15.34     26.05
17.95     17.96
26.40     28.47
32.96     32.96
30.45     47.71
61.07     5

In [13]:
for k in range(2, 11):
    cv = cross_validation(k, data, 10, False)#10 folds
    print cv
    if (cv < best_cv):
        best_cv = cv
        best_k = k
print "best k for knn is", best_k

6.25699029126
6.48251132686
6.57112378641
6.8125184466
6.94601294498
6.97102912621
7.10104975728
7.15385329018
7.32757961165
best k for knn is 1


In [14]:
#20 folds
print "with 20 folds ", cross_validation(1, data, 20, False)
#30 folds
print "with 30 folds ", cross_validation(1, data, 30, False)

with 20 folds  5.84853921569
with 30 folds  5.75223529412


- Splitting data will help reducing the error rate.

In [15]:
lc = learning_curve(1, data, 10)
print lc

[(2, 15.979999999999999), (302, 6.726192052980137), (602, 7.700448504983389), (902, 10.26703991130819)]
