In [8]:
import numpy as np

# Column indices into one row of `metadata` (the string table loaded from
# HT_Sensor_metadata.dat).

# Induction id; compared against column 0 of each `dataset` row.
MetaID = 0
# NOTE(review): not referenced in the visible code; presumably the recording
# date column -- confirm against the metadata file header.
MetaDATA = 1
# Class label string; 'banana' and 'wine' are handled explicitly, any other
# value is treated as background.
MetaLABEL = 2
# NOTE(review): not referenced in the visible code; presumably the induction
# start time -- confirm against the metadata file header.
MetaSTART = 3
# Induction duration; compared against the time column of each `dataset` row.
MetaDURATION = 4

# Things to test:
#   Can we predict without initial background data?
#   How quickly into an induction can we predict?

# Additional features to include:
#   Diff (between current value and value before banana added)
#   Quadratic (every combination, 121 new features)

print("Loading data...")
# Induction metadata, kept as strings (header row skipped).
metadata = np.loadtxt('HT_Sensor_metadata.dat', dtype=str, skiprows=1)
# Numeric sensor readings (header row skipped).
# NOTE(review): absolute, machine-specific path, unlike the relative metadata
# path above -- consider a shared, configurable data directory.
dataset = np.loadtxt('/mnt/ExtDrive/sensor data/HT_Sensor_dataset.dat', skiprows=1)
print("Preprocessing data...")

# Per-class sample lists built from the raw readings only (no "difference
# features"); the preprocessing loop appends sqr_features(entry[1:]) here.
banana_data1, wine_data1, background_data1 = [], [], []

# Per-class sample lists that additionally carry the "difference features"
# (current reading minus the induction's background mean) before the
# quadratic expansion.
banana_data2, wine_data2, background_data2 = [], [], []


def sqr_features(features):
    """Append all pairwise products to a 1-D feature vector.

    Parameters
    ----------
    features : array-like of length n
        The base feature values.

    Returns
    -------
    numpy.ndarray of length n + n*n
        The first n entries are the original features. Entry
        ``n + i*n + j`` holds ``features[i] * features[j]`` (row-major order
        of the outer product), so both ``i*j`` and ``j*i`` are present.
    """
    base = np.asarray(features)
    # np.outer builds the full n x n product table in one vectorised call;
    # ravel() flattens it row by row, matching the i*n + j layout above.
    return np.concatenate((base, np.outer(base, base).ravel()))

def sqr_features_str(features):
    """Extend a list of feature names, in place, with all pairwise product names.

    For n base names the list grows to n + n*n entries: the original names
    followed by ``names[i] + "*" + names[j]`` for every ordered pair (i, j),
    with i as the slow index. This is the same layout sqr_features uses for
    the values.

    Parameters
    ----------
    features : list of str
        Base feature names; modified in place.

    Returns
    -------
    None
    """
    # Snapshot the base names first: iterating over (or re-measuring) the list
    # while appending to it would also pair up the freshly added product names.
    base = list(features)
    features.extend(left + "*" + right for left in base for right in base)

# Names of the eleven base columns: time, the eight R sensors, temperature
# and humidity.
feature_names1 = ["time"] + ["R" + str(k) for k in range(1, 9)] + ["temp", "h"]
# Same names followed by a primed copy of each one (presumably labelling the
# "difference" counterpart of every base column).
feature_names2 = feature_names1 + [name + "'" for name in feature_names1]
# Extend both lists in place with the product-feature names.
sqr_features_str(feature_names1)
sqr_features_str(feature_names2)


# Sampling state for a single pass over `dataset`, visited in file order.
segment_len = 1 / 60  # one minute increments
next_segment = 0
cur_ind = None
cur_segment_data = []
cur_background_data = None
p_data1 = None

for entry in dataset:
    # A different id in column 0 marks the start of another induction:
    # look up its metadata row and reset the per-induction state.
    if cur_ind is None or int(entry[0]) != int(cur_ind[MetaID]):
        cur_ind = metadata[int(entry[0])]
        cur_background_data = None
        cur_segment_data = []
        next_segment = 0

        # Pick the pair of output lists that matches this induction's label.
        label = cur_ind[MetaLABEL]
        if label == 'banana':
            p_data1, p_data2 = banana_data1, banana_data2
        elif label == 'wine':
            p_data1, p_data2 = wine_data1, wine_data2
        else:
            p_data1, p_data2 = background_data1, background_data2

    # Everything after the id column: time, sensor channels, temp, humidity.
    readings = entry[1:]
    cur_segment_data.append(readings)

    # Skip rows before the next sampling point or past the induction duration
    # (they still accumulate in cur_segment_data).
    timestamp = entry[1]
    if timestamp < next_segment or timestamp > float(cur_ind[MetaDURATION]):
        continue

    if cur_background_data is None:
        # First accepted row of the induction: the mean of every row gathered
        # so far (this one included) becomes the background reference.
        cur_background_data = np.mean(np.array(cur_segment_data), axis=0)
    else:
        # Raw readings, quadratically expanded.
        p_data1.append(sqr_features(readings))
        # Raw readings plus their offset from the background, then expanded.
        with_diff = np.concatenate((readings, readings - cur_background_data))
        p_data2.append(sqr_features(with_diff))

    # Start collecting the next segment.
    cur_segment_data = []
    next_segment += segment_len
        
# Report how many samples were collected for each class.
for class_samples in (banana_data1, wine_data1, background_data1):
    print(len(class_samples))


def _zscore_columns(class_groups):
    """Stack per-class sample lists and standardise every column.

    Returns a ``(stacked, standardized)`` pair: the row-wise concatenation of
    the groups, and the same data with each column shifted to zero mean and
    divided by its standard deviation.
    """
    stacked = np.concatenate(class_groups)
    standardized = stacked - np.mean(stacked, axis=0)
    standardized /= np.std(standardized, axis=0)
    return stacked, standardized


print("PCA 1...")
all_data, all_data_z = _zscore_columns((banana_data1, wine_data1, background_data1))
print(all_data_z.shape)
# Thin SVD of the standardised data; `s` holds the singular values.
U, s, VT = np.linalg.svd(all_data_z, full_matrices=False)
print(s)

print("PCA 2...")
all_data, all_data_z = _zscore_columns((banana_data2, wine_data2, background_data2))
U, s, VT = np.linalg.svd(all_data_z, full_matrices=False)
print(s)

#   0: id
#   1: time
# 2-9: R1-R8
#  10: temperature
#  11: humidity

Loading data...
Preprocessing data...
0.0 = 0
1.0 = 1
2.0 = 2
3.0 = 3
4.0 = 4
5.0 = 5
6.0 = 6
7.0 = 7
8.0 = 8
9.0 = 9
10.0 = 10
11.0 = 11
12.0 = 12
13.0 = 13
14.0 = 14
15.0 = 15
16.0 = 16
17.0 = 17
18.0 = 18
19.0 = 19
20.0 = 20
21.0 = 21
22.0 = 22
23.0 = 23
24.0 = 24
25.0 = 25
26.0 = 26
27.0 = 27
28.0 = 28
29.0 = 29
30.0 = 30
31.0 = 31
32.0 = 32
33.0 = 33
34.0 = 34
35.0 = 35
36.0 = 36
37.0 = 37
38.0 = 38
39.0 = 39
40.0 = 40
41.0 = 41
42.0 = 42
43.0 = 43
44.0 = 44
45.0 = 45
46.0 = 46
47.0 = 47
48.0 = 48
49.0 = 49
50.0 = 50
51.0 = 51
52.0 = 52
53.0 = 53
54.0 = 54
55.0 = 55
56.0 = 56
57.0 = 57
58.0 = 58
59.0 = 59
60.0 = 60
61.0 = 61
62.0 = 62
63.0 = 63
64.0 = 64
65.0 = 65
66.0 = 66
67.0 = 67
68.0 = 68
69.0 = 69
70.0 = 70
71.0 = 71
72.0 = 72
73.0 = 73
74.0 = 74
75.0 = 75
76.0 = 76
77.0 = 77
78.0 = 78
79.0 = 79
80.0 = 80
81.0 = 81
82.0 = 82
83.0 = 83
84.0 = 84
85.0 = 85
86.0 = 86
87.0 = 87
88.0 = 88
89.0 = 89
90.0 = 90
91.0 = 91
92.0 = 92
93.0 = 93
94.0 = 94
96.0 = 96
97.0 = 97
98.0 = 98
99

In [22]:

# Banana-vs-background ridge regression with a rolling two-fold hold-out.
#
# NOTE(review): `banana_data` and `background_data` are not defined anywhere
# in the visible notebook (the preprocessing cell builds `banana_data1/2` and
# `background_data1/2`). This cell relies on kernel state from another cell --
# confirm which arrays are intended before running on a fresh kernel.
Xb = np.concatenate((banana_data, background_data))
# Labels: +1 for banana rows, -1 for background rows.
yb = np.concatenate((np.full(len(banana_data), 1), np.full(len(background_data), -1)))

# NOTE(review): NUM_TRAITS is kept for any later cell that reads it; the
# regulariser below is sized from the data itself instead.
NUM_TRAITS = 11
NUM_PARTS = 8

# Contiguous [start, stop) index ranges, one per fold; the last fold absorbs
# the remainder of the integer division.
# NOTE(review): Xb is all banana rows followed by all background rows, so
# these contiguous folds are not class-balanced -- consider shuffling or
# splitting by induction.
fold_len = len(yb) // NUM_PARTS
setIndices = [[k * fold_len, (k + 1) * fold_len] for k in range(NUM_PARTS)]
setIndices[NUM_PARTS-1][1] = len(yb)

# Pairs of adjacent folds: the first selects lambda, the second scores it.
# NOTE(review): this yields NUM_PARTS - 2 pairs, so the last fold is never
# held out -- confirm that is intended.
holdOuts = [[k, k + 1] for k in range(NUM_PARTS - 2)]

lam_vals = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]

all_indices = np.arange(len(yb))
# Size the ridge penalty from the actual number of columns rather than a
# hard-coded 11, so the cell works for any feature set.
num_features = Xb.shape[1]

errorRate = 0
for j in range(0, len(holdOuts)):
    print("Case", j)
    v1Ind = np.arange(*setIndices[holdOuts[j][0]])
    v2Ind = np.arange(*setIndices[holdOuts[j][1]])
    # Sorted, deterministic training indices (everything not held out).
    trnInd = np.setdiff1d(all_indices, np.concatenate((v1Ind, v2Ind)))

    Xt = Xb[trnInd,:]
    yt = yb[trnInd]

    Xv1 = Xb[v1Ind,:]
    yv1 = yb[v1Ind]
    Xv2 = Xb[v2Ind,:]
    yv2 = yb[v2Ind]

    # Both sides of the normal equations are independent of lambda.
    gram = Xt.T @ Xt
    rhs = Xt.T @ yt

    minErr = float("inf")
    minIdx = 0
    minW = []
    for i in range(0, len(lam_vals)):
        # Solve (X^T X + lambda I) w = X^T y directly; this is numerically
        # better behaved than forming the explicit inverse.
        w = np.linalg.solve(gram + lam_vals[i] * np.identity(num_features), rhs)
        # Sum of |sign(prediction) - label|: each misclassified row adds 2.
        error = np.abs(np.sign(Xv1@w) - yv1).sum()
        # `<=` means ties go to the larger lambda.
        if error <= minErr:
            minErr = error
            minIdx = i
            minW = w
    print("Min lam:", lam_vals[minIdx])
    print("Min err:", minErr)
    # Dividing by 2 * len turns the summed differences into an error fraction.
    errorRate += np.abs(np.sign(Xv2@minW) - yv2).sum() / (2 * len(yv2))

# Average of the per-case error fractions on the second hold-out fold.
errorRate /= len(holdOuts)
print(errorRate)


Case 0
Min lam: 0.1
Min err: 23136.0
Case 1
Min lam: 100
Min err: 4898.0
Case 2
Min lam: 1
Min err: 20912.0
Case 3
Min lam: 100
Min err: 32298.0
Case 4
Min lam: 1
Min err: 9884.0
Case 5
Min lam: 0.1
Min err: 19026.0
0.49458424507658644
