# General Imports

    !! IMPORTANT !!
    If you did NOT install opengrid with pip, 
    make sure the path to the opengrid folder is added to your PYTHONPATH

In [0]:
import os
import inspect
import sys
import pandas as pd
import charts
import numpy as np
import pylab as pl
from scipy import spatial
from scipy.spatial import distance


from opengrid.library import houseprint

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 16,8

## Houseprint

In [0]:
hp = houseprint.Houseprint()
# for testing:
# hp = houseprint.Houseprint(spreadsheet='unit and integration test houseprint')

### TMPO

The houseprint, sites, devices and sensors all have a get_data method. In order to get these working for the fluksosensors, the houseprint creates a tmpo session.

## Lookup sites, devices, sensors based on key

These methods return a single object

In [0]:
hp.find_site(1)

In [0]:
hp.find_device('FL03001441')

In [0]:
sensor = hp.find_sensor('d5a747b86224834f745f4c9775d70241')

In [0]:
print(sensor.site)
print(sensor.unit)

## Lookup sites, devices, sensors based on search criteria

These methods return a list with objects satisfying the criteria

hp.search_sites(inhabitants=5)

hp.search_sensors(type='electricity', direction='Import')

### TESTING METHODS 

In [0]:
electricity = hp.find_sensor('212ce724e124fbde0fb649396375d099')

In [0]:
electricity

In [0]:
head = pd.Timestamp('20151114')
tail = pd.Timestamp('20151115')
originalGraph = electricity.get_data(head=head, tail=tail, diff=True, resample='min', unit='kW')
charts.plot(originalGraph, stock=True, show='inline')

In [0]:
# Temporarily raise the pandas display limits (999 rows, 3 columns) so the
# frame is printed without the default row truncation.
with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
    # Call form of print: same output on Python 2 for a single argument, and
    # valid on Python 3, matching the print(...) calls used earlier.
    print(originalGraph)

In [0]:
timeStart = pd.Timestamp('20151114')
timeStop = pd.Timestamp('20151115')
electricityData = originalGraph.dropna()

List of the electricity measurements from 14 Nov 2015 through 15 Nov 2015.

Set up a moving-average filter here (a type of low-pass filter used to smooth the signal).

In [0]:
movingAverage = pd.rolling_mean(electricityData,2)

Concatenate two dataframes: one holding the original dataset, the other smoothed with a moving-average filter.

In [0]:
df = pd.concat([electricityData,movingAverage], axis=1, keys =('A','B'))

### Plot Moving Average + Original Data

In [0]:
charts.plot(df,stock=True,show='inline')

In [0]:
derivative = electricityData.diff()

Concatenate electricityData and the derivative

In [0]:
chartPlot = pd.concat([electricityData, derivative], axis = 1, keys = ('A','B'))

### Derivative + Original Data

In [0]:
charts.plot(chartPlot, stock = True, show ='inline')

### List Values

In [0]:
# Temporarily raise the pandas display limits (999 rows, 3 columns) so the
# frame is listed without the default row truncation.
with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
    # Call form of print: same output on Python 2 for a single argument, and
    # valid on Python 3, matching the print(...) calls used earlier.
    print(originalGraph)

### Variables:

electricityData,
movingAverage,
derivative

<h2>Implementing SAX</h2>

In [0]:
dataFrame1= pd.DataFrame({'col1':[2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34], 
                         'col2':[0.50, 1.29, 2.58, 3.83, 3.25, 4.25, 3.83, 5.63, 6.44, 6.25, 8.75, 8.83, 3.25, 0.75, 0.72]})
charts.plot(dataFrame1, stock=True, show='inline')

<h3>Implementing z-score</h3>

In [0]:
column1 = pd.DataFrame([2.02, 2.33, 2.99, 6.85, 9.20, 8.80, 7.50, 6.00, 5.85, 3.85, 4.85, 3.85, 2.22, 1.45, 1.34])
column2 = pd.DataFrame([0.50, 1.29, 2.58, 3.83, 3.25, 4.25, 3.83, 5.63, 6.44, 6.25, 8.75, 8.83, 3.25, 0.75, 0.72])
saxExample = pd.concat([column1,column2], axis=1)
saxExample.columns=['A','B']
saxExample

In [0]:
#meanCol1 = column1.mean()
#stdCol2 = column1.std()
#meanCol2 = column2.mean()
#stdCol2 = column2.std()
def znormalization(ts):
    """Z-normalize ``ts`` along axis 0.

    The mean and standard deviation are taken per column (axis 0) using the
    input's own ``mean``/``std`` methods, and every value is rescaled to
    ``(value - mean) / std``.

    ts: object exposing ``mean(axis=0)`` and ``std(axis=0)`` and supporting
        element-wise arithmetic (the notebook passes pandas DataFrames).
    return: object of the same shape holding the normalized values.
    """
    centered = ts - ts.mean(axis = 0)
    return centered / ts.std(axis = 0)

In [0]:
zScores=znormalization(saxExample)

In [0]:
#zScoreCol1 = (column1-meanCol1)/stdCol1
#zScoreCol2 = (column2-meanCol2)/stdCol2
#zScores = pd.concat([zScoreCol1,zScoreCol2],axis = 1)
#zScores.columns=['Col1','Col2']
#zScores
charts.plot(zScores,stock=True,show='inline')

In [0]:
zScores.plot(style="-+")

<h3>PAA transformation of z-normalized graphs</h3>

In [0]:
def paa_transform(timeSeries, n_pieces):
    """Piecewise Aggregate Approximation (PAA) of ``timeSeries``.

    The input is cut into ``n_pieces`` consecutive chunks along axis 0
    (``np.array_split`` allows chunks of unequal length) and each chunk is
    replaced by its mean over axis 0.

    timeSeries: array-like whose rows are successive samples; columns, if
        any, are treated as independent series.
    n_pieces: number of segments to produce.
    return: np.ndarray with one entry (or one row) per segment.
    """
    splitted = np.array_split(timeSeries, n_pieces) ## along columns as we want
    # Build a real list rather than using map(): on Python 3 map() returns a
    # lazy iterator, which np.asarray would wrap into a useless 0-d object
    # array instead of the array of segment means.
    return np.asarray([xs.mean(axis = 0) for xs in splitted])

In [0]:
# Reduce the z-scores to 9 PAA segments, then repeat every segment value
# twice along axis 0 so each segment is drawn as a flat step.
split9 = paa_transform(zScores,9)
paaTransfo = np.repeat(split9,2, axis = 0)
# One figure per column: the z-normalized series ("ts<i>") plotted together
# with its repeated PAA approximation ("paa<i>").
for i in [0, 1]:
    pl.figure()
    pl.plot(zScores.iloc[:, i], '-+', label = "ts%i"%i)
    pl.plot(paaTransfo[:, i], label = "paa%i"%i)
    pl.legend(loc = "upper left")

In [0]:
dfzScores=pd.DataFrame(zScores) #z-normalization
dfzScores.columns=('col1','col2')

dfPaaTransfo=pd.DataFrame(paaTransfo) #paa
dfPaaTransfo.columns=('col1','col2')

concat = pd.concat([dfzScores,dfPaaTransfo])
charts.plot(concat,stock=True,show='inline')

In [0]:
def sax_transform(ts, n_pieces, alphabet):
    """
    Symbolic Aggregate approXimation (SAX) of one or more time series.

    ts: time series stored column-wise (each column is one series)
    n_pieces: number of segments used by the PAA step
    alphabet: symbols to translate to, lowest first, e.g. "abcd" or "ab"
    return: np.array holding the SAX symbols of ts

    Steps:
    1. z-normalize ts
    2. reduce it with the PAA transformation
    3. take breakpoints from the standard normal distribution (scipy.stats):
       norm.ppf at len(alphabet)-1 evenly spaced probabilities between
       1/len(alphabet) and 1-1/len(alphabet)
    4. map every PAA value to the symbol of the interval it falls in
    """
    from scipy.stats import norm
    n_symbols = len(alphabet)
    probabilities = np.linspace(1./n_symbols, 1-1./n_symbols, n_symbols-1)
    thrholds = norm.ppf(probabilities)

    def symbol_for(value):
        # Below the first breakpoint -> first symbol.
        if value < thrholds[0]:
            return alphabet[0]
        # Above the last breakpoint -> last symbol.
        if value > thrholds[-1]:
            return alphabet[-1]
        # Otherwise: the symbol right after the highest breakpoint that the
        # value has reached.
        highest_reached = np.where(thrholds <= value)[0][-1]
        return alphabet[highest_reached + 1]

    def translate(ts_values):
        return np.asarray([symbol_for(value) for value in ts_values])

    normalized = znormalization(ts)
    paa_ts = paa_transform(normalized, n_pieces)
    return np.apply_along_axis(translate, 0, paa_ts)

<h3>applying sax transformation</h3>

In [0]:
array1=sax_transform(saxExample, 9, "abcdef")
array1

In [0]:
sax=pd.DataFrame(array1)
sax.columns=("col1","col2")

<h3>Make array of values from the letters</h3>

In [0]:
def makeArray(array1):
    """Convert a pandas Series of letters to zero-based numbers ('a' -> 0).

    array1: pandas Series of single-character strings (accessed via ``.iloc``).
    return: single-column DataFrame with ``ord(letter) - 97`` per letter.

    NOTE(review): only the first ``array1.size - 1`` letters are converted;
    the final element is left out. That looks unintentional, but it is
    preserved here because the returned length is part of the existing
    behaviour - confirm before changing.
    """
    codes = [ord(array1.iloc[pos]) - ord('a') for pos in range(array1.size - 1)]
    return pd.DataFrame(codes)

In [0]:
lettersToNumbers = pd.concat([makeArray(sax.col1),makeArray(sax.col2)],axis=1)
lettersToNumbers.columns=['col1','col2']
lettersToNumbers

In [0]:
arrayLettersToNumbers=lettersToNumbers.col1.as_matrix()
arrayLettersToNumbersRepeat=np.repeat(arrayLettersToNumbers,2)
charts.plot(arrayLettersToNumbersRepeat, stock=True, show='inline')

<h3>Comparing sax_alphabet_graph with paa values</h3>

In [0]:
saxAndPaa=pd.concat([dfPaaTransfo.col1,znormalization(pd.DataFrame(arrayLettersToNumbersRepeat))],axis=1)
saxAndPaa.columns=("col1","col2")
charts.plot(saxAndPaa, stock=True,show='inline')

<h3>Comparing original z-normalized graph with SAX</h3>

In [0]:
newGraph=pd.concat([znormalization(pd.DataFrame(arrayLettersToNumbersRepeat)),znormalization(column1)],axis=1)
newGraph.columns=['A','B']
charts.plot(newGraph,stock=True,show='inline')

<h3>Making dataframes of value differences between successive values</h3>

In [0]:
def makeDiffArray(array): #Expects a letter array
    """Return the step between each pair of successive letters.

    array: pandas Series of single-character strings (accessed via ``.iloc``).
    return: single-column DataFrame with ``array.size - 1`` rows; row ``i``
        holds ``ord(array[i+1]) - ord(array[i])``.
    """
    steps = [ord(array.iloc[pos + 1]) - ord(array.iloc[pos])
             for pos in range(array.size - 1)]
    return pd.DataFrame(steps)


In [0]:
successive_df = pd.concat([makeDiffArray(sax.col1),makeDiffArray(sax.col2)],axis=1)
successive_df.columns=['col1','col2']
successive_df


<h3>Calculating sax differences</h3>

Differences are calculated between two arrays to see how much they are alike.
We want values around 0 or 1.

In [0]:
def differenceNumbers(array): #Expects array of numbers
    """Apply ``np.diff`` to every row of ``array``.

    array: pandas object with ``.index`` and ``.iloc`` (the notebook passes a
        DataFrame), so each row is differenced across its columns.
    return: DataFrame with one row per input row holding that row's
        successive differences.
    """
    row_diffs = [np.diff(array.iloc[row], axis=-1)
                 for row in range(array.index.size)]
    return pd.DataFrame(row_diffs)

In [0]:
def diffArrays(array1,array2):
    """Row-by-row difference ``array2 - array1`` of two pandas objects.

    Rows are paired by position (``.iloc``), not by index label.

    return: DataFrame with one row per input row; an empty DataFrame when the
        two inputs do not have the same number of rows.
    """
    n_rows = array1.index.size
    if n_rows != array2.index.size:
        # Length mismatch: nothing is compared, the result stays empty.
        return pd.DataFrame([])
    return pd.DataFrame([array2.iloc[row] - array1.iloc[row]
                         for row in range(n_rows)])

In [0]:
# Constant-valued helper columns. Build each list and DataFrame once instead
# of re-creating the DataFrame on every loop iteration.
# Lengths 87 and 82 are presumably chosen to match the sample windows
# compared later in the notebook - TODO confirm.
arrayOfOnes = [1] * 87
dfArrayOfOnes = pd.DataFrame(arrayOfOnes)
# NOTE: despite its name this list holds the value 2 (kept as in the original).
array2OfOnes = [2] * 82
dfArray2OfOnes = pd.DataFrame(array2OfOnes)

Recalculating the difference between successive values of the array.
This shows how much alike the arrays are with respect to the change of successive values.

A lot of 0 and 1's means that the graphs will be alike.

In [0]:
successive_difference_df = differenceNumbers(successive_df) #col2-col1
successive_difference_df

In [0]:
def differenceLetters(array1, array2): #Expects array of Letters
    """Position-wise letter distance ``ord(array2[i]) - ord(array1[i])``.

    array1, array2: pandas Series of single-character strings, paired by
        position (``.iloc``).
    return: single-column DataFrame with one row per position; an empty
        DataFrame when the inputs differ in length.
    """
    array=[]
    if array1.index.size == array2.index.size:
        # Cover every position: the loop only reads index i (never i+1), so
        # stopping at size-1 silently dropped the last pair of letters.
        for i in range(0, array1.index.size):
            array.append(ord(array2.iloc[i])-ord(array1.iloc[i]))
    df=pd.DataFrame(array)
    return df

It is best to first convert the arrays to number values, add them together, and then perform the differencing operation.

## Testing on actual data

In [0]:
"""
First z-normalization
Then paa
"""

zScoresOriginalGraph=znormalization(originalGraph)
splitData = paa_transform(zScoresOriginalGraph,240)
splitData_ext = np.repeat(splitData,4,axis = 0)
pl.figure()
pl.plot(zScoresOriginalGraph, '-+')
pl.plot(splitData_ext)
pl.legend(loc = "upper left")



<h3>Comparing original graph with paa values</h3>

In [0]:
splitDataDf=pd.DataFrame(splitData_ext)
originalGraphDf=pd.DataFrame(originalGraph)
splitDataDf=splitDataDf.iloc[:splitDataDf.size-20]
splitDataDf.index=originalGraphDf.index
graph = pd.concat([splitDataDf,zScoresOriginalGraph],axis=1).dropna()
charts.plot(graph,stock=True,show='inline')

In [0]:
saxTransformation=sax_transform(originalGraph, 240, "abcdefghijklmn")

In [0]:
'''
Werkwijze:
Dataframes gebruiken!
-sax transfo
-letters->nummers (makeArray())
-np.repeat (plateau's)
-plot(paa_transform+repeater,^)
'''

<h3>Paa and Sax graph</h3>

In [0]:
letters=pd.DataFrame(saxTransformation)
letters.columns=['col1']
lettersToNumSax=makeArray(letters.col1) #Expects a pandaframe

repeatedValues=np.repeat(lettersToNumSax.as_matrix(),4) #expects an array

dfRepeatedValues=pd.DataFrame(repeatedValues)
dfRepeatedValues=dfRepeatedValues.iloc[:dfRepeatedValues.size-16]
splitDataDf.index=dfRepeatedValues.index

concatinatePaaAndSax=pd.concat([znormalization(dfRepeatedValues),splitDataDf],axis=1)
concatinatePaaAndSax.columns=['col1','col2']

charts.plot(concatinatePaaAndSax,stock=True,show='inline')


<h3>sax and original graph (znormalized)</h3>

In [0]:
dfRepeatedValues.index=zScoresOriginalGraph.index
saxVsOriginalGraph=pd.concat([(dfRepeatedValues), znormalization(originalGraph)],axis=1).dropna()
charts.plot(saxVsOriginalGraph, stock=True, show='inline')


<h2>Methods for measuring the similarity of the arrays</h2>

<h5>Hamming Distance (not so interesting for us)</h5>

In [0]:
def hammingDistance(s1, s2):
    """Count the positions at which two equal-length sequences differ.

    Raises ValueError when the sequences do not have the same length.
    """
    if len(s1) == len(s2):
        mismatches = (left != right for left, right in zip(s1, s2))
        return sum(mismatches)
    raise ValueError("Undefined for sequences of unequal length")

<h5>Euclidean Distance</h5>

In [0]:
def euclideanDistance(s1,s2):
    """Return the norm of ``s1 - s2`` as computed by ``np.linalg.norm``.

    s1, s2: expected to be numpy arrays that can be subtracted element-wise.
    """
    return np.linalg.norm(s1-s2)

<h5>Cosine Similarity</h5>

In [0]:
def cosineSimilarity(s1,s2):
    """Return ``1 - cosine distance`` of ``s1`` and ``s2``.

    The distance comes from ``scipy.spatial.distance.cosine``.
    """
    cosine_distance = spatial.distance.cosine(s1, s2)
    return 1 - cosine_distance

<h5>Mahalanobis Distance</h5>

In [0]:
def mahalanobisDistance(s1,s2,VI=None):
    """Return ``1 - Mahalanobis distance`` between ``s1`` and ``s2``.

    s1, s2: 1-D sequences of equal length.
    VI: inverse of the covariance matrix, as required by
        ``scipy.spatial.distance.mahalanobis``. The previous version omitted
        this argument, so every call raised TypeError. When left as None the
        identity matrix is used, which makes the distance equal to the
        Euclidean distance.
    return: ``1 - distance``. The distance is not bounded by 1, so the result
        can be arbitrarily negative.
    """
    if VI is None:
        # No covariance information supplied: fall back to the identity.
        VI = np.eye(len(s1))
    similarity = 1 - distance.mahalanobis(s1, s2, VI)
    return similarity

<h5>Manhattan Distance</h5>

In [0]:
def manhattanDistance(s1,s2):
    """Sum of absolute differences between ``s1`` and ``s2``, scaled by their maximum.

    The row-wise differences come from ``diffArrays``; their absolute values
    are summed with ``np.sum`` and the total is divided by the largest value
    found in either input (``np.max`` of the element-wise ``np.maximum``).
    """
    total_abs_difference = np.sum(abs(diffArrays(s1, s2)))
    largest_value = np.max(np.maximum(s1, s2))
    return total_abs_difference / largest_value

In [0]:
# Second name bound to the same object as dfRepeatedValues (plain assignment,
# not a copy).
dfRepeatedValues2=dfRepeatedValues
# Temporarily raise the pandas display limits (999 rows, 3 columns) so the
# frame is printed without the default row truncation.
with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
    # Call form of print: same output on Python 2 for a single argument, and
    # valid on Python 3, matching the print(...) calls used earlier.
    print(dfRepeatedValues2)


In [0]:
sampleData1=dfRepeatedValues.ix[pd.Timestamp('2015-11-14 15:22:00+00:00'):pd.Timestamp('2015-11-14 16:48:00+00:00')]
sampleData2=dfRepeatedValues.ix[pd.Timestamp('2015-11-14 16:49:00+00:00'):pd.Timestamp('2015-11-14 18:18:00+00:00')]
sampleData3=dfRepeatedValues.ix[pd.Timestamp('2015-11-14 18:19:00+00:00'):pd.Timestamp('2015-11-14 19:40:00+00:00')]
sampleData1.columns=["sampleData1"]
sampleData2.columns=["sampleData2"]
sampleData3.columns=["sampleData3"]

In [0]:
array1=np.array((1,2,3,3,4,5,5,5,6,6,6,6,7,7,7,8,9,5,4,2,3))
array2=np.array((1,5,3,5,3,2,2,5,5,6,6,6,7,7,7,8,9,6,5,2,3))

euclideanDistance(array1,array2)

In [0]:
shorterSampleData2=sampleData2.iloc[:sampleData2.size-3]
evenShorterSampleData2=sampleData2.iloc[:sampleData2.size-8]
shorterSampleData1=sampleData1.iloc[:sampleData1.size-5]
euclideanDistance(sampleData1.sampleData1,shorterSampleData2.sampleData2)

nparraySampleData1=np.array((sampleData1))
nparrayshorterSampleData1=np.array((shorterSampleData1))
nparraySampleData2=np.array((shorterSampleData2))
nparraySampleData3=np.array((sampleData3))
nparrayevenShorterSampleData2=np.array((evenShorterSampleData2))
print "euclideanDistance 1 and 2:", euclideanDistance(nparraySampleData1,nparraySampleData2)
print "euclideanDistance 1 and 3:", euclideanDistance(nparrayshorterSampleData1,nparraySampleData3)
print "euclideanDistance 2 and 3:", euclideanDistance(nparrayevenShorterSampleData2, nparraySampleData3)

In [0]:
manhattan1=[1,2,3,3,5,5,46,6,6,7]
manhattan2=[1,2,3,3,5,5,2,6,6,7]
dfManhattan1=pd.DataFrame(manhattan1)
dfManhattan2=pd.DataFrame(manhattan2)
sampleData1.index=dfArrayOfOnes.index
shorterSampleData2.index=dfArrayOfOnes.index
arraySth=diffArrays(sampleData1.sampleData1,shorterSampleData2.sampleData2)
arraySth
absValue=abs(arraySth)
sumsth=np.sum(absValue)
maxValue=np.maximum(sampleData1.sampleData1,shorterSampleData2.sampleData2)
maxValue=np.max(maxValue)
test=sumsth/maxValue
dfTest=pd.DataFrame(test)
dfTest

In [0]:
# NOTE(review): these two lists are reassigned here but the call below uses
# dfManhattan1/dfManhattan2, which are not rebuilt in this cell.
manhattan1=[1,1,2,3,4,5,6,7,8,4,5,3]
manhattan2=[1,1,4,6,4,2,3,7,9,4,5,3]
dfArrayOfOnes.columns=['col1'] 
# If the frames have different column names the program cannot tell which column is meant, so the name has to be set explicitly.
# dfManhattan1 and dfManhattan2 share the same column names, so they do not need to be set separately.
print "Manhattan with ones:", manhattanDistance(dfManhattan1,dfManhattan2)

In [0]:
print "cosine similarity 1 and 2:", cosineSimilarity(sampleData1,shorterSampleData2)
print "cosine similarity 1 and 3:", cosineSimilarity(shorterSampleData1, sampleData3)
print "cosine similarity 2 and 3:", cosineSimilarity(evenShorterSampleData2, sampleData3)

In [0]:
print "manhattan 1:", manhattanDistance(sampleData1.sampleData1,dfArrayOfOnes.col1)
print "manhattan 2:", manhattanDistance(sampleData1.sampleData1,shorterSampleData2.sampleData2)
print "manhattan 3:", manhattanDistance(sampleData3.sampleData3, evenShorterSampleData2.sampleData2)

print "Hier zien we duidelijk een verschil mbt gelijkaardigheid"

In [0]:
# These values come from euclideanDistance, so label them as Euclidean (the
# labels previously said "manhattan"). Formatting with %s keeps the same
# "label: value" output and runs on both Python 2 and Python 3.
print("euclidean 1: %s" % (euclideanDistance(sampleData1.sampleData1,dfArrayOfOnes.col1),))
print("euclidean 2: %s" % (euclideanDistance(sampleData1.sampleData1,shorterSampleData2.sampleData2),))
print("euclidean 3: %s" % (euclideanDistance(nparrayevenShorterSampleData2, nparraySampleData3),))

In [0]:
test=np.zeros(87)
test

## Device

In [0]:
#return np.asarray(ord(sax.col1.iloc[i+1])-ord(sax.col1.iloc[i]))