In [12]:
%%time

from multiprocessing import Pool

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from scipy import spatial
from datetime import timedelta
import seaborn as sns

import sys
import collections
import itertools
from scipy.stats import mode


from numpy import *
from pylab import *
from scipy.spatial import distance
from sklearn import metrics



class KnnDtw(object):
    """K-nearest neighbor classifier using dynamic time warping
    as the distance measure between pairs of time series arrays
    
    Arguments
    ---------
    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default for KNN
        
    max_warping_window : int, optional (default = infinity)
        Maximum warping window allowed by the DTW dynamic
        programming function
            
    subsample_step : int, optional (default = 1)
        Step size for the timeseries array. By setting subsample_step = 2,
        the timeseries length will be reduced by 50% because every second
        item is skipped. Implemented by x[:, ::subsample_step]
    """
    
    def __init__(self, n_neighbors=5, max_warping_window=10000, subsample_step=1):
        self.n_neighbors = n_neighbors
        self.max_warping_window = max_warping_window
        self.subsample_step = subsample_step
    
    def fit(self, x, l):
        """Fit the model using x as training data and l as class labels
        
        Arguments
        ---------
        x : array of shape [n_samples, n_timepoints]
            Training data set for input into KNN classifer
            
        l : array of shape [n_samples]
            Training labels for input into KNN classifier
        """
        
        self.x = x
        self.l = l
        
    def _dtw_distance(self, ts_a, ts_b, d = lambda x,y: abs(x-y)):
        """Returns the DTW similarity distance between two 2-D
        timeseries numpy arrays.

        Arguments
        ---------
        ts_a, ts_b : array of shape [n_samples, n_timepoints]
            Two arrays containing n_samples of timeseries data
            whose DTW distance between each sample of A and B
            will be compared
        
        d : DistanceMetric object (default = abs(x-y))
            the distance measure used for A_i - B_j in the
            DTW dynamic programming function
        
        Returns
        -------
        DTW distance between A and B
        """

        # Create cost matrix via broadcasting with large int
        ts_a, ts_b = np.array(ts_a), np.array(ts_b)
        M, N = len(ts_a), len(ts_b)
        cost = sys.maxsize * np.ones((M, N))
         
        # Initialize the first row and column
        cost[0, 0] = d(ts_a[0], ts_b[0])
        for i in range(1, M):
            cost[i, 0] = cost[i-1, 0] + d(ts_a[i], ts_b[0])

        for j in range(1, N):
            cost[0, j] = cost[0, j-1] + d(ts_a[0], ts_b[j])

        # Populate rest of cost matrix within window
        for i in range(1, M):
            for j in range(max(1, i - self.max_warping_window),
                            min(N, i + self.max_warping_window)):
                choices = cost[i - 1, j - 1], cost[i, j-1], cost[i-1, j]
                cost[i, j] = min(choices) + d(ts_a[i], ts_b[j])

        # Return DTW distance given window 
        return cost[-1, -1]
    
    def _dist_matrix(self, x, y):
        """Computes the M x N distance matrix between the training
        dataset and testing dataset (y) using the DTW distance measure
        
        Arguments
        ---------
        x : array of shape [n_samples, n_timepoints]
        
        y : array of shape [n_samples, n_timepoints]
        
        Returns
        -------
        Distance matrix between each item of x and y with
            shape [training_n_samples, testing_n_samples]
        """
        
        # Compute the distance matrix        
        dm_count = 0
        
        # Compute condensed distance matrix (upper triangle) of pairwise dtw distances
        # when x and y are the same array
        if(np.array_equal(x, y)):
            x_s = shape(x)
            dm = np.zeros((x_s[0] * (x_s[0] - 1)) // 2, dtype=np.double)
            
            p = ProgressBar(shape(dm)[0])
            
            for i in range(0, x_s[0] - 1):
                for j in range(i + 1, x_s[0]):
#                     dm[dm_count] = self._dtw_distance(x[i, ::self.subsample_step],
#                                                       y[j, ::self.subsample_step])
                    dm[dm_count] = self._dtw_distance(x[i, ::self.subsample_step],
                                                      y[j, ::self.subsample_step])
                    
                    dm_count += 1
                    p.animate(dm_count)
            
            # Convert to squareform
            dm = distance.squareform(dm)
            return dm
        
        # Compute full distance matrix of dtw distnces between x and y
        else:
            x_s = np.shape(x)
            y_s = np.shape(y)
            dm = np.zeros((x_s[0], y_s[0])) 
            dm_size = x_s[0]*y_s[0]
            
            p = ProgressBar(dm_size)
        
            for i in range(0, x_s[0]):
                for j in range(0, y_s[0]):
                    dm[i, j] = self._dtw_distance(x[i, ::self.subsample_step],
                                                  y[j, ::self.subsample_step])
                                                        
                    # Update progress bar
                    dm_count += 1
                    p.animate(dm_count)
        
            return dm
        

# sliding window
def GetShiftingWindows(thelist, size):
    return [ thelist[x:x+size] for x in range( len(thelist) - size + 1 ) ]


def CalcKNNDTW(speed, index, time_horizon, no_neighbors):
    
    X_test = df.speed.iloc[index:index+time_horizon].values 
    
    # merging data after taking out part of it 
    a = df.speed[:index].values
    b = df.speed[index+time_horizon:].values
    speed_train_series =  np.concatenate([a,b])
    
    X_train = GetShiftingWindows(speed_train_series, time_horizon)
    X_train = np.array(X_train)

    m = KnnDtw(n_neighbors=no_neighbors)

    distances = []

    for i in range(0, len(X_train)):   
        dist  = m._dtw_distance(X_train[i], X_test)
        distances.append(dist)
    
    return distances 


#error definition
def mean_absolute_percentage_error_value(y_true, y_pred):
        to_mean = []
        for true, pred in zip(y_true, y_pred):
            if true != 0:
                to_mean.append(np.abs((true - pred) / true))
        return np.mean(to_mean) * 100

def mae( y_true, y_pred):
        return metrics.mean_absolute_error(y_true, y_pred)

def rmse( y_true, y_pred):
        return np.sqrt(metrics.mean_squared_error(y_true, y_pred))



#################################


data_resol = ['1h', '30min', '15min']
pred_horizons = [24, 24, 24]

# indexes for 1h, 30m, 15min
indexes = [[0, 150, 300, 450, 600, 750, 900, 1050, 1200, 1800, 1950, 2100, 2250, 
                2400, 2550, 2700, 2850, 3600, 3750, 3900, 4050],
           
           [ 550, 600,  750, 800, 870, 1000, 1050, 1200, 1300, 1350, 1500, 1600, 1650, 1900, 
              1800, 1950, 2100, 2250, 2300, 2400, 2550, 3751, 4500, 4650, 4800, 
              4950, 5100, 5300, 5400, 5500, 5550, 5600, 5700, 5850, 6005, 6150, 7450, 7600, 
              7750, 7900, 8050, 8200], 
           
           [1100, 1200, 1500,  1600,  1740, 2000, 2100, 2400,  2600,
             2680,  3000, 3200,  3300,  3800, 3600,  3900, 4200, 4500,
                4600, 4800, 5100, 7502, 9000, 9300, 9600, 9900, 10200,
             10600, 10800, 11000, 11100, 11200, 11400, 11700, 12010,
             12300, 14900, 15200, 15500, 15800, 16100, 16400]]

time_freq = ['h', '30m', '15m']


no_neigbors = [22,8,13,5,13,5,5,12,3,1,    3,17,16,9,12,10,18,14,22,11,  13,11,13,15]
# TODO: find k parameters for 30min and 15min.



for res, hor, idxs,tf in zip(data_resol, pred_horizons, indexes, time_freq): 
    print('************************** data resolution [{}] ******************************************'.format(res))
    
    df = pd.read_csv('D:\data from MnM Samdal raw\Samdal_power_{}_no_outliers.csv'.format(res), parse_dates=[0])

    df.power = df.power.interpolate(method='linear', limit = 4 )
    df.speed = df.speed.interpolate(method='linear', limit = 4 )

    df = df.rename( columns={"Unnamed: 0": "time"})
    df.time = pd.to_datetime(df.time, format='%Y%m%d %H:%M:%s')
    df = df.set_index('time')

    speed = df.speed.values
    max_power =np.max(df.power)

    time_horizon_lengths = np.arange(1,hor+1,1)

    list_nMAE_for_horizon_mtrx =[]
    list_nRMSE_for_horizon_mtrx =[]


    for t, k in zip(time_horizon_lengths, no_neigbors):
        print('time horizon [{}]: '.format(res), t, '\n')

        list_nMAE = []
        list_nRMSE = []

        for u in idxs:

            true_power_values = df.power[u:u+t].values

            if np.any(np.isnan(true_power_values))==True:
                continue        

            if len(true_power_values)!=t:
                continue

            distances = CalcKNNDTW(speed, u, t, k)


            df = df[:len(distances)]
            df['knnDTW_distances'] = distances

            sorted_neigh = df.sort(columns=['knnDTW_distances'], ascending=True)

            if np.any(np.isnan(sorted_neigh.power[:50]))==True:
                sorted_neigh = sorted_neigh.dropna(subset=['power'])
                
            ##########################

            top_neigh_indexes = sorted_neigh.iloc[:k].index    
            # find 6h windows based on indexes

            time_horizon = np.timedelta64((t.item()-1), tf) # subtract 1 because we start from i, so i+5=6

            found_6h_series = []

            for m in top_neigh_indexes:
                power_series = df.ix[m:m+time_horizon,'power'].values
                found_6h_series.append(power_series)   # first index is the tested series [true values]

            df_powers = pd.DataFrame(data=found_6h_series) # rows are neigboring series,   # calc avg for columns

            predicted_power = df_powers.mean(axis=0)

            # protection against nan
            if np.any(np.isnan(predicted_power))==True:
                continue 

            if len(predicted_power)!=t:
                continue

            ###############
            MAE_1h = mae(true_power_values, predicted_power)    # it's already an avg because i took a mean of predicted power
            RMSE_1h = rmse(true_power_values, predicted_power)

            nMAE_1h = (MAE_1h/max_power)*100
            nRMSE_1h = (RMSE_1h/max_power)*100

            list_nMAE.append(nMAE_1h)    
            list_nRMSE.append(nRMSE_1h)            

        #for k
        avg_nMAE = sum(list_nMAE)/(len(list_nMAE))
        avg_nRMSE = sum(list_nRMSE)/(len(list_nRMSE))

        list_nMAE_for_horizon_mtrx.append(avg_nMAE)
        list_nRMSE_for_horizon_mtrx.append(avg_nRMSE)

        print('list_nMAE_for_horizon_mtrx ', list_nMAE_for_horizon_mtrx, '\n')    
        print('list_nRMSE_for_horizon_mtrx ', list_nRMSE_for_horizon_mtrx, '\n')    

    # save results to columns
    csv_input = pd.read_csv('DTW_kNN_results_all_data.csv')
    csv_input['nMAE_{}'.format(res)] = list_nMAE_for_horizon_mtrx
    csv_input['nRMSE_{}'.format(res)] = list_nRMSE_for_horizon_mtrx

    csv_input.to_csv('DTW_kNN_results_all_data.csv', index=False)

************************** data resolution [1h] ******************************************
time horizon [1h]:  1 





list_nMAE_for_horizon_mtrx  [1.1737614857797585] 

list_nRMSE_for_horizon_mtrx  [1.1737614857797585] 

time horizon [1h]:  2 

list_nMAE_for_horizon_mtrx  [1.1737614857797585, 3.0464091231446635] 

list_nRMSE_for_horizon_mtrx  [1.1737614857797585, 3.350558662362233] 

time horizon [1h]:  3 

list_nMAE_for_horizon_mtrx  [1.1737614857797585, 3.0464091231446635, 3.0403333577597098] 

list_nRMSE_for_horizon_mtrx  [1.1737614857797585, 3.350558662362233, 3.4633249605018457] 

time horizon [1h]:  4 

list_nMAE_for_horizon_mtrx  [1.1737614857797585, 3.0464091231446635, 3.0403333577597098, 4.0796687400828642] 

list_nRMSE_for_horizon_mtrx  [1.1737614857797585, 3.350558662362233, 3.4633249605018457, 4.600350617699446] 

time horizon [1h]:  5 

list_nMAE_for_horizon_mtrx  [1.1737614857797585, 3.0464091231446635, 3.0403333577597098, 4.0796687400828642, 4.1950332618948121] 

list_nRMSE_for_horizon_mtrx  [1.1737614857797585, 3.350558662362233, 3.4633249605018457, 4.600350617699446, 4.979536440969872



list_nMAE_for_horizon_mtrx  [1.0412539851865563, 1.6799877923205588, 2.1958484425325842, 2.6926946601570716, 3.1113231111205182, 3.2076838554300529, 4.1481175423554379, 4.0963161797349104, 4.9094835924264579, 7.8641799960880352, 7.1784365968159918, 5.3074712820251406, 5.2526103797889379, 5.8047800683075232, 6.8794761630755401, 7.8159372132438341, 9.758366756690247, 8.8756002840512487, 7.7552317521397276, 6.8812129187222011, 10.887647184735846, 4.5605373164961236, 0.00076026532735159438, nan] 

list_nRMSE_for_horizon_mtrx  [1.0412539851865563, 1.8513136284747367, 2.4597822824844511, 3.2335253736004756, 3.5526696512259068, 3.7707149495595575, 4.96612482417276, 4.8095296761389177, 6.2107469322502684, 9.8130607855359973, 8.5405200521226341, 6.4656644324901205, 6.5487599818489688, 7.3903738288670677, 8.4356302762908335, 10.093737888728928, 12.831270601848649, 11.944242120190491, 10.501886817952622, 9.9417808368276894, 15.19288670576079, 6.425790561517811, 0.0009781183269671756, nan] 

*****

In [None]:
data_to_save = pd.DataFrame(data = {'k': no_neigbors, nMAE' : list_nMAE_for_horizon_mtrx, 'nRMSE' : list_nRMSE_for_horizon_mtrx})  
# rows are neighbors; columns are horizons 

data_to_save.to_csv('DTW_kNN_results.csv')