# Practical Exercise: Data Preparation and Visualisation

## Part I. Data Loading and Preparation

In [None]:
import pandas as pd
import numpy as np
import h5py
import os, glob
import re
from datetime import datetime, date, time
from six import iteritems



In [None]:
##########################
class Blond(object):
    """Reader for one day of CLEAR/MEDAL recordings stored as HDF5 files.

    Attributes:
        date: the ``datetime.date`` used to build the timestamp strings that
            are matched against file names.
        time_stamps: per-device list of ``datetime.time`` values selected by
            the last ``read_files()`` call.
        _day_data: per-device list of opened ``h5py.File`` handles.
    """

    # Registries of "device+signal" keys already processed in place.  They are
    # class attributes, i.e. shared by all instances.
    # NOTE(review): presumably shared on purpose because the in-place methods
    # rewrite the files on disk -- confirm before making them per-instance.
    _SD_centered = []
    _SD_calibrated = []

    def __init__(self, date, day_data=None):
        """Create a reader for ``date``.

        :param date: day whose files should be selected.
        :param day_data: optional pre-filled ``{device: [files]}`` mapping.
            When omitted a new dict is created for this instance (a literal
            ``{}`` default would be shared between all instances).
        """
        self.date = date
        self._day_data = {} if day_data is None else day_data
        self.time_stamps = {}

    def list_files(self):
        """Return the ``{device: [opened files]}`` mapping of this instance."""
        return self._day_data

    def close(self):
        """Close every file handle currently stored in the day data."""
        for handles in self._day_data.values():
            for handle in handles:
                handle.close()

    @staticmethod
    def _regex_map(pattern, strings_list):
        """Return group 1 of ``pattern`` for every string that matches it.

        Strings without a match are skipped; order is preserved.
        """
        matches = (re.search(pattern, candidate) for candidate in strings_list)
        return [match.group(1) for match in matches if match is not None]

    def find_corresponding_file(self, the_time, all_timestamps):
        """Return the index of the file that covers ``the_time``.

        The list is scanned from the front and the scan stops at the first
        entry that is later than ``the_time``; the index of the entry just
        before it is returned.  For a sorted list this is the last timestamp
        that is ``<= the_time``.

        :return: the index, or ``-1`` when the first entry is already later
            than ``the_time`` or the list is empty.
        """
        index = -1
        for current_ts in all_timestamps:
            if the_time < current_ts:
                break
            index += 1
        return index

    @staticmethod
    def _open_h5(path):
        """Open ``path`` read/write, retrying once after a read-only open."""
        try:
            return h5py.File(path, 'r+')
        except (IOError, OSError):
            # Same workaround as before: open and close the file read-only,
            # then try the read/write open again.
            temp_file = h5py.File(path, 'r')
            temp_file.close()
            return h5py.File(path, 'r+')

    def _read_files_from_folder(self, files, start_ts, end_ts, path_to_files):
        """Open the files of one folder that fall into ``start_ts``-``end_ts``.

        How it works:
        1. extract the timestamps from the file names with ``_regex_map()``
           and convert them to ``datetime.time`` (the date part is dropped)
        2. find the file whose timestamp is the last one ``<= start_ts``
        3. add the files with ``start_ts < file_timestamp <= end_ts``
        4. open every file whose name contains one of the selected
           timestamps combined with ``self.date``

        :param files: file names (not paths) found in the folder.
        :param path_to_files: folder the names are relative to.
        :return: ``(opened_files, selected_timestamps)``; two empty lists
            when no file starts at or before ``start_ts``.
        """
        pattern = r'(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2})'
        time_format = '%Y-%m-%dT%H-%M-%S'
        timestamps = sorted(
            datetime.strptime(ts, time_format).time()
            for ts in self._regex_map(pattern, files)
        )

        file_index = self.find_corresponding_file(start_ts, timestamps)
        if file_index < 0:  # no file found
            return [], []

        # The first file already covers ``start_ts``; the strict comparison
        # keeps it from being listed (and opened) a second time when its
        # timestamp is exactly ``start_ts``.
        res_timestamps = [timestamps[file_index]] + [
            ts for ts in timestamps if start_ts < ts <= end_ts
        ]

        data = []
        for ts in res_timestamps:
            stamp = datetime.combine(self.date, ts).strftime(time_format)
            for a_file in files:
                if stamp in a_file:
                    data.append(self._open_h5(os.path.join(path_to_files, a_file)))

        return data, res_timestamps

    def read_files(self, start_ts, end_ts):
        """Scan ``./data/clear/`` and ``./data/medal*/`` for the timeframe.

        Fills ``self._day_data`` and ``self.time_stamps`` with one entry per
        device:
            {'clear'  : [files],
             'medal-1': [files],
             'medal-2': [files],
                ...
            }
        """
        # CLEAR unit
        path_to_clear = './data/clear/'
        files_all = next(os.walk(path_to_clear))[2]
        self._day_data['clear'], self.time_stamps['clear'] = \
            self._read_files_from_folder(files_all, start_ts, end_ts, path_to_clear)

        # MEDAL units
        for folder in sorted(glob.glob('./data/medal*/')):
            name_match = re.search(r'(medal-\d+)', folder)
            if name_match is None:
                # The glob is wider than the naming scheme; skip such folders
                # instead of failing on ``None.group``.
                continue
            medal_name = name_match.group(1)
            files_all = next(os.walk(folder))[2]
            self._day_data[medal_name], self.time_stamps[medal_name] = \
                self._read_files_from_folder(files_all, start_ts, end_ts, folder)

    # center_inplace and calibrate_inplace read file-by-file, do the
    # corresponding operation and write the result back to the file.
    # Good thing: many more files can be processed that way without being
    #             bounded by memory, since each file is less than 3 GB.
    # Bad thing:  the stored values cannot be coerced from int to float.
    # They are not used in this notebook.
    def center_inplace(self, device, signal):
        """Add the stored ``removed_offset`` back to ``signal`` in every file.

        Does nothing but print a message when the device/signal pair is
        already registered as centered.  The 'clear' device is registered
        but left untouched (no offset for CLEAR).
        """
        key = device + signal
        if key in self._SD_centered:
            print("Signal '{}' for '{}' has been already centered.".format(signal, device))
            return

        data_list = self._day_data[device]  # fail before registering the key
        self._SD_centered.append(key)
        if device == 'clear':  # NO OFFSET FOR CLEAR DEVICE
            return
        for data_file in data_list:
            DC_offset = data_file[signal].attrs['removed_offset']
            data_file[signal][:] = data_file[signal][:] + DC_offset

    def calibrate_inplace(self, device, signal):
        """Multiply ``signal`` by its ``calibration_factor`` in every file.

        Does nothing but print a message when the device/signal pair is
        already registered as calibrated.
        """
        key = device + signal
        if key in self._SD_calibrated:
            print("Signal '{}' for '{}' has been already calibrated.".format(signal, device))
            return

        data_list = self._day_data[device]  # fail before registering the key
        self._SD_calibrated.append(key)
        for data_file in data_list:
            factor = data_file[signal].attrs['calibration_factor']
            data_file[signal][:] = (data_file[signal][:] * factor)

    def dict_read_signal(self, device, signal):
        """Read ``signal`` of ``device`` from every opened file.

        :return: ``{'signal': '<device>_<signal>', 'attributes': [...]}``
            with one ``{'DC_offset', 'calibration_factor', 'values'}`` dict
            per file; ``DC_offset`` is 0 for the 'clear' device.
        """
        def describe(h5_file):
            dataset = h5_file[signal]
            offset = 0 if device == 'clear' else dataset.attrs['removed_offset']
            return {'DC_offset': offset,
                    'calibration_factor': dataset.attrs['calibration_factor'],
                    'values': dataset[:]}

        return {'signal': device + '_' + signal,
                'attributes': [describe(h5_file) for h5_file in self._day_data[device]]}

    def center_and_calibrate(self, dict_signal):
        """Center and calibrate the output of ``dict_read_signal()``.

        Each file's values become ``(values + DC_offset) * calibration_factor``
        as little-endian float32, and the per-file results are concatenated
        in file order (previously every file overwrote the one before, so
        only the last file was returned).

        :return: ``{dict_signal['signal']: array}``, or ``{}`` when
            ``dict_signal['attributes']`` is empty.
        """
        parts = [
            ((data['values'] + data['DC_offset']) * data['calibration_factor']).astype("<f4")
            for data in dict_signal['attributes']
        ]
        if not parts:
            return {}
        return {dict_signal['signal']: np.concatenate(parts)}
##########################
def get_time_diff(t1,t2):
    """Return the absolute distance between two times of day in seconds.

    Only the hour, minute and second fields of ``t1`` and ``t2`` are read;
    the order of the arguments does not matter.
    """
    def seconds_since_midnight(moment):
        return moment.hour*60*60 + moment.minute*60 + moment.second

    return abs(seconds_since_midnight(t1) - seconds_since_midnight(t2))
##########################
blond = Blond(date(2016, 10, 5))

# Define a timeframe
start_ts = time(0, 50, 0)  # start_hours_minutes
end_ts = time(1, 10, 10)

# Read MEDAL and CLEAR data
blond.read_files(start_ts, end_ts)

# Checking if files have been retrieved
print("list the files:")
print(blond.list_files())

##########################
# Signals acquired by MEDAL
# NOTE(review): the message below says "Medal 1" while the file is taken
# from 'medal-2' -- confirm which one is intended.
print("Files from Medal 1")
medal_file = blond.list_files()['medal-2'][0]
print(type(medal_file["current1"]))
print(list(medal_file.keys()))
##########################
# Signals acquired by CLEAR
print("Keys from CLEAR")
clear_file = blond.list_files()['clear'][0]
print(list(clear_file.keys()))
##########################
device = 'medal-2'
signal = 'current1'

# Raw signal with offset and calibration factor attributes
dict_signal = blond.dict_read_signal(device, signal)
print(dict_signal)

# Calibrated signal
blond.center_and_calibrate(dict_signal)
##########################

def create_options():
    """Build the option lists for the time-selection controls.

    Reads the module-level ``start_ts`` and ``end_ts`` (time-of-day objects
    with ``hour``/``minute``/``second``) and returns a dict with the keys:

    - ``"duration"``: placeholder plus 0.1 .. 10.0 seconds in 0.1 steps
    - ``"seconds"`` / ``"minutes"``: placeholder plus 0..59
    - ``"hours"``: placeholder plus ``start_ts.hour .. end_ts.hour``
    - ``"critical"``: restricted minute/second lists for the first
      (``"start"``) and last (``"end"``) hour of the timeframe; a list stays
      empty when no restriction is defined for it.

    Every list starts with a placeholder entry whose value is the string
    ``"None"``; all other entries are ``{"label": i, "value": i}``.
    """
    def choices(placeholder, values):
        # One placeholder row followed by one row per selectable value.
        return [{"label": placeholder, "value": "None"}] + \
               [{"label": i, "value": i} for i in values]

    options = {
        "duration": [{"label": "Duration in Seconds", "value": "None"}] +
                    [{"label": str(i/10.0)+" sec", "value": i/10.0} for i in range(1, 101)],
        "seconds": choices("Start Second", range(60)),
        "minutes": choices("Start Minute", range(60)),
        "hours": choices("Start Hour", range(start_ts.hour, end_ts.hour+1)),
        "critical": {
            "start": {"minutes": [], "seconds": []},
            "end": {"minutes": [], "seconds": []},
        },
    }
    first_hour = options["critical"]["start"]
    last_hour = options["critical"]["end"]

    if start_ts.hour != end_ts.hour:
        first_hour["minutes"] = choices("Start Minute", range(start_ts.minute, 60))
        last_hour["minutes"] = choices("Start Minute", range(0, end_ts.minute+1))
        if start_ts.minute != end_ts.minute:
            first_hour["seconds"] = choices("Start Second", range(start_ts.second, 60))
            last_hour["seconds"] = choices("Start Second", range(0, end_ts.second+1))
        else:
            first_hour["seconds"] = choices("Start Second", range(start_ts.second, end_ts.second+1))
    else:
        # Same hour for start and end: no "end" restriction is needed because
        # the selected hour always equals the start hour.
        first_hour["minutes"] = choices("Start Minute", range(start_ts.minute, end_ts.minute+1))

    return options

# Option lists built from the current module-level start_ts / end_ts.
time_options = create_options()
##########################

### 1. Data Reading and Exploration

### Reading files

In [None]:
blond = Blond(date(2016, 10, 5))

# Define a timeframe
start_ts = time(6, 17, 50)  # start_hours_minutes
end_ts = time(6, 17, 55)

# Read MEDAL and CLEAR data
blond.read_files(start_ts, end_ts)

# Checking if files have been retrieved
blond.list_files()
    

### Exploration


In [None]:
"""signals acquisited by MEDAL"""
# First file opened for unit 'medal-1'; the cell output lists its keys.
medal_file = blond.list_files()['medal-1'][0]
list(medal_file.keys())

In [None]:
"""signals acquisited by CLEAR"""
# First file opened for the 'clear' unit; the cell output lists its keys.
clear_file = blond.list_files()['clear'][0]
list(clear_file.keys())

### Centering and calibrating

In [None]:
device = 'medal-2'
signal = 'current1'

# Raw signal with offset and calibration factor attributes
dict_signal = blond.dict_read_signal(device, signal)
print(dict_signal)

# Calibrated signal (shown as the cell output)
blond.center_and_calibrate(dict_signal)




## Part II. Data Visualisation

In [None]:
"""
Function to run dash app in jupyter:
https://community.plot.ly/t/can-i-run-dash-app-in-jupyter/5235
"""
from IPython import display
def show_app(app,  # type: dash.Dash
             port=8050,
             width=700,
             height=350,
             offline=True,
             style=True,
             **dash_flask_kwargs):
    """
    Run the application inside a Jupyter notebook and show an iframe with it.

    The iframe is emitted first, then the app is configured and finally
    ``app.run_server`` is called.

    :param app: the application to run
    :param port: port used both for the iframe URL and for ``run_server``
    :param width: width attribute of the iframe
    :param height: height attribute of the iframe
    :param offline: when true, set ``serve_locally`` on the app's css and
        scripts configuration
    :param style: when true, register the external stylesheets and scripts
        listed below with the app
    :param dash_flask_kwargs: forwarded unchanged to ``app.run_server``
    :return: whatever ``app.run_server`` returns
    """
    stylesheets = (
        "https://fonts.googleapis.com/css?family=Raleway:400,300,600",
        "https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css",
        "http://getbootstrap.com/dist/css/bootstrap.min.css",
    )
    javascripts = (
        "https://code.jquery.com/jquery-3.2.1.min.js",
        "https://cdn.rawgit.com/plotly/dash-app-stylesheets/a3401de132a6d0b652ba11548736b1d1e80aa10d/dash-goldman-sachs-report-js.js",
        "http://getbootstrap.com/dist/js/bootstrap.min.js",
    )

    address = 'http://localhost:' + str(port)  + '/notebooks'
    frame_markup = '<iframe src="{url}" width={width} height={height}></iframe>'.format(
        url=address, width=width, height=height)
    display.display_html(frame_markup, raw=True)

    if offline:
        for assets in (app.css, app.scripts):
            assets.config.serve_locally = True

    if style:
        for sheet_url in stylesheets:
            app.css.append_css({"external_url": sheet_url})
        for script_url in javascripts:
            app.scripts.append_script({"external_url": script_url})

    # debug needs to be false in Jupyter
    return app.run_server(debug=False, port=port, **dash_flask_kwargs)

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html

app = dash.Dash()

# Minimal example page: a heading, a line of text and one line chart.
example_figure = {
    'data': [
        {'x': [1, 2, 3], 'y': [4, 1, 5], 'type': 'line', 'name': 'SF'}
    ],
    'layout': {
        'title': 'Dash Data Visualization'
    }
}

app.layout = html.Div(children=[
    html.H1(children='Hello Dash'),

    html.Div(children='''
        Dash: A web application framework for Python.
    '''),

    dcc.Graph(id='example-graph', figure=example_figure)
])


show_app(app)

## Plot of appliances

In [None]:
import matplotlib.pyplot as plt

sps = 6400
phase = 1
requested_time = time(1, 10, 10)

# Locate the CLEAR file covering the requested time and the offset of that
# time inside the file (seconds since the file's timestamp times sps).
file_index = blond.find_corresponding_file(requested_time, blond.time_stamps['clear'])
time_diff = get_time_diff(requested_time, blond.time_stamps['clear'][file_index])
data_index_shift = time_diff * sps
temp_data = blond.list_files()['clear'][file_index]["current" + str(phase)][
    data_index_shift:data_index_shift + int(sps*0.1)]

# NOTE(review): the value is computed for each element of temp_data
# separately, not over a window of samples -- confirm this is intended.
curr_rms = [np.sqrt(np.mean(curr**2)) for curr in temp_data]

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
plot1, plot2, plot3 = ax

# (axes, title, x label, y label); None means "leave the y label unset".
panels = (
    (plot1, 'Current of AC Electrolux', 'Time [s]', 'Current [A]'),
    (plot2, 'Waveform Comparison of AC Electrolux', 'Time [ms]', None),
    (plot3, 'Power over Time of AC Electrolux', 'Time [s]', 'Power [W]'),
)
for panel, title, x_label, y_label in panels:
    panel.plot(temp_data, color='b', label='Current')
    panel.plot(curr_rms, color='r', label='Period RMS of Current')
    panel.set_title(title)
    panel.set_xlabel(x_label)
    if y_label is not None:
        panel.set_ylabel(y_label)
    panel.legend()

plt.show()