<span style="color:gray">
Fraunhofer Institute for Integrated Circuits IIS, Division Engineering of Adaptive Systems EAS<br>
Münchner Straße 16, 01187 Dresden, Germany
</span>

---

## ESB - Energy Saving by Blockchain

---

## Detection of Electric Vehicles and Photovoltaic Systems in Smart Meter Data

---

# 1: Data Preprocessing


This notebook imports the time series from the Pecan Street dataset and the EV charging events from the ACN dataset. Electric vehicle charging events are then added artificially to selected California households from the Pecan Street dataset.

### Requirements for running the notebook:

1. Pecan Street Dataset: Register at https://dataport.pecanstreet.org/ and download the following three csv files:
    * 15minute_data_newyork.csv
    * 15minute_data_california.csv
    * 15minute_data_austin.csv


2. ACN Dataset: Register at https://ev.caltech.edu/register to receive an access token for downloading the electric vehicle charging events


In [None]:
import base64
import json
import os
import random
import urllib.request
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from dateutil import parser

#### Import the smart meter datasets of the Pecan Street Dataset

In [None]:
path = ''

# All three regional exports are read with the same CSV dialect:
# header in the first row, comma-separated fields, '.' as decimal mark.
csv_options = dict(header=0, sep=',', decimal=".")

newyork = pd.read_csv(path + '15minute_data_newyork.csv', **csv_options)
california = pd.read_csv(path + '15minute_data_california.csv', **csv_options)
austin = pd.read_csv(path + '15minute_data_austin.csv', **csv_options)

#### Select the relevant columns, sort the data by 'dataid' and 'local_15min', and replace missing values with 0

In [None]:
# Columns kept for the analysis, and the keys that define the row order
# (household first, then timestamp).
selected_columns = ['dataid', 'local_15min', 'grid', 'car1', 'car2', 'solar', 'solar2']
sort_keys = ['dataid', 'local_15min']

newyork = (
    newyork[selected_columns]
    .sort_values(sort_keys)
    .fillna(0)  # missing readings are replaced by 0
    .reset_index(drop=True)
)
california = (
    california[selected_columns]
    .sort_values(sort_keys)
    .fillna(0)
    .reset_index(drop=True)
)
austin = (
    austin[selected_columns]
    .sort_values(sort_keys)
    .fillna(0)
    .reset_index(drop=True)
)

#### Convert the column 'local_15min' to datetime

In [None]:
# NOTE(review): the trailing '-%f' appears to consume the UTC-offset suffix of the
# raw strings as fractional seconds, so the parsed timestamps stay in local
# wall-clock time - confirm against the raw CSV files.
timestamp_format = '%Y-%m-%d %H:%M:%S-%f'

for frame in (newyork, california, austin):
    frame['local_15min'] = pd.to_datetime(frame['local_15min'], format=timestamp_format)

#### Group data into an hourly sampling rate

In [None]:
# Every measurement column is averaged within each (household, hour) bucket.
hourly_mean = {column: 'mean' for column in ('grid', 'car1', 'car2', 'solar', 'solar2')}

newyork = (
    newyork.set_index('local_15min')
    .groupby(['dataid', pd.Grouper(freq='1h')])
    .agg(hourly_mean)
    .reset_index()
)

california = (
    california.set_index('local_15min')
    .groupby(['dataid', pd.Grouper(freq='1h')])
    .agg(hourly_mean)
    .reset_index()
)

austin = (
    austin.set_index('local_15min')
    .groupby(['dataid', pd.Grouper(freq='1h')])
    .agg(hourly_mean)
    .reset_index()
)



#### Insert ACN Access Token

In [None]:
# Prefer the ACN_ACCESS_TOKEN environment variable so that the token never has to
# be saved inside the notebook. If the variable is not set, the fallback below is
# used; replace the '' with your token only for local, unshared runs.
access_token = os.environ.get('ACN_ACCESS_TOKEN', '')

#### Download the session IDs of charging sessions from the locations office001, jpl and caltech that have a charging time between 1.5 h and 3.5 h

In [None]:
def charging_cycle_session_ids(access_token, location, number_of_session , min_charging_time, max_charging_time, max_pages=59):
    """Collect ACN session IDs whose charging time lies strictly between two bounds.

    The session listing of ``location`` is requested page by page. For every
    session that has both a ``doneChargingTime`` and a ``connectionTime``, the
    charging time is computed as their difference; sessions with
    ``min_charging_time < charging time < max_charging_time`` are kept.

    Parameters
    ----------
    access_token : str
        ACN API token, sent as the user name of an HTTP Basic Authorization header.
    location : str
        Site name inserted into the request URL (e.g. 'caltech').
    number_of_session : int
        Stop as soon as this many matching sessions have been collected.
    min_charging_time, max_charging_time : float
        Exclusive lower / upper bound of the charging time in hours.
    max_pages : int, optional
        Highest page number that is requested (default 59, i.e. pages 1..59).

    Returns
    -------
    pandas.DataFrame
        One row per matching session with a single column 'session_id'. The
        column exists even when no session matched.
    """
    credentials = '%s:%s' % (access_token, '')
    authorization = 'Basic %s' % base64.b64encode(credentials.encode('ascii')).decode('ascii')

    longest = timedelta(hours=max_charging_time)
    shortest = timedelta(hours=min_charging_time)

    # Collect plain IDs and build the frame once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0).
    matching_ids = []

    for page in range(1, max_pages + 1):
        if len(matching_ids) >= number_of_session:
            break

        req = urllib.request.Request(f'https://ev.caltech.edu/api/v1/sessions/{location}?&page={page}')
        req.add_header('Authorization', authorization)
        # Close the HTTP response deterministically.
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read())

        # Iterate over whatever the page contains instead of assuming 25 items;
        # an empty page means there is nothing left to read.
        items = data.get('_items') or []
        if not items:
            break

        for item in items:
            done_charging = item.get('doneChargingTime')
            connected = item.get('connectionTime')
            if done_charging is None or connected is None:
                continue

            charging_time = parser.parse(done_charging) - parser.parse(connected)
            print(f"session_id: {item['_id']}, charging_time: {charging_time}")

            if longest > charging_time > shortest:
                matching_ids.append(item['_id'])

            if len(matching_ids) >= number_of_session:
                break

    return pd.DataFrame(matching_ids, columns=['session_id'])

In [None]:
# Shared selection criteria: at most 35 sessions per location with a charging
# time between 1.5 h and 3.5 h.
session_filter = dict(number_of_session=35, min_charging_time=1.5, max_charging_time=3.5)

acn_data_office001_session = charging_cycle_session_ids(access_token, 'office001', **session_filter)
acn_data_office001_session['location'] = 'office001'

acn_data_jpl_session = charging_cycle_session_ids(access_token, 'jpl', **session_filter)
acn_data_jpl_session['location'] = 'jpl'

acn_data_caltech_session = charging_cycle_session_ids(access_token, 'caltech', **session_filter)
acn_data_caltech_session['location'] = 'caltech'

# Stack the three per-location tables into one table with a fresh 0..n-1 index.
acn_data_session = pd.concat(
    [acn_data_office001_session, acn_data_jpl_session, acn_data_caltech_session],
    ignore_index=True,
)

#### Download the charging current time series for the selected session IDs

In [None]:
def concat_charging_cycles_by_ids(access_token, session_ids):
    """Download the charging-current series of each session and place them side by side.

    Parameters
    ----------
    access_token : str
        ACN API token, sent as the user name of an HTTP Basic Authorization header.
    session_ids : pandas.DataFrame
        One row per session with the columns 'session_id' and 'location'.

    Returns
    -------
    pandas.DataFrame
        One column per session (named after its session ID) holding the values of
        ``chargingCurrent['current']``; shorter series are padded with NaN.
        Sessions for which the API returns no item or no ``chargingCurrent`` are
        skipped. An empty DataFrame is returned when nothing could be collected.
    """
    credentials = '%s:%s' % (access_token, '')
    authorization = 'Basic %s' % base64.b64encode(credentials.encode('ascii')).decode('ascii')

    # Gather one Series per session and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    session_series = []

    for index, row in session_ids.iterrows():
        print(f"{index}: session_id: {row['session_id']}, location: {row['location']}")

        req = urllib.request.Request(f"https://ev.caltech.edu/api/v1/sessions/{row['location']}/ts/?where=_id=='{row['session_id']}'")
        req.add_header('Authorization', authorization)
        # Close the HTTP response deterministically.
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read())

        # A response without items would otherwise raise an IndexError.
        items = data.get('_items') or []
        if not items:
            continue

        charging_current = items[0].get('chargingCurrent')
        if charging_current is None:
            continue

        session_series.append(
            pd.Series(charging_current['current'], name=row['session_id'], dtype=float)
        )

    if not session_series:
        return pd.DataFrame()

    return pd.concat(session_series, axis=1)

In [None]:
# Fetch the charging series of every selected session; one column per session ID.
EV_charge_ts = concat_charging_cycles_by_ids(
    access_token=access_token,
    session_ids=acn_data_session,
)

#### Sorting out charging processes with a charging power greater than 22kW

In [None]:
def sort_out_column_by_value(data, greater_than):
    """Return the columns of ``data`` in which no value exceeds ``greater_than``.

    Every column is checked, including the last one. A column is dropped as soon
    as one of its values is strictly greater than the threshold; NaN values never
    count as exceeding it.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table with one series per column.
    greater_than : float
        Threshold; values equal to it are still accepted.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` restricted to the accepted columns, in their original order.
    """
    # Column-wise flag: True where at least one entry is above the threshold.
    exceeds = (data > greater_than).any(axis=0).to_numpy()

    # A positional boolean mask also works if column labels are not unique.
    return data.loc[:, ~exceeds].copy()

In [None]:
# Drop every charging series that contains a value above 22.
EV_charge_ts = sort_out_column_by_value(data=EV_charge_ts, greater_than=22)

#### Aggregate the sampling rate of the charging cycles from 4 seconds to 1 hour

In [None]:
# Rows are bucketed by integer division of their index: every run of 900
# consecutive rows is averaged into one row.
rows_per_bucket = 900
bucket_of_row = EV_charge_ts.index // rows_per_bucket

EV_charge_ts = (
    EV_charge_ts
    .groupby(bucket_of_row)
    .mean()
    .reset_index(drop=True)
)

#### Keep the first four hourly values of each charging cycle and replace NaN values with 0

In [None]:
# Only the first four rows are kept; remaining gaps are filled with 0.
first_rows = EV_charge_ts.head(4)
EV_charge_ts = first_rows.fillna(0)

#### Group the households of the california dataset

In [None]:
# One row per distinct 'dataid', in the sorted group order of groupby,
# with a fresh 0..n-1 index.
household_id = (
    california
    .groupby('dataid')['dataid']
    .first()
    .to_frame()
    .reset_index(drop=True)
)

#### Adding daily EV charging events for every second household in the 'california' dataset

In [None]:
# Position of every household in `household_id`; households at even positions
# receive synthetic EV charging. Built once instead of searching per row.
household_position = {
    dataid: position
    for position, dataid in enumerate(household_id['dataid'].tolist())
}

row_count = len(california)
row_dataid = california['dataid'].to_numpy()
row_hour = california['local_15min'].dt.hour.to_numpy()

# Tip: call random.seed(...) beforehand if the generated dataset must be reproducible.
for i in range(row_count):

    # One charging event is started per selected household at each row with hour 0.
    if household_position[row_dataid[i]] % 2 == 0 and row_hour[i] == 0:
        rand_EV = EV_charge_ts.columns[random.randrange(0, EV_charge_ts.shape[1])]
        start_time = random.randrange(0, 24)
        print(f"i: {i}, dataid {california['dataid'][i]}, rand_EV: {rand_EV}")

        for j in range(EV_charge_ts.shape[0]):
            target = i + j + start_time

            # Stop at the end of the table: assigning through `.at` to a missing
            # label would append a new row. Also stop at the end of this
            # household's rows so the event does not leak into the next household.
            if target >= row_count or row_dataid[target] != row_dataid[i]:
                break

            california.at[target, 'car1'] = EV_charge_ts.at[j, rand_EV]
            print(f"j: {j}, index: {target}")

#### Add the charging events to the 'grid' column

In [None]:
# Vectorised column addition; yields the same values as the former row-wise
# apply without calling a Python lambda for every row.
# NOTE(review): this cell is not idempotent - running it twice adds 'car1' to
# 'grid' twice.
california['grid'] = california['grid'] + california['car1']

#### Export datasets as CSV file

In [None]:
path = ''

# All three hourly tables are written with the same settings:
# comma-separated, UTF-8 encoded, without the index column.
export_options = dict(sep=',', encoding='utf-8', index=False)

newyork.to_csv(path + '1hour_data_newyork.csv', **export_options)
california.to_csv(path + '1hour_data_california.csv', **export_options)
austin.to_csv(path + '1hour_data_austin.csv', **export_options)