# Cleaning code for Building 59 dataset

The .csv files of the dataset are located at the path declared right below.

In [3]:
# Basic imports
import csv
import numpy as np
import pandas as pd
from pandas import Series
import datetime
import time
import os
from fancyimpute import KNN, MatrixFactorization
import math

path = "../data" # Directory containing the raw .csv files; passed to the functions below


This is the code presented in the paper. We are not able to execute it because it runs out of RAM, so we will try to adapt it.

In [3]:


def _find_gaps(missing, times):
    """Locate the runs of missing samples in one column.

    Args:
        missing: boolean sequence, True where the sample is missing.
        times: timestamps aligned position by position with *missing*.

    Returns:
        A list of ``(start, end, seconds)`` tuples, one per run that is
        closed by a valid sample. ``start`` is the position of the last valid
        sample before the run (or 0 when the run begins on the first row),
        ``end`` is the position of the first valid sample after it, and
        ``seconds`` is the time elapsed between those two positions.
    """
    gaps = []
    # A run that begins on the very first row has no valid sample before it;
    # anchor it on row 0 instead of wrapping around to the last row.
    start = 0
    for j in range(1, len(missing)):
        if missing[j] and not missing[j - 1]:
            start = j - 1
        elif missing[j - 1] and not missing[j]:
            gaps.append((start, j, (times[j] - times[start]).total_seconds()))
    return gaps


def clean_data_from_path(path):
    """Clean every file found in *path* and write the results to ``path + "_postprocess"``.

    For each file the data is re-indexed on a regular 15-minute grid, then each
    column after the first one (the first column is expected to be ``date``)
    is processed as follows:

    * gaps of at most one hour are filled by linear interpolation;
    * the remaining missing values are imputed with ``KNN(k=3)``;
    * gaps of one day or more are blanked again and re-imputed with
      ``MatrixFactorization``.

    Two files are written per input file: ``parameter_<filename>`` with the
    per-column statistics (missing rate, rate of negative values, number of
    gaps, number of gaps longer than one day, longest gap in minutes) and
    ``data_<filename>`` with the cleaned data.

    Args:
        path: directory containing the raw csv files. Every entry of the
            directory is read with ``pd.read_csv``.

    Returns:
        None. The results are written to disk.
    """
    files = os.listdir(path)
    path_postprocess = path + "_postprocess"
    os.makedirs(path_postprocess, exist_ok=True)

    for filename in files:
        print(path+'/'+filename)
        # Read the file and align it on a regular 15-minute time grid.
        row = pd.read_csv(path+'/'+filename)
        row['date'] = pd.to_datetime(row['date'])
        helper = pd.DataFrame({'date': pd.date_range(row['date'].min(), row['date'].max(), freq='15min')})
        row = pd.merge(row, helper, on='date', how='outer').sort_values('date')

        times = row.iloc[:, 0].tolist()
        last = len(row.index) - 1

        # Per-column statistics; the 'date' entry mirrors the layout of the
        # original report, where the time column always gets a 0.
        count_out = {'date': 0}     # rate of negative (outlier) values
        count_gap = {'date': 0}     # number of gaps
        count_outgap = {'date': 0}  # number of gaps longer than one day
        gap_max = {'date': 0}       # longest gap, in minutes

        # First pass: gather the gap statistics and fill the short gaps.
        for i in range(1, len(row.columns)):
            column = row.columns[i]
            # A missing last sample is replaced by 0, presumably so that
            # every run of missing values is closed by a valid sample.
            if pd.isnull(row.iloc[last, i]):
                row.iloc[last, i] = 0

            gaps = _find_gaps(row.iloc[:, i].isnull().to_numpy(), times)
            out_gapcount = 0
            for start, end, seconds in gaps:
                if seconds <= 3600:  # linear interpolation if the gap is at most one hour
                    row.iloc[start:end+1, i] = row.iloc[start:end+1, i].interpolate(method='linear')
                elif seconds > 3600*24:
                    out_gapcount += 1
            if gaps:
                gap_max[column] = max(seconds for _, _, seconds in gaps) / 60

            count_out[column] = np.sum(row.iloc[:, i] < 0) / len(row)
            count_gap[column] = len(gaps)
            count_outgap[column] = out_gapcount

        # Apply the KNN algorithm to the gaps that are longer than one hour.
        row_interpolation = np.array(row.iloc[:, 1:], dtype=float)
        row_interpolation = KNN(k=3).fit_transform(row_interpolation)

        # Second pass: blank the KNN estimates of the gaps of one day or more
        # so that they are re-imputed by matrix factorization.
        needs_matrix_factorization = False
        for i in range(1, len(row.columns)):
            for start, end, seconds in _find_gaps(row.iloc[:, i].isnull().to_numpy(), times):
                if seconds >= 3600*24:
                    # NOTE(review): as in the original code the slice includes
                    # the two valid samples bracketing the gap, so those
                    # observations are re-imputed too -- confirm this is intended.
                    row_interpolation[start:end+1, i-1] = np.nan
                    needs_matrix_factorization = True

        # Run MF whenever any column was blanked; deciding on the gap count
        # of the last column only could leave NaN values in the output.
        if needs_matrix_factorization:
            row_interpolation = MatrixFactorization().fit_transform(row_interpolation)

        row.iloc[:, 1:] = row_interpolation
        cols_not_null = (len(row) - row.count(axis=0)) / len(row)
        data = pd.DataFrame({
            'missingrate': cols_not_null,
            'outrate': Series(count_out),
            'count_outgap': Series(count_outgap),
            'count_gap': Series(count_gap),
            'maxgap': Series(gap_max),
        })
        data.to_csv(os.path.join(path_postprocess, 'parameter_'+filename), sep=',', header=True, index=True)
        row.to_csv(os.path.join(path_postprocess, 'data_'+filename), sep=',', header=True, index=False)


We have a problem with ele.csv (energy use) because it does not follow the same csv format as the other files: it includes an unnamed column without data. We solve this problem with the following code (run it only once).

In [1]:
#datos = pd.read_csv(path+ '/ele.csv')
#datos.drop('Unnamed: 6', axis=1, inplace=True)
#datos = datos.set_index('date')
#datos.to_csv(path+ '/ele.csv')

### Study of null values by column
For a file, we will study the percentage of missing values it includes.

In [4]:
def summary(path, filename):
    """Print the number and percentage of missing values of each column of a csv file.

    The file ``path/filename`` is loaded and indexed by its ``date`` column,
    so only the remaining columns are reported.
    """
    print("SUMMARY OF " + filename)
    table = pd.read_csv(path+'/'+filename).set_index('date')
    total_rows = table.shape[0]
    for position, column_name in enumerate(table.columns):
        missing = table.iloc[:, position].isnull().sum()
        share = missing / total_rows * 100
        print(column_name)
        print('> %d, Missing: %d (%.1f%%)' % (position, missing, share))

def summary2(dataframe):
    """Print the number and percentage of missing values of each column of *dataframe*."""
    total_rows = dataframe.shape[0]
    for position, column_name in enumerate(dataframe.columns):
        missing = dataframe.iloc[:, position].isnull().sum()
        share = missing / total_rows * 100
        print(column_name)
        print('> %d, Missing: %d (%.1f%%)' % (position, missing, share))

In [5]:
# Report the missing values of each column of the raw ele.csv file
summary(path, 'ele.csv')

SUMMARY OF ele.csv
mels_S
> 0, Missing: 38 (0.0%)
lig_S
> 1, Missing: 34 (0.0%)
mels_N
> 2, Missing: 24 (0.0%)
hvac_N
> 3, Missing: 1542 (1.5%)
hvac_S
> 4, Missing: 1542 (1.5%)


Interpolation with KNN: 

https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/

In [8]:

def get_csv(path, filename):
    """Fill the short gaps of one csv file by linear interpolation.

    The file ``path/filename`` is re-indexed on a regular 15-minute grid, then
    every gap of at most one hour found in a column after the first one (the
    first column is expected to be ``date``) is filled by linear
    interpolation. Longer gaps are left untouched. A missing-value summary is
    printed before and after the interpolation, and the result is written to
    ``path/<filename without extension>_postprocess.csv``.

    Args:
        path: directory containing the raw csv file.
        filename: name of the csv file inside *path*.

    Returns:
        None. The result is written to disk.
    """
    print(path+'/'+filename)
    path_postprocess = path+'/'+os.path.splitext(filename)[0]+ "_postprocess.csv"

    # Read the file and align it on a regular 15-minute time grid.
    row = pd.read_csv(path+'/'+filename)
    row['date'] = pd.to_datetime(row['date'])
    helper = pd.DataFrame({'date': pd.date_range(row['date'].min(), row['date'].max(), freq='15min')})
    row = pd.merge(row, helper, on='date', how='outer').sort_values('date')
    summary2(row)

    times = row.iloc[:, 0].tolist()
    last = len(row.index) - 1

    for i in range(1, len(row.columns)):
        print("Estamos en: ", i)
        # A missing last sample is replaced by 0, presumably so that every
        # run of missing values is closed by a valid sample.
        if pd.isnull(row.iloc[last, i]):
            row.iloc[last, i] = 0

        # Scan a boolean mask once instead of reading the frame cell by cell.
        missing = row.iloc[:, i].isnull().to_numpy()
        gaps = []  # (position of last valid sample before, first valid sample after)
        # A run that begins on the very first row has no valid sample before
        # it; anchor it on row 0 instead of wrapping around to the last row.
        start = 0
        for j in range(1, len(missing)):
            if missing[j] and not missing[j - 1]:
                start = j - 1
            elif missing[j - 1] and not missing[j]:
                gaps.append((start, j))

        # The gaps are collected before any of them is filled, so filling one
        # gap cannot change the detection of the next one.
        for start, end in gaps:
            seconds = (times[end] - times[start]).total_seconds()
            if seconds <= 3600:  # linear interpolation if the gap is at most one hour
                print("Interpolation linear")
                row.iloc[start:end+1, i] = row.iloc[start:end+1, i].interpolate(method='linear')

    print("New summary: ")
    summary2(row)
    row.to_csv(path_postprocess, sep=',', header=True, index=False)

        

In [9]:
# Run the linear-interpolation step on ele.csv
filename = 'ele.csv'
get_csv(path, filename)

../data/ele.csv
date
> 0, Missing: 0 (0.0%)
mels_S
> 1, Missing: 3728 (3.5%)
lig_S
> 2, Missing: 3724 (3.5%)
mels_N
> 3, Missing: 3714 (3.5%)
hvac_N
> 4, Missing: 5232 (4.9%)
hvac_S
> 5, Missing: 5232 (4.9%)
Estamos en:  1
Interpolation linear
Interpolation linear
Estamos en:  2
Interpolation linear
Interpolation linear
Estamos en:  3
Interpolation linear
Interpolation linear
Estamos en:  4
Interpolation linear
Interpolation linear
Estamos en:  5
Interpolation linear
Interpolation linear
New summary: 
date
> 0, Missing: 0 (0.0%)
mels_S
> 1, Missing: 3726 (3.5%)
lig_S
> 2, Missing: 3722 (3.5%)
mels_N
> 3, Missing: 3712 (3.5%)
hvac_N
> 4, Missing: 5230 (4.9%)
hvac_S
> 5, Missing: 5230 (4.9%)


In [None]:
# NOTE: this cell holds the part of the paper's cleaning routine that was left
# out of get_csv (per-column statistics, KNN / MatrixFactorization imputation
# and the csv export). It is not runnable on its own: it begins in the middle
# of the per-column loop and relies on names such as row, i, k, gap, count_out
# and path_postprocess being defined beforehand.
outcount=np.sum(row.iloc[:, i]<0)/len(row)
        count_out=count_out.append(Series(outcount, index=[row.columns[i]]))
        count_gap= count_gap.append(Series(k, index=[row.columns[i]]))
        count_outgap = count_outgap.append(Series(out_gapcount,index=[row.columns[i]]))
        row_interpolation=np.array(row.iloc[:,1:])
        
        
        
    row_interpolation= KNN(k=3).fit_transform(row_interpolation) #Apply knn algorithm if the gap is larger than one hour
    for i in range(1, len(row.columns)):
        k=0
        start_index = {}
        starttime = {}
        end_index = {}
        endtime = {}
        for j in range(0, len(row.index)):
            if pd.isnull(row.iloc[j,i]) and pd.isnull(row.iloc[j-1,i]) == False:
                starttime[k]=row.iloc[j-1,0]
                start_index[k]=j-1
            elif pd.isnull(row.iloc[j-1,i]) and pd.isnull(row.iloc[j,i]) == False:
                endtime[k]=row.iloc[j,0]
                end_index[k]=j
                k=k+1
        for m in range(k):
            starttime_struct=datetime.datetime.strptime(str(starttime[m]), '%Y-%m-%d %H:%M:%S')
            endtime_struct = datetime.datetime.strptime(str(endtime[m]), '%Y-%m-%d %H:%M:%S')
            gap[m]=(endtime_struct-starttime_struct).total_seconds()
            if  gap[m]>= 3600*24:
                row_interpolation[start_index[m]:end_index[m]+1,i-1]=None
    if out_gapcount !=0:
        row_interpolation= MatrixFactorization().fit_transform(row_interpolation) #Apply MF algorithm if the gap is larger than one day         
    row.iloc[:,1:]=row_interpolation
    cols_not_null = (len(row)-row.count(axis=0))/len(row)
    data=pd.DataFrame({'missingrate':cols_not_null,'outrate':count_out,'count_outgap':count_outgap,'count_gap':count_gap,'maxgap':gap_max})
    data.to_csv(path_postprocess+'\\'+'parameter_'+filename, sep=',', header=True, index=True)
    row.to_csv(path_postprocess+'\\'+'data_'+filename, sep=',', header=True, index=False)

