# Cleaning code for Building 59 dataset

The dataset's .csv files are read from the directory assigned to `path` in the first code cell below.

In [1]:
# Basic imports
import csv
import numpy as np
import pandas as pd
from pandas import Series
import datetime
import time
import os
from fancyimpute import KNN, MatrixFactorization
import math

path = "../data" # Directory that holds the raw .csv files of the dataset


This is the code presented in the paper. We were not able to execute it because of RAM problems, so the cells further below try to transform it.

In [3]:


def clean_data_from_path(path):
    """Clean every CSV file found in ``path`` and write the results to ``path + "_postprocess"``.

    Each file is re-indexed onto a regular 15-minute grid between its first
    and last timestamp. Every column after the first one (the first column is
    assumed to be ``date``) is then processed as follows:

    * gaps of at most one hour, measured between the valid samples that
      surround the gap, are filled by linear interpolation;
    * everything that is still missing is imputed with fancyimpute
      ``KNN(k=3)``;
    * the gaps that ``unfill_large_gaps`` re-opens in the KNN result are
      imputed again with fancyimpute ``MatrixFactorization``.

    Two files are written per input file: ``data_<filename>`` with the cleaned
    values and ``parameter_<filename>`` with per-column statistics (missing
    rate, share of negative values, number of gaps, number of gaps longer
    than one day, longest gap in minutes).

    ``unfill_large_gaps`` is defined in a later cell of this notebook and has
    to be defined before this function is called.

    Parameters
    ----------
    path : str
        Directory containing the raw CSV files. Every file must have a
        ``date`` column.

    Returns
    -------
    None
    """
    path_postprocess = path + "_postprocess"
    # The output directory is not guaranteed to exist yet.
    os.makedirs(path_postprocess, exist_ok=True)

    for filename in os.listdir(path):
        # Skip sub-directories and stray files that read_csv cannot parse.
        if not filename.lower().endswith('.csv'):
            continue
        print(path+'/'+filename)
        df = pd.read_csv(os.path.join(path, filename))
        df['date'] = pd.to_datetime(df['date'])
        helper = pd.DataFrame({'date': pd.date_range(df['date'].min(), df['date'].max(), freq='15min')})
        # Reset the index so that positions and labels agree after sorting.
        df = pd.merge(df, helper, on='date', how='outer').sort_values('date').reset_index(drop=True)

        n_rows = len(df.index)
        timestamps = df.iloc[:, 0]

        # Per-column statistics; the 'date' entry is kept so the report has
        # one row per column of the file.
        count_out = {'date': 0}     # share of negative (outlier) values
        count_gap = {'date': 0}     # number of gaps
        count_outgap = {'date': 0}  # number of gaps longer than one day
        gap_max = {'date': 0}       # longest gap, in minutes

        for i in range(1, len(df.columns)):
            column = df.columns[i]
            out_gapcount = 0

            # As in the original code, a missing last sample is replaced by 0
            # so that every gap is closed by a valid value.
            if pd.isnull(df.iloc[n_rows-1, i]):
                df.iloc[n_rows-1, i] = 0

            # +1 marks the first missing row of a gap, -1 the first valid row
            # after it. The leading 0 treats the row before the file as valid,
            # so a gap at the very start is detected without wrapping around.
            is_missing = df.iloc[:, i].isnull().to_numpy()
            edges = np.diff(np.concatenate(([0], is_missing.astype(np.int8))))
            gap_starts = np.flatnonzero(edges == 1).tolist()
            gap_stops = np.flatnonzero(edges == -1).tolist()
            k = len(gap_starts)

            gap_seconds = []
            for first_missing, first_valid in zip(gap_starts, gap_stops):
                # Last valid row before the gap (row 0 for a leading gap).
                anchor = max(first_missing - 1, 0)
                seconds = (timestamps.iloc[first_valid] - timestamps.iloc[anchor]).total_seconds()
                gap_seconds.append(seconds)
                if seconds <= 3600:  # linear interpolation if the gap is at most one hour
                    df.iloc[anchor:first_valid+1, i] = df.iloc[anchor:first_valid+1, i].interpolate(method='linear')
                elif seconds > 3600*24:
                    out_gapcount = out_gapcount + 1
            if k != 0:
                gap_max[column] = max(gap_seconds)/60

            count_out[column] = np.sum(df.iloc[:, i] < 0)/len(df)
            count_gap[column] = k
            count_outgap[column] = out_gapcount

        # KNN fills everything that linear interpolation left missing.
        values = df.iloc[:, 1:]
        df_filled = pd.DataFrame(KNN(k=3).fit_transform(values.to_numpy()),
                                 index=values.index, columns=values.columns)
        # Re-open the large gaps so that they are imputed by matrix
        # factorization instead of KNN.
        unfill_large_gaps(df_filled, df)
        # Test for values actually left missing instead of the gap counter of
        # the last column only, so no NaN can reach the output.
        if df_filled.isnull().to_numpy().any():
            df_filled = pd.DataFrame(MatrixFactorization().fit_transform(df_filled.to_numpy()),
                                     index=values.index, columns=values.columns)
        df.iloc[:, 1:] = df_filled.to_numpy()

        # NOTE(review): this rate is computed after imputation, so it only
        # reports values that are still missing at this point - confirm that
        # this is the intended meaning of 'missingrate'.
        cols_not_null = (len(df)-df.count(axis=0))/len(df)
        data = pd.DataFrame({'missingrate': cols_not_null,
                             'outrate': pd.Series(count_out),
                             'count_outgap': pd.Series(count_outgap),
                             'count_gap': pd.Series(count_gap),
                             'maxgap': pd.Series(gap_max)})
        data.to_csv(os.path.join(path_postprocess, 'parameter_'+filename), sep=',', header=True, index=True)
        df.to_csv(os.path.join(path_postprocess, 'data_'+filename), sep=',', header=True, index=False)


We have a problem with ele.csv (energy use), because it doesn't follow the same csv format as the other files: it includes an unnamed column without data. We solve this problem with the following code (run only once)

In [1]:
#datos = pd.read_csv(path+ '/ele.csv')
#datos.drop('Unnamed: 6', axis=1, inplace=True)
#datos = datos.set_index('date')
#datos.to_csv(path+ '/ele.csv')

### Study of null values by column
For a given file, we report the number and percentage of missing values in each of its columns.

In [2]:
def summary(path, filename):
    """Print the missing-value report of one CSV file.

    Reads ``path + '/' + filename``, uses its ``date`` column as the index
    and prints a ``SUMMARY OF <filename>`` header followed by the report
    produced by ``summary2`` for the remaining columns.

    Parameters
    ----------
    path : str
        Directory containing the file.
    filename : str
        Name of the CSV file; it must contain a ``date`` column.
    """
    print("SUMMARY OF " + filename)
    dataframe = pd.read_csv(path+'/'+filename).set_index('date')
    # The report itself is identical to summary2, so reuse it.
    summary2(dataframe)


def summary2(dataframe):
    """Print, for every column, its name followed by its missing-value count and share.

    Each column produces two lines: the column name, then
    ``> <position>, Missing: <count> (<percentage>%)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect. All of its columns are reported.
    """
    n_rows = dataframe.shape[0]
    for i in range(dataframe.shape[1]):
        print(dataframe.columns[i])
        n_miss = dataframe.iloc[:,i].isnull().sum()
        # A frame without rows has nothing missing: report 0% rather than
        # dividing by zero.
        perc = n_miss / n_rows * 100 if n_rows else 0.0
        print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))

In [3]:
# Report the missing values per column of the raw energy-use file.
summary(path, 'ele.csv')

SUMMARY OF ele.csv
mels_S
> 0, Missing: 38 (0.0%)
lig_S
> 1, Missing: 34 (0.0%)
mels_N
> 2, Missing: 24 (0.0%)
hvac_N
> 3, Missing: 1542 (1.5%)
hvac_S
> 4, Missing: 1542 (1.5%)


## Interpolation depending on the size of the gap:

- If it's smaller than 1h, we use linear interpolation
- If it's bigger than 1 day, we use KNN with n=3


Interpolation with KNN: 

https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/

In [4]:
from sklearn.impute import KNNImputer
# Names of the entries found in the raw-data directory.
files = os.listdir(path)
# Sibling directory (same name plus "_postprocess") used for the cleaned output.
path_postprocess = path + "_postprocess"
# The cleaning code inserts the rows (timestamps) that are apparently missing
# from each file; to do so it builds a regular date range with a given frequency.

# Frequency string per file name - presumably the sampling interval of each file;
# verify against the raw data.
freqs = {'zone_co2.csv':'1min', 'ele.csv': '15min', 'zone_temp_sp_c.csv':'5min', 'occ.csv':'1min', 'zone_temp_exterior.csv':'1min', 'zone_temp_sp_h.csv':'5min', 'site_weather.csv':'15min', 'wifi.csv': '10min', 'zone_temp_interior.csv':'10min'}

In [5]:
def unfill_large_gaps(df_filled, df):
    """Blank out, in ``df_filled``, the values that fall inside large gaps of ``df``.

    A gap is a run of consecutive missing values in one column of ``df``. Its
    duration is the time between the valid samples that surround it; for a
    gap that touches the start or the end of the frame, the first or last
    timestamp is used instead. Gaps lasting one day or more are set back to
    missing in ``df_filled``.

    Only the rows that are missing in ``df`` are blanked; the valid samples
    on both sides of a gap are left untouched.

    Parameters
    ----------
    df_filled : pandas.DataFrame
        Imputed values, modified in place. Column ``i - 1`` of ``df_filled``
        corresponds to column ``i`` of ``df`` and rows correspond by position.
    df : pandas.DataFrame
        Frame with the gaps still present. Its first column holds the
        timestamps; every other column is inspected for gaps.

    Returns
    -------
    None
    """
    n_rows = len(df.index)
    if n_rows == 0:
        return
    one_day = 3600*24
    # Accepts both datetime values and their string representation.
    timestamps = pd.to_datetime(df.iloc[:, 0])

    for i in range(1, len(df.columns)):
        is_missing = df.iloc[:, i].isnull().to_numpy()
        # Pad both ends with "valid" so that gaps touching the first or last
        # row are opened and closed too: +1 marks the first missing row of a
        # gap, -1 the position right after its last missing row.
        edges = np.diff(np.concatenate(([0], is_missing.astype(np.int8), [0])))
        gap_starts = np.flatnonzero(edges == 1).tolist()
        gap_stops = np.flatnonzero(edges == -1).tolist()

        for first_missing, stop in zip(gap_starts, gap_stops):
            # Valid samples around the gap, clamped to the frame boundaries.
            before = max(first_missing - 1, 0)
            after = min(stop, n_rows - 1)
            gap = (timestamps.iloc[after] - timestamps.iloc[before]).total_seconds()
            if gap >= one_day:
                df_filled.iloc[first_missing:stop, i-1] = None
                

In [6]:

def get_csv(path, filename, freq):
    """Clean one raw CSV file, write the result and return per-column gap statistics.

    The file is re-indexed onto a regular grid with step ``freq`` between its
    first and last timestamp. Every column after the first one (the first
    column is assumed to be ``date``) is then processed as follows:

    * gaps of at most one hour, measured between the valid samples that
      surround the gap, are filled by linear interpolation;
    * everything that is still missing is imputed with ``KNNImputer``
      (3 neighbours, distance weights).

    The cleaned frame is written to
    ``<path>_postprocess/<filename without extension>_postprocess.csv``.

    Parameters
    ----------
    path : str
        Directory containing the raw file.
    filename : str
        Name of the CSV file; it must contain a ``date`` column.
    freq : str
        pandas frequency string used to build the regular time grid.

    Returns
    -------
    pandas.DataFrame
        One row per column of the file, with the columns ``missingrate``,
        ``outrate`` (share of negative values), ``count_outgap`` (gaps longer
        than one day), ``count_gap`` (number of gaps) and ``maxgap`` (longest
        gap in minutes, missing when the column has no gap).
    """
    print(path+'/'+filename)
    out_dir = path+"_postprocess"
    # The output directory is not guaranteed to exist yet.
    os.makedirs(out_dir, exist_ok=True)
    path_postprocess = out_dir+'/'+filename[:-4]+ "_postprocess.csv"

    df = pd.read_csv(path+'/'+filename)
    df['date'] = pd.to_datetime(df['date'])
    helper = pd.DataFrame({'date': pd.date_range(df['date'].min(), df['date'].max(), freq=freq)})
    # Reset the index so that positions and labels agree after sorting; the
    # imputed frame built below uses the same index.
    df = pd.merge(df, helper, on='date', how='outer').sort_values('date').reset_index(drop=True)

    # Per-column statistics; the 'date' entry is kept so the report has one
    # row per column of the file.
    count_out = {'date': 0}     # share of negative (outlier) values
    count_gap = {'date': 0}     # number of gaps
    count_outgap = {'date': 0}  # number of gaps longer than one day
    gap_max = {'date': 0}       # longest gap, in minutes
    summary2(df)

    n_rows = len(df.index)
    timestamps = df.iloc[:, 0]
    # Count the gaps and interpolate according to the gap size.
    for i in range(1, len(df.columns)):
        print("Estamos en: ", i)
        column = df.columns[i]
        # Reset per column so that count_outgap is not cumulative.
        out_gapcount = 0

        # As in the original code, a missing last sample is replaced by 0 so
        # that every gap is closed by a valid value.
        if pd.isnull(df.iloc[n_rows-1, i]):
            df.iloc[n_rows-1, i] = 0

        # +1 marks the first missing row of a gap, -1 the first valid row
        # after it. The leading 0 treats the row before the file as valid, so
        # a gap at the very start is detected without wrapping around.
        is_missing = df.iloc[:, i].isnull().to_numpy()
        edges = np.diff(np.concatenate(([0], is_missing.astype(np.int8))))
        gap_starts = np.flatnonzero(edges == 1).tolist()
        gap_stops = np.flatnonzero(edges == -1).tolist()
        k = len(gap_starts)

        gap_seconds = []
        for first_missing, first_valid in zip(gap_starts, gap_stops):
            # Last valid row before the gap (row 0 for a leading gap).
            anchor = max(first_missing - 1, 0)
            seconds = (timestamps.iloc[first_valid] - timestamps.iloc[anchor]).total_seconds()
            gap_seconds.append(seconds)
            if seconds <= 3600:  # linear interpolation if the gap is at most one hour
                print("Interpolation linear")
                df.iloc[anchor:first_valid+1, i] = df.iloc[anchor:first_valid+1, i].interpolate(method='linear')
            elif seconds > 3600*24:
                out_gapcount = out_gapcount+1
        if k != 0:
            gap_max[column] = max(gap_seconds)/60

        count_out[column] = np.sum(df.iloc[:, i] < 0)/len(df)
        count_gap[column] = k
        count_outgap[column] = out_gapcount

    # Impute everything that is still missing with KNN.
    values = df.iloc[:, 1:]
    imputer = KNNImputer(n_neighbors=3, weights='distance', metric='nan_euclidean')
    imputer.fit(values)
    df_interpolated = pd.DataFrame(imputer.transform(values), columns=values.columns, index=values.index)

    # Export into csv
    print("New summary(final): ")
    summary2(df_interpolated)
    # Assign by position so that no index alignment is involved.
    df.iloc[:, 1:] = df_interpolated.to_numpy()
    df.to_csv(path_postprocess, sep=',', header=True, index=False)

    # Final set of information.
    # NOTE(review): this rate is computed after imputation, so it only reports
    # values that are still missing at this point - confirm that this is the
    # intended meaning of 'missingrate'.
    cols_not_null = (len(df)-df.count(axis=0))/len(df)
    data = pd.DataFrame({'missingrate': cols_not_null,
                         'outrate': pd.Series(count_out),
                         'count_outgap': pd.Series(count_outgap),
                         'count_gap': pd.Series(count_gap),
                         'maxgap': pd.Series(gap_max)})
    return data

In [7]:
# Names of the entries found in the raw-data directory.
files = os.listdir(path)
# Sibling directory (same name plus "_postprocess") used for the cleaned output.
path_postprocess = path + "_postprocess"

# Clean the weather file on a 15-minute grid; the returned statistics table is
# displayed as the cell output.
get_csv(path, 'site_weather.csv', '15min')


../data/site_weather.csv
date
> 0, Missing: 0 (0.0%)
air_temp_set_1
> 1, Missing: 0 (0.0%)
air_temp_set_2
> 2, Missing: 0 (0.0%)
dew_point_temperature_set_1d
> 3, Missing: 0 (0.0%)
relative_humidity_set_1
> 4, Missing: 0 (0.0%)
solar_radiation_set_1
> 5, Missing: 0 (0.0%)
Estamos en:  1
Estamos en:  2
Estamos en:  3
Estamos en:  4
Estamos en:  5
New summary(final): 
air_temp_set_1
> 0, Missing: 0 (0.0%)
air_temp_set_2
> 1, Missing: 0 (0.0%)
dew_point_temperature_set_1d
> 2, Missing: 0 (0.0%)
relative_humidity_set_1
> 3, Missing: 0 (0.0%)
solar_radiation_set_1
> 4, Missing: 0 (0.0%)


Unnamed: 0,missingrate,outrate,count_outgap,count_gap,maxgap
air_temp_set_1,0.0,0.0,0,0,
air_temp_set_2,0.0,0.0,0,0,
date,0.0,0.0,0,0,0.0
dew_point_temperature_set_1d,0.0,0.100326,0,0,
relative_humidity_set_1,0.0,0.0,0,0,
solar_radiation_set_1,0.0,0.0,0,0,


In [21]:
# Load the raw energy-use file.
df = pd.read_csv('../data/ele.csv')

# Parse the timestamps, then build a regular 15-minute grid between the first
# and the last one.
df['date'] = pd.to_datetime(df['date']) 
helper=pd.DataFrame({'date': pd.date_range(df['date'].min(), df['date'].max(), freq='15min')})
# The outer merge adds a row for every grid timestamp that is absent from the
# file; `prueba` ("test" in Spanish) holds the result, sorted by date.
prueba = pd.merge(df, helper, on='date', how='outer').sort_values('date')
prueba.head()

Unnamed: 0,date,mels_S,lig_S,mels_N,hvac_N,hvac_S
0,2018-01-01 01:00:00,1.2,0.2,7.5,37.400002,19.5
1,2018-01-01 01:15:00,1.3,0.2,6.8,37.5,19.889999
2,2018-01-01 01:30:00,1.1,0.2,7.4,38.0,19.299999
3,2018-01-01 01:45:00,1.2,0.2,7.7,37.200001,18.889999
4,2018-01-01 02:00:00,1.1,0.2,7.3,37.400002,24.700001
