# Data cleaning of the 7 buildings dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import os

In [2]:
# Root folder of the dataset: the raw files are read from its 'original'
# sub-folder and the cleaned files are written directly into it.
# The trailing '/' is required because the other cells build file paths by
# plain string concatenation (path_data + 'name.csv').
path_data = 'C:/Users/Orson/Documents/Digital_Lab/Projet_GAC/Datasets/7buildings/'

## Creating each building's consumption file

In [3]:
def dateparse(timestamp):
    """Convert a 'month/day/year hour:minute' string into a naive datetime.

    Raises ValueError when the text does not follow that layout.
    """
    layout = '%m/%d/%Y %H:%M'
    parsed = datetime.datetime.strptime(timestamp, layout)
    return parsed

In [4]:
# Names of the raw files found in the 'original' sub-folder.
# os.listdir() returns entries in arbitrary order, while the following cells
# pick files by fixed position (0-4, 5-7 and 8-10), so the names are sorted to
# make those positions deterministic.
# NOTE(review): assumes alphabetical order matches the positions the later
# cells expect -- confirm against the actual file names.
list_filenames = sorted(os.listdir(path_data + 'original'))

In [5]:
# Buildings 1 to 5: each one comes from a single raw file (positions 0-4 of
# list_filenames) and is written to '<path_data>100<k>.csv' with k = 1..5.
for i in range(5):
    filename = list_filenames[i]
    df = pd.read_csv(path_data + 'original/' + filename, sep=',')
    # Drop fully-empty rows before parsing, so the date conversion only sees
    # real records (same order of operations as the cells for buildings 6-7).
    df = df.dropna(how='all')
    # Explicit conversion instead of read_csv's `date_parser`, which is
    # deprecated since pandas 2.0; the strict format is unchanged.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%m/%d/%Y %H:%M')
    # Rename to the common schema and keep only the three output columns.
    df = (
        df.rename(columns={'Timestamp': 'timestamp',
                           'OAT (F)': 'temperature',
                           'Power (kW)': 'active_power'})
          .reindex(columns=['timestamp', 'active_power', 'temperature'])
    )
    # Vectorised scaling (replaces a row-by-row apply).
    # NOTE(review): the source column is labelled kW -- confirm that dividing
    # by 1000 yields the unit expected downstream.
    df['active_power'] = df['active_power'] / 1000
    # Write next to the other cleaned files, reusing path_data instead of
    # repeating the absolute folder path.
    df.to_csv(path_data + '100{}.csv'.format(i + 1), sep=';', index=False)

In [6]:
# Load the three raw files at positions 5, 6 and 7 of list_filenames: drop the
# fully-empty rows of each and turn its 'Date' strings into datetimes.
# The resulting frames are collected in `dfs` for the next cell.
dfs = []
for position in (5, 6, 7):
    part = pd.read_csv(path_data + 'original/' + list_filenames[position], sep=',')
    part = part.dropna(how='all')
    part['Date'] = part['Date'].apply(dateparse)
    dfs.append(part)

In [7]:
# Stack the loaded parts, rename the raw columns to the common schema, keep
# only the three output columns and divide the power values by 1000.
column_names = {'Date': 'timestamp', 'OAT': 'temperature', 'Building 6 kW': 'active_power'}
df = pd.concat(dfs).rename(columns=column_names)
df = df.reindex(columns=['timestamp', 'active_power', 'temperature'])
df['active_power'] = df['active_power'] / 1000

In [9]:
# Write the assembled frame as a ';'-separated file, without the row index.
destination = path_data + '1006.csv'
df.to_csv(destination, index=False, sep=';')

In [10]:
# Load the three raw files at positions 8, 9 and 10 of list_filenames: drop the
# fully-empty rows of each and turn its 'Date' strings into datetimes.
# The resulting frames are collected in `dfs` for the next cell.
dfs = []
for position in (8, 9, 10):
    part = pd.read_csv(path_data + 'original/' + list_filenames[position], sep=',')
    part = part.dropna(how='all')
    part['Date'] = part['Date'].apply(dateparse)
    dfs.append(part)

In [11]:
# Stack the loaded parts, rename the raw columns to the common schema, keep
# only the three output columns and divide the power values by 1000.
column_names = {'Date': 'timestamp', 'OAT': 'temperature', 'Building 7 kW': 'active_power'}
df = pd.concat(dfs).rename(columns=column_names)
df = df.reindex(columns=['timestamp', 'active_power', 'temperature'])
df['active_power'] = df['active_power'] / 1000

In [12]:
# Write the assembled frame as a ';'-separated file, without the row index.
destination = path_data + '1007.csv'
df.to_csv(destination, index=False, sep=';')

## Creating the metadata file

In [52]:
# Metadata table: one row per bat_id from 1000 to 1007, indexed by bat_id.
# The rows are collected in a list and turned into a DataFrame in one step:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole table on every call.
# NOTE(review): the consumption files written by this notebook are 1001-1007,
# so the row for bat_id 1000 may be unintended -- kept to preserve the
# existing output; confirm.
metadata_columns = ['bat_id', 'is_house', 'time_step', 'lat', 'long', 'surface',
                    'monday_is_off',
                    'tuesday_is_off',
                    'wednesday_is_off',
                    'thursday_is_off',
                    'friday_is_off',
                    'saturday_is_off',
                    'sunday_is_off',
                    ]
# Every column after time_step is left empty (NaN) for all rows.
unknown_columns = metadata_columns[3:]

rows = []
for i in range(8):
    row = {'bat_id': 1000 + i,
           'is_house': False,
           # The first six ids get a time_step of 15, the last two 60.
           'time_step': 15 if i < 6 else 60,
           }
    row.update(dict.fromkeys(unknown_columns, np.nan))
    rows.append(row)

metadata1000 = pd.DataFrame(rows, columns=metadata_columns).set_index('bat_id')

In [54]:
# Save the metadata table into the dataset folder, using to_csv's default
# settings (the bat_id index is written as the first column).
metadata_file = path_data + 'metadata1000.csv'
metadata1000.to_csv(metadata_file)