# Cleaning code for Building 59 dataset

The .csv files from the dataset are located on the path declared right below.

In [1]:
# Basic imports
import csv
import numpy as np
import pandas as pd
from pandas import Series
import datetime
import time
import os
from fancyimpute import KNN, MatrixFactorization
import math
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from plotly.offline import plot

path = "../data" #Path with raw csv files


This is the code presented in the paper. We cannot execute it because of RAM limitations, so we adapt it in the cells below.

In [None]:


def clean_data_from_path(path):
    """Reference cleaning routine from the paper, applied to every file in ``path``.

    For each file the CSV is read, merged onto a regular 15-minute date grid,
    and every non-date column is scanned for runs of missing values. Gaps of
    at most one hour are linearly interpolated; the remaining missing values
    are imputed with fancyimpute's ``KNN(k=3)`` and, when gaps longer than one
    day were counted for the last processed column, with
    ``MatrixFactorization``. A statistics file (``parameter_<filename>``) and
    the cleaned data (``data_<filename>``) are written under
    ``path + "_postprocess"``.

    NOTE(review): ``df_filled`` is never assigned in this function, so the
    ``unfill_large_gaps(df_filled, df)`` call raises ``NameError`` as written.
    NOTE(review): ``Series.append`` was removed in pandas 2.0; this code needs
    an older pandas (or ``pd.concat``).
    NOTE(review): the output paths are joined with a backslash, while the
    input paths use a forward slash — confirm the intended platform.
    """
    files = os.listdir(path)
    path_postprocess = path + "_postprocess"

    #read data files and adjust time format
    for filename in files:
        print(path+'/'+filename)
        df = pd.read_csv(path+'/'+filename)
        df['date'] = pd.to_datetime(df['date']) 
        # Outer-merge with a complete 15-minute grid so missing timestamps become NaN rows.
        helper=pd.DataFrame({'date': pd.date_range(df['date'].min(), df['date'].max(), freq='15min')})
        df = pd.merge(df, helper, on='date', how='outer').sort_values('date')
        count_out = Series([0],index=['date']) #count of outlier values
        count_gap = Series([0],index=['date']) #count of gap
        count_outgap = Series([0],index=['date']) #count of large gap (e.g., one day)
        gap_max=Series([0],index=['date']) #maximum gap
        #calculate the count of gap and do the interpolation based on the gap size 
        for i in range(1, len(df.columns)):
            k = 0  # number of completed gaps found in this column
            out_gapcount=0
            start_index = {}
            starttime = {}
            end_index = {}
            endtime = {}
            gap = {}
            
    
            # A missing value in the last row is replaced by 0, so every gap has a closing row.
            if pd.isnull(df.iloc[len(df.index)-1,i]) == True or math.isnan(df.iloc[len(df.index)-1,i])==True:
                df.iloc[len(df.index)-1,i]=0
            # NOTE: at j == 0 the index j-1 is -1, i.e. the last row of the frame.
            for j in range(0, len(df.index)):
                if (pd.isnull(df.iloc[j,i]) or math.isnan(df.iloc[j,i]))and pd.isnull(df.iloc[j-1,i]) == False:
                    starttime[k]=df.iloc[j-1,0] #start time of the gap
                    start_index[k]=j-1
                elif (pd.isnull(df.iloc[j-1,i]) or math.isnan(df.iloc[j-1,i])) and pd.isnull(df.iloc[j,i]) == False:
                    endtime[k]=df.iloc[j,0] #end time of the gap
                    end_index[k]=j
                    k=k+1
            if k != 0:
                for m in range(k):
                    # Gap length in seconds, measured between the valid rows that bound the gap.
                    starttime_struct=datetime.datetime.strptime(str(starttime[m]), '%Y-%m-%d %H:%M:%S')
                    endtime_struct = datetime.datetime.strptime(str(endtime[m]), '%Y-%m-%d %H:%M:%S')
                    gap[m]=(endtime_struct-starttime_struct).total_seconds()
                    if  gap[m]<= 3600: #linear interpolation if the gap is less than one hour
                        df.iloc[start_index[m]:end_index[m]+1,i]=df.iloc[start_index[m]:end_index[m]+1,i].interpolate(method='linear')
                    elif gap[m] >3600*24:
                        out_gapcount=out_gapcount+1
                maxgap = max(gap.values())/60  # longest gap of the column, in minutes
                gap_max=gap_max.append(Series(maxgap,index=[df.columns[i]]))
            # Share of negative values in the column.
            outcount=np.sum(df.iloc[:, i]<0)/len(df)
            count_out=count_out.append(Series(outcount, index=[df.columns[i]]))
            count_gap= count_gap.append(Series(k, index=[df.columns[i]]))
            count_outgap = count_outgap.append(Series(out_gapcount,index=[df.columns[i]]))
            # Rebuilt on every column iteration; only the last assignment is used below.
            df_interpolation=np.array(df.iloc[:,1:])
        df_interpolation= KNN(k=3).fit_transform(df_interpolation) #Apply knn algorithm if the gap is larger than one hour
        # NOTE(review): df_filled is undefined here (see docstring).
        unfill_large_gaps(df_filled, df)
        # out_gapcount is reset per column, so this test only reflects the last column.
        if out_gapcount !=0:
            df_interpolation= MatrixFactorization().fit_transform(df_interpolation) #Apply MF algorithm if the gap is larger than one day         
        df.iloc[:,1:]=df_interpolation
        cols_not_null = (len(df)-df.count(axis=0))/len(df)
        data=pd.DataFrame({'missingrate':cols_not_null,'outrate':count_out,'count_outgap':count_outgap,'count_gap':count_gap,'maxgap':gap_max})
        data.to_csv(path_postprocess+'\\'+'parameter_'+filename, sep=',', header=True, index=True)
        df.to_csv(path_postprocess+'\\'+'data_'+filename, sep=',', header=True, index=False)


We have a problem with ele.csv (energy use), because it doesn't follow the same csv format as the other files: it includes an unnamed column without data. We solve this problem with the following code (run only once)

In [None]:
#datos = pd.read_csv(path+ '/ele.csv')
#datos.drop('Unnamed: 6', axis=1, inplace=True)
#datos = datos.set_index('date')
#datos.to_csv(path+ '/ele.csv')

In [None]:
# The same problem appears in zone_temp_sp_h and zone_temp_sp_c (unnamed features, all of which have more than 50% missing data), so we just drop them
# We also drop the column zone_070_cooling_sp because it has over 97% of missing values in the interval we will consider
# Keep only the first 40 columns and overwrite the raw file in place.
datos = pd.read_csv(path+ '/zone_temp_sp_c.csv')
datos = datos.iloc[:,:40]
datos = datos.set_index('date')
#datos.drop('Unnamed: 0', axis=1, inplace=True)
datos.to_csv(path+ '/zone_temp_sp_c.csv')

In [None]:
# Same trimming for the heating set-point file: keep the first 40 columns
# and overwrite the raw file in place.
datos = pd.read_csv(path+ '/zone_temp_sp_h.csv')
datos = datos.iloc[:,:40]
datos = datos.set_index('date')
#datos.drop('Unnamed: 0', axis=1, inplace=True)

datos.to_csv(path+ '/zone_temp_sp_h.csv')


### Study of null values by column
For a file, we will study the percentage of missing values it includes.

In [2]:
def summary(path, filename):
    """Print, for every column of a CSV file, how many values are missing.

    The file ``path + '/' + filename`` is read with pandas and indexed by its
    ``date`` column; one header line and then two lines per remaining column
    are printed (column name, then position / missing count / percentage).
    """
    print("SUMMARY OF " + filename)
    table = pd.read_csv(path+'/'+filename).set_index('date')
    n_rows = table.shape[0]
    for position, column_name in enumerate(table.columns):
        print(column_name)
        # Select by position so duplicated column names are still reported one by one.
        missing = table.iloc[:, position].isnull().sum()
        percentage = missing / n_rows * 100
        print('> %d, Missing: %d (%.1f%%)' % (position, missing, percentage))

def summary2(dataframe):
    """Print the missing-value count and percentage of every column of ``dataframe``.

    Two lines are printed per column: the column name, then its position,
    the number of null entries and that number as a percentage of the rows.
    """
    n_rows = dataframe.shape[0]
    for position, column_name in enumerate(dataframe.columns):
        print(column_name)
        # Select by position so duplicated column names are still reported one by one.
        missing = dataframe.iloc[:, position].isnull().sum()
        percentage = missing / n_rows * 100
        print('> %d, Missing: %d (%.1f%%)' % (position, missing, percentage))

In [None]:
# Print the per-column missing-value report for 'wifi.csv'.
summary(path, 'wifi.csv')

## Interpolation depending on the size of the gap:

- If it's smaller than 1h, we use linear interpolation
- If it's bigger than 1 day, we use KNN with n=3


Interpolation with KNN: 

https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/

In [3]:
from sklearn.impute import KNNImputer
# Raw input files and the output directory used by the cleaning cells below.
files = os.listdir(path)
path_postprocess = path + "_postprocess"
#in the cleaning code we insert the rows that are apparently missing

# Per-file settings, keyed by raw file name: sampling frequency of the
# regular date grid, and the first / last timestamp to keep.
# NOTE(review): 'zone_co2.csv' has a frequency but no entry in `starts` / `ends`;
# any code that indexes all three dicts with that file name raises KeyError.
freqs = {'zone_co2.csv':'1min', 'ele.csv': '15min', 'zone_temp_sp_c.csv':'5min', 'occ.csv':'1min', 'zone_temp_exterior.csv':'1min', 'zone_temp_sp_h.csv':'5min', 'site_weather.csv':'15min', 'wifi.csv': '10min', 'zone_temp_interior.csv':'10min'}
starts = {'ele.csv': '2018/1/1 1:00', 'zone_temp_sp_c.csv':'2018/9/15 10:00', 'occ.csv':'2018-05-22 07:00:00', 'zone_temp_exterior.csv':'2018-01-01 00:00:00', 'zone_temp_sp_h.csv':'2018/9/15 10:00', 'site_weather.csv':'2018-01-01 00:00:00', 'wifi.csv': '2018/5/22 00:00', 'zone_temp_interior.csv':'2018-02-22 00:30:00'}
ends = {'ele.csv': '2021/1/1 0:00', 'zone_temp_sp_c.csv':'2021/1/1 0:00', 'occ.csv':'2019-02-21 10:12:00', 'zone_temp_exterior.csv':'2021-01-01 00:00:00', 'zone_temp_sp_h.csv':'2021/1/1 0:00', 'site_weather.csv':'2021-01-01 00:00:00', 'wifi.csv': '2018/7/11 23:50', 'zone_temp_interior.csv':'2021-01-01 00:00:00'}

In [None]:
def unfill_large_gaps(df_filled, df, max_gap_seconds=3600*24):
    """Blank out, in ``df_filled``, the values that were imputed over long gaps.

    ``df`` is the frame *before* imputation: column 0 holds the timestamps and
    the remaining columns still contain NaN where data was missing.
    ``df_filled`` holds the imputed values without the date column, so column
    ``i`` of ``df`` corresponds to column ``i - 1`` of ``df_filled``.

    For every run of missing values in ``df`` that is bounded by a valid row
    on both sides, the elapsed time between those two bounding rows is
    computed. When it is at least ``max_gap_seconds`` (one day by default),
    the rows from the bounding row before the gap up to and including the
    bounding row after it are set to missing in ``df_filled``.

    Runs that touch the first or the last row have no bounding row on one
    side and are left untouched. (The previous implementation compared row 0
    with the last row through a negative index and raised ``KeyError`` when a
    column ended with missing values.)

    Args:
        df_filled: DataFrame of imputed values, modified in place.
        df: DataFrame with the timestamps in column 0 and the original,
            un-imputed data in the other columns.
        max_gap_seconds: Minimum gap length, in seconds, that is un-filled.

    Returns:
        None. ``df_filled`` is modified in place.
    """
    # Parse the timestamps once; this accepts datetime columns as well as
    # 'YYYY-mm-dd HH:MM:SS' strings.
    timestamps = list(pd.to_datetime(df.iloc[:, 0]))

    for i in range(1, len(df.columns)):
        missing = df.iloc[:, i].isnull().to_numpy()
        gap_start = None  # position of the last valid row before the open gap
        for j in range(1, len(missing)):
            if missing[j] and not missing[j-1]:
                gap_start = j-1
            elif missing[j-1] and not missing[j]:
                # Row j is the first valid row after the gap.
                if gap_start is not None:
                    elapsed = (timestamps[j]-timestamps[gap_start]).total_seconds()
                    if elapsed >= max_gap_seconds:
                        df_filled.iloc[gap_start:j+1, i-1] = None
                gap_start = None
                

In [None]:

def get_csv(path, filename, freq):
    """Clean one raw CSV over its full date range and export the result.

    The file ``path + '/' + filename`` is read, merged onto a regular date grid
    with step ``freq`` spanning its own minimum and maximum date, gaps of at
    most one hour are linearly interpolated per column, and every remaining
    missing value is imputed with ``KNNImputer`` (3 neighbours, distance
    weights). The cleaned frame is written to
    ``<path>_postprocess/data_definitivo/<name>_postprocess.csv``.

    Returns:
        DataFrame with one row per column and the statistics ``missingrate``,
        ``outrate``, ``count_outgap``, ``count_gap`` and ``maxgap``.

    NOTE(review): ``Series.append`` was removed in pandas 2.0; this function
    needs an older pandas (or ``pd.concat``).
    NOTE(review): ``out_gapcount`` is initialised once, outside the column
    loop, so the value stored per column is a running total over the columns
    processed so far — confirm this is intended.
    """
    print(path+'/'+filename)
    path_postprocess = path+"_postprocess/data_definitivo"+'/'+filename[:-4]+ "_postprocess.csv"
    df = pd.read_csv(path+'/'+filename)
    df['date'] = pd.to_datetime(df['date']) 
    # Outer-merge with a complete date grid so missing timestamps become NaN rows.
    helper=pd.DataFrame({'date': pd.date_range(df['date'].min(), df['date'].max(), freq=freq)})
    df = pd.merge(df, helper, on='date', how='outer').sort_values('date')
    count_out = Series([0],index=['date']) #count of outlier values
    count_gap = Series([0],index=['date']) #count of gap
    count_outgap = Series([0],index=['date']) #count of large gap (e.g., one day)
    gap_max=Series([0],index=['date']) #maximum gap
    out_gapcount=0
    summary2(df)
    #calculate the count of gap and do the interpolation based on the gap size 
    for i in range(1, len(df.columns)):
        print("Estamos en: ", i)
        k = 0  # number of completed gaps found in this column
        
        start_index = {}
        starttime = {}
        end_index = {}
        endtime = {}
        gap = {}
        # A missing value in the last row is replaced by 0, so every gap has a closing row.
        if pd.isnull(df.iloc[len(df.index)-1,i]) == True or math.isnan(df.iloc[len(df.index)-1,i])==True:
            df.iloc[len(df.index)-1,i]=0
        # NOTE: at j == 0 the index j-1 is -1, i.e. the last row of the frame.
        for j in range(0, len(df.index)):
            if (pd.isnull(df.iloc[j,i]) or math.isnan(df.iloc[j,i]))and pd.isnull(df.iloc[j-1,i]) == False:
                starttime[k]=df.iloc[j-1,0] #start time of the gap
                start_index[k]=j-1
            elif (pd.isnull(df.iloc[j-1,i]) or math.isnan(df.iloc[j-1,i])) and pd.isnull(df.iloc[j,i]) == False:
                endtime[k]=df.iloc[j,0] #end time of the gap
                end_index[k]=j
                k=k+1
        if k != 0:
            for m in range(k):
                # Gap length in seconds, measured between the valid rows that bound the gap.
                starttime_struct=datetime.datetime.strptime(str(starttime[m]), '%Y-%m-%d %H:%M:%S')
                endtime_struct = datetime.datetime.strptime(str(endtime[m]), '%Y-%m-%d %H:%M:%S')
                gap[m]=(endtime_struct-starttime_struct).total_seconds()
                if  gap[m]<= 3600: #linear interpolation if the gap is less than one hour
                    print("Interpolation linear")
                    df.iloc[start_index[m]:end_index[m]+1,i]=df.iloc[start_index[m]:end_index[m]+1,i].interpolate(method='linear')
                elif gap[m] >3600*24:
                    out_gapcount=out_gapcount+1
            maxgap = max(gap.values())/60  # longest gap of the column, in minutes
            gap_max=gap_max.append(Series(maxgap,index=[df.columns[i]]))
        # Share of negative values in the column.
        outcount=np.sum(df.iloc[:, i]<0)/len(df)
        count_out=count_out.append(Series(outcount, index=[df.columns[i]]))
        count_gap= count_gap.append(Series(k, index=[df.columns[i]]))
        count_outgap = count_outgap.append(Series(out_gapcount,index=[df.columns[i]]))
    #Interpolate whole dataframe with KNN
    df_interpolated = df.iloc[:,1:]
    imputer = KNNImputer(n_neighbors=3, weights='distance', metric='nan_euclidean')
    imputer.fit(df_interpolated)
    # The imputed frame gets a fresh default index here.
    df_interpolated = pd.DataFrame(imputer.transform(df_interpolated), columns=df_interpolated.columns)

    
    #Export into csv
    print("New summary(final): ")
    summary2(df_interpolated)
    # NOTE(review): this assignment aligns on index labels; df keeps the labels
    # produced by merge + sort_values while df_interpolated has a default index —
    # verify that the rows line up as intended.
    df.iloc[:,1:] = df_interpolated
    df.to_csv(path_postprocess, sep=',', header=True, index=False)

    #Final set of information
    cols_not_null = (len(df)-df.count(axis=0))/len(df)
    data=pd.DataFrame({'missingrate':cols_not_null,'outrate':count_out,'count_outgap':count_outgap,'count_gap':count_gap,'maxgap':gap_max})
    return data

However, we have a problem with available data. To solve this problem, we will only keep values between '2018-05-22 07:00:00'
and '2019-02-21 10:11:00' (maximum interval where all the variables are available), in order to do so, we will "crop" the data before interpolating it:


In [None]:

# Show the first rows of the cooling set-point file.
datos = pd.read_csv(path+ '/zone_temp_sp_c.csv')
datos.head(3)

In [None]:
# Show the occupancy file (every row except the last one).
datos = pd.read_csv(path+ '/occ.csv')
datos.head(-1) # Decide whether we keep this limit or drop the occupancy data

In [None]:

def get_csv_cortado(path, filename, freq, start, end):
    """Crop one raw CSV to ``[start, end]``, fill its gaps and export the result.

    Steps:
      1. Read ``path + '/' + filename``, parse ``date`` and keep the rows with
         ``start <= date <= end``.
      2. Outer-merge with a regular grid (``pd.date_range(start, end, freq)``)
         so that missing timestamps become NaN rows.
      3. For every non-date column, locate the runs of missing values;
         runs spanning at most one hour (measured between the valid rows that
         bound them) are linearly interpolated, longer ones are counted.
      4. Impute everything still missing with ``KNNImputer`` (3 neighbours,
         distance weights).
      5. Write the frame, indexed by ``date``, to
         ``<path>_postprocess/data_definitivo/<name>_postprocess.csv``.

    Args:
        path: Directory containing the raw CSV files.
        filename: Name of the CSV file (expected to end in ``.csv``).
        freq: pandas frequency string of the regular grid (e.g. ``'15min'``).
        start: First timestamp to keep.
        end: Last timestamp to keep.

    Returns:
        DataFrame with one row per column and the statistics ``missingrate``,
        ``outrate``, ``count_outgap``, ``count_gap`` and ``maxgap``.

    Notes:
        * The per-column statistics are accumulated with ``pd.concat``;
          ``Series.append`` no longer exists in pandas 2.x.
        * A missing value in the last row is replaced by 0 before scanning,
          so every gap has a closing row.
        * At ``j == 0`` the previous row is the last row (index ``-1``), as in
          the original scan: a column that starts with missing values yields a
          gap with a negative length and an empty interpolation slice, so it
          is left for KNN.
        * ``out_gapcount`` is a running total across columns, as before.
    """
    source_file = path+'/'+filename
    print(source_file)
    path_postprocess = path+"_postprocess/data_definitivo"+'/'+filename[:-4]+ "_postprocess.csv"
    df = pd.read_csv(source_file)
    df['date'] = pd.to_datetime(df['date'])
    df = df[(df.date>=start)&(df.date<=end)]
    helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
    df = pd.merge(df, helper, on='date', how='outer').sort_values('date')
    count_out = Series([0],index=['date']) #count of outlier values
    count_gap = Series([0],index=['date']) #count of gap
    count_outgap = Series([0],index=['date']) #count of large gap
    gap_max = Series([0],index=['date']) #maximum gap
    out_gapcount = 0
    summary2(df)

    last_row = len(df.index)-1
    # Timestamps of column 0, read once instead of once per cell.
    timestamps = df.iloc[:, 0].tolist()

    #calculate the count of gap and do the interpolation based on the gap size
    for i in range(1, len(df.columns)):
        print("Estamos en: ", i)
        column = df.columns[i]

        if pd.isnull(df.iloc[last_row, i]):
            df.iloc[last_row, i] = 0

        # Boolean mask of the column, taken after the last-row fix above.
        is_null = df.iloc[:, i].isnull().to_numpy()

        k = 0
        start_index = {}
        starttime = {}
        end_index = {}
        endtime = {}
        gap = {}
        for j in range(len(is_null)):
            if is_null[j] and not is_null[j-1]:
                starttime[k] = timestamps[j-1] #start time of the gap
                start_index[k] = j-1
            elif is_null[j-1] and not is_null[j]:
                endtime[k] = timestamps[j] #end time of the gap
                end_index[k] = j
                k = k+1

        if k != 0:
            for m in range(k):
                gap[m] = (endtime[m]-starttime[m]).total_seconds()
                if gap[m] <= 3600: #linear interpolation if the gap is at most one hour
                    print("Interpolation linear")
                    rows = slice(start_index[m], end_index[m]+1)
                    df.iloc[rows, i] = df.iloc[rows, i].interpolate(method='linear')
                elif gap[m] > 3600:
                    print("Gap greater than one hour")
                    out_gapcount = out_gapcount+1
            maxgap = max(gap.values())/60  # longest gap of the column, in minutes
            gap_max = pd.concat([gap_max, Series(maxgap, index=[column])])

        # Share of negative values in the column.
        outcount = np.sum(df.iloc[:, i]<0)/len(df)
        count_out = pd.concat([count_out, Series(outcount, index=[column])])
        count_gap = pd.concat([count_gap, Series(k, index=[column])])
        count_outgap = pd.concat([count_outgap, Series(out_gapcount, index=[column])])

    #Interpolate whole dataframe with KNN
    imputer = KNNImputer(n_neighbors=3, weights='distance', metric='nan_euclidean')
    df.iloc[:,1:] = imputer.fit_transform(df.iloc[:,1:])

    #Export into csv
    print("New summary(final): ")
    summary2(df)
    df = df.set_index('date')
    df.to_csv(path_postprocess, sep=',', header=True, index=True)

    #Final set of information
    cols_not_null = (len(df)-df.count(axis=0))/len(df)
    data = pd.DataFrame({'missingrate':cols_not_null,'outrate':count_out,'count_outgap':count_outgap,'count_gap':count_gap,'maxgap':gap_max})
    return data
    

In [None]:
# Run the cropped cleaning pipeline on every raw file that is fully configured.
files = os.listdir(path)
path_postprocess = path + "_postprocess"

for file in files:
    if file == '.DS_Store':
        continue
    if file not in freqs or file not in starts or file not in ends:
        # A file without frequency, start or end (e.g. 'zone_co2.csv', which has
        # no start/end entry) used to abort the whole loop with KeyError.
        print("Skipping " + file + ": no frequency/start/end configured")
        continue
    print(file, freqs[file])
    get_csv_cortado(path, file, freqs[file], starts[file], ends[file])


In [None]:
#In these files, there is no linear interpolation, so we just use KNN
# Crop 'zone_temp_exterior.csv' to its configured interval, merge it onto the
# regular date grid and impute every missing value with KNN.
file = 'zone_temp_exterior.csv'
df = pd.read_csv(path+'/'+file)
start = starts[file]
end = ends[file]
freq = freqs[file]
path_postprocess = path+"_postprocess/data_definitivo"+'/'+file[:-4]+ "_postprocess.csv"
df['date'] = pd.to_datetime(df['date']) 
df =df[(df.date>=start)&(df.date<=end)]
helper=pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')
imputer = KNNImputer(n_neighbors=3, weights='distance', metric='nan_euclidean')
    
df.iloc[:,1:] = imputer.fit_transform(df.iloc[:,1:])


#Export into csv
print("New summary(final): ")
summary2(df)
df['date']=df['date']  # no-op self-assignment
# From here on 'date' is the index, not a column.
df = df.set_index('date')
df.to_csv(path_postprocess, sep=',', header=True, index=True)

## Media movil: primer intento


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot

In [None]:
# Line plot of 'mels_S' against the 'date' column of the current df.
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['mels_S'])
)

In [None]:
# Rows where 'mels_S' is missing.
df[df['mels_S'].isnull()]

In [None]:
# Timestamp of the row labelled 31113, used as the reference point.
date = df.loc[31113]['date']

# Dates of the rows where 'mels_S' is present, with their absolute distance to the reference.
dates = df[~df['mels_S'].isnull()][['date']]
dates['diff']=abs(date-dates)

In [None]:
# Keep the 1000 dates closest to the reference timestamp.
dates = dates.sort_values(by='diff', ascending=True)
dates = dates.iloc[:1000]
dates

In [None]:
# Values of 'mels_S' at the selected nearest dates; their mean is the imputed value.
aux = df[['date', 'mels_S']].merge(dates[['date']], how ='inner', right_on = 'date', left_on='date')
# NOTE(review): `temp` is not defined in any visible cell — presumably `df` was meant; confirm.
temp.loc[temp.date ==date,'mels_S']=aux['mels_S'].mean()

In [None]:
# Inspect 'mels_S' in the row labelled 31114.
df.loc[31114,'mels_S']

In [None]:
def impute(df, k, var, dynamic=True):
    """Fill the missing values of column ``var`` in place and plot the result.

    Every distinct ``date`` whose ``var`` is null gets the value returned by
    ``impute_individual``.

    * ``dynamic=True``: values are written back one date at a time, so each
      imputed value is available to the following dates. The neighbour count
      passed for the j-th date is ``2*k - j``.
    * ``dynamic=False``: all values are computed first with ``k`` neighbours
      and written back afterwards.

    Finally a plotly line chart of ``var`` against ``date`` is shown.

    NOTE(review): in dynamic mode ``2*k - j`` reaches zero and then negative
    values once more than ``2*k`` dates are missing — confirm that this
    shrinking window is intended.
    """
    missing_dates = df[df[var].isnull()]['date'].unique()

    if dynamic:
        for offset, when in enumerate(missing_dates):
            df.loc[df.date==when, var] = impute_individual(df, 2*k-offset, var, when)
    else:
        estimates = [impute_individual(df, k, var, when) for when in missing_dates]
        for position in range(missing_dates.size):
            df.loc[df.date==missing_dates[position], var] = estimates[position]

    figure = go.Figure()
    figure.add_trace(go.Scatter(x=df.date , y=df[var]))
    figure.show()

In [None]:
def impute_individual(df,k, var, date):
    """Return the mean of ``var`` over the ``k`` known rows closest in time to ``date``.

    Only rows where ``var`` is not null are candidates; they are ranked by the
    absolute difference between their ``date`` and the given ``date``.
    """
    # Dates where the variable is known (the ones usable for imputation).
    known = df[df[var].notnull()][['date']]
    known['diff'] = abs(date-known)
    # Keep the k closest dates.
    closest = known.sort_values(by='diff', ascending=True).iloc[:k]
    # Look the values up again by date and average them.
    neighbours = df[['date', var]].merge(closest[['date']], how ='inner', right_on = 'date', left_on='date')
    return neighbours[var].mean()

In [None]:
# Restart for testing: reload 'ele.csv', crop it to its configured interval,
# merge it onto the regular date grid and keep only November 2018.
file = 'ele.csv'
df = pd.read_csv(path+'/'+file)
start = starts[file]
end = ends[file]
freq = freqs[file]
path_postprocess = path+"_postprocess/data_definitivo"+'/'+file[:-4]+ "_postprocess.csv"
df['date'] = pd.to_datetime(df['date']) 
df =df[(df.date>=start)&(df.date<=end)]
helper=pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')
df = df[(df.date.dt.month ==11) & (df.date.dt.year==2018)]


In [None]:
# Rows where 'lig_S' is missing.
df[df.lig_S.isnull()]

In [None]:
# Impute 'mels_S' in place with the dynamic variant (k=10) and plot it.
impute(df, 10, 'mels_S', dynamic=True)

### Pruebas para corrección

In [None]:
# Two stacked plots with a shared x axis, titled "Final" (top) and "Bruto" (bottom).
fig = make_subplots(rows=2, cols=1,  shared_xaxes=True, subplot_titles=("Final",  "Bruto") )

fig.add_trace(
    go.Scatter(x=df.date , y=df['mels_S']),
    row=1, col=1
)

# NOTE(review): `mels_S` (the frame) is not defined in any visible cell.
fig.add_trace(
    go.Scatter(x=mels_S.date , y=mels_S['mels_S']),
    row=2, col=1
)



fig.update_layout(legend_orientation="h", 
             xaxis1_rangeslider_visible=True, xaxis1_rangeslider_thickness=0.1, height=600, width=800, title_text="Mels_S" )
fig.show()

In [None]:
# Reload the raw file named by the notebook-level `file` variable.
df_2 = pd.read_csv(path+'/'+file)

In [None]:
# NOTE(review): the variable is called cooling_a but the file read is
# 'zone_temp_sp_h.csv' — confirm which file is intended.
cooling_a = pd.read_csv(path+'/zone_temp_sp_h.csv')
cooling_a.shape


In [None]:
# Rebind df to the frame loaded above (same object, not a copy) and parse its dates.
df=cooling_a
df.date=pd.to_datetime(df.date)

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot

In [None]:
# Two stacked plots with a shared x axis: df_2 on top, df (plotted against its index) below.
# NOTE(review): the subplot titles are ("Final", "Bruto"), but df_2 is read
# straight from the raw file in the visible cells — check that the titles
# match the traces.
fig = make_subplots(rows=2, cols=1,  shared_xaxes=True, subplot_titles=("Final",  "Bruto") )

fig.add_trace(
    go.Scatter(x=df_2.date , y=df_2['mels_S']),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=df.index , y=df['mels_S']),
    row=2, col=1
)



fig.update_layout(legend_orientation="h", 
             xaxis1_rangeslider_visible=True, xaxis1_rangeslider_thickness=0.1, height=600, width=800, title_text="Mels_S" )
fig.show()

In [None]:

# Inline copy of the cropped cleaning pipeline, run on 'ele.csv' at notebook level.
# It uses `start`, `end` and `freq` left in the notebook namespace by an earlier cell.
# Changes with respect to the previous version of this cell:
#   * the final `return data` (a SyntaxError outside a function) is now a plain
#     expression, so the notebook displays the statistics table;
#   * the statistics are accumulated with pd.concat, since Series.append no
#     longer exists in pandas 2.x.

filename = 'ele.csv'
path_2 = '../data_postprocess/data_done'
# NOTE(review): the printed path uses path_2, but the file is read from `path` below — confirm.
print(path_2+'/'+filename)
path_postprocess = path+"_postprocess/data_definitivo"+'/'+filename[:-4]+ "_postprocess.csv"
df = pd.read_csv(path+'/'+filename)
df['date'] = pd.to_datetime(df['date'])
df =df[(df.date>=start)&(df.date<=end)]
# Outer-merge with a complete date grid so missing timestamps become NaN rows.
helper=pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')
count_out = Series([0],index=['date']) #count of outlier values
count_gap = Series([0],index=['date']) #count of gap
count_outgap = Series([0],index=['date']) #count of large gap (e.g., one day)
gap_max=Series([0],index=['date']) #maximum gap
out_gapcount=0  # running total across columns
summary2(df)
#calculate the count of gap and do the interpolation based on the gap size
for i in range(1, len(df.columns)):
    print("Estamos en: ", i)
    k = 0  # number of completed gaps found in this column

    start_index = {}
    starttime = {}
    end_index = {}
    endtime = {}
    gap = {}
    # A missing value in the last row is replaced by 0, so every gap has a closing row.
    if pd.isnull(df.iloc[len(df.index)-1,i]) == True or math.isnan(df.iloc[len(df.index)-1,i])==True:
        df.iloc[len(df.index)-1,i]=0
    # NOTE: at j == 0 the index j-1 is -1, i.e. the last row of the frame.
    for j in range(0, len(df.index)):
        if (pd.isnull(df.iloc[j,i]) or math.isnan(df.iloc[j,i]))and pd.isnull(df.iloc[j-1,i]) == False:
            starttime[k]=df.iloc[j-1,0] #start time of the gap
            start_index[k]=j-1
        elif (pd.isnull(df.iloc[j-1,i]) or math.isnan(df.iloc[j-1,i])) and pd.isnull(df.iloc[j,i]) == False:
            endtime[k]=df.iloc[j,0] #end time of the gap
            end_index[k]=j
            k=k+1
    if k != 0:
        for m in range(k):
            # Gap length in seconds, measured between the valid rows that bound the gap.
            starttime_struct=datetime.datetime.strptime(str(starttime[m]), '%Y-%m-%d %H:%M:%S')
            endtime_struct = datetime.datetime.strptime(str(endtime[m]), '%Y-%m-%d %H:%M:%S')
            gap[m]=(endtime_struct-starttime_struct).total_seconds()
            if  gap[m]<= 3600: #linear interpolation if the gap is less than one hour
                print("Interpolation linear")
                df.iloc[start_index[m]:end_index[m]+1,i]=df.iloc[start_index[m]:end_index[m]+1,i].interpolate(method='linear')
            elif gap[m] >3600*24:
                out_gapcount=out_gapcount+1
        maxgap = max(gap.values())/60  # longest gap of the column, in minutes
        gap_max=pd.concat([gap_max, Series(maxgap,index=[df.columns[i]])])
    # Share of negative values in the column.
    outcount=np.sum(df.iloc[:, i]<0)/len(df)
    count_out=pd.concat([count_out, Series(outcount, index=[df.columns[i]])])
    count_gap=pd.concat([count_gap, Series(k, index=[df.columns[i]])])
    count_outgap=pd.concat([count_outgap, Series(out_gapcount,index=[df.columns[i]])])
#Interpolate whole dataframe with KNN

imputer = KNNImputer(n_neighbors=3, weights='distance', metric='nan_euclidean')


df.iloc[:,1:] = imputer.fit_transform(df.iloc[:,1:])


#Export into csv
print("New summary(final): ")
summary2(df)
df = df.set_index('date')
df.to_csv(path_postprocess, sep=',', header=True, index=True)

#Final set of information
cols_not_null = (len(df)-df.count(axis=0))/len(df)
data=pd.DataFrame({'missingrate':cols_not_null,'outrate':count_out,'count_outgap':count_outgap,'count_gap':count_gap,'maxgap':gap_max})
data


# Limpieza de datos- código propio

A continuación vamos a proceder a imputar los datos que faltan, utilizando el código final descrito en el fichero de experimentos.

## Descripción del algoritmo desarrollado:


Los huecos en el dataset son de uno o dos días, o de muchos días. No tiene mucho sentido utilizar el dataframe entero cuando encontramos un hueco relativamente pequeño, por lo que en los huecos más pequeños (1 hora), seguimos usando interpolación lineal. Para los huecos más grandes, vamos a usar dos días de información por cada día en el hueco.


Sin embargo, es necesario decidir cómo tratamos los huecos que estén muy seguidos, porque para usar el algoritmo implementado necesitamos tener algo de información antes y después del hueco. Para ello, antes de empezar a rellenar los datos nulos, iteramos a través de los huecos fusionando los que estén a una distancia menor que el hueco que llevemos acumulado (para así asegurarnos de que tenemos datos suficientes).

Además, hemos establecido ciertos criterios (ficheros de experimento), para mejorar la eficiencia del algoritmo:
- Si el hueco está tan al principio o tan al final que no podemos usar el algoritmo SVR con 2 días de información, utilizamos Prophet
- Si el hueco está en medio:
    - Si es fusión de 5 o más huecos, usamos Prophet
    - En caso contrario, utilizamos SVR entrenando con dos días por cada día que falte

## Imputación

Vamos a separar la imputación en tres pasos:
1. Localización de los huecos (separamos los huecos pequeños, de menos de una hora, de los huecos grandes)
2. Fusión de los huecos que estén muy juntos
3. Imputación de datos, primero los huecos pequeños y luego los grandes con la función recién implementada.
    

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from prophet import Prophet
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


In [None]:
def interpolate_Prophet(df,i, start_index, end_index):
    """Predict the values of column ``i`` inside one gap with a Prophet model.

    The model is fitted on every row of ``df`` where column ``i`` is not null
    (``date`` as ``ds``, the column as ``y``) and then asked to predict the
    rows strictly between ``start_index`` and ``end_index``.

    Returns:
        The ``yhat`` column of the Prophet forecast, one value per row of the
        gap, as a Series.
    """
    target = df.columns[i]

    # Training data: all rows where the variable is known.
    history = df[df[target].notna()][['date', target]]
    history.columns = ['ds', 'y']

    model = Prophet()
    model.fit(history)

    # Rows to predict: the interior of the gap (both bounding rows excluded).
    gap_rows = df.iloc[start_index+1:end_index, [0, i]]
    future = gap_rows[['date']].rename(columns={'date': 'ds'})

    forecast = model.predict(future)[['ds', 'yhat']]
    return forecast['yhat']

In [None]:
def interpolateGAP(df,i, start_df, end_df, start_index, end_index):
    """Estimate the values of column ``i`` inside one gap from both of its sides.

    Two SVR models are trained: one on the data before the gap
    (rows ``start_df`` .. ``start_index``) and one on the time-reversed data
    after it (rows ``end_index`` .. ``end_df - 1``). With ``T`` the number of
    rows in the gap, each model maps ``T + 1`` consecutive values to the next
    one. Both models are then rolled forward ``T`` steps into the gap, each
    step feeding on the previous predictions, and the two forecasts are
    averaged (the backward one is flipped back into chronological order).

    When either side is too short to build at least one training sample
    (fewer than ``T + 2`` rows), the gap is predicted with
    ``interpolate_Prophet`` instead.

    Args:
        df: DataFrame with the timestamps in column 0.
        i: Position of the column to fill.
        start_df: First row of the "before" training window.
        end_df: One past the last row of the "after" training window.
        start_index: Last valid row before the gap.
        end_index: First valid row after the gap.

    Returns:
        The ``T`` estimated values for rows ``start_index + 1`` ..
        ``end_index - 1``: a NumPy array from the SVR path, or the Series
        returned by ``interpolate_Prophet``.

    Raises:
        AssertionError: if the three slices do not add up to
            ``end_df - start_df`` rows (e.g. ``end_df`` beyond the frame).
    """
    s = start_index+1
    e = end_index

    df_antes = df.iloc[start_df:s, i]
    df_despues = df.iloc[e:end_df, i].loc[::-1]  # reversed: nearest-to-gap value comes last
    df_medio = df.iloc[s:e, i]

    assert df_medio.shape[0]+df_despues.shape[0]+df_antes.shape[0]==end_df-start_df

    T = df_medio.shape[0]
    n_before = df_antes.shape[0]
    n_after = df_despues.shape[0]

    # Not enough data after the gap: fall back to Prophet.
    if n_after < T+2:
        print("Muy al final: usamos Prophet")
        return interpolate_Prophet(df, i, start_index, end_index)
    # Not enough data before the gap: same fallback (this used to crash).
    if n_before < T+2:
        print("Too close to the start: using Prophet")
        return interpolate_Prophet(df, i, start_index, end_index)

    a = np.array(df_antes)
    b = np.array(df_despues)

    # Training pairs: window of T+1 values -> the value that follows it.
    train_antes = np.lib.stride_tricks.sliding_window_view(a, T+1)[:-1]
    test_antes = a[T+1:]
    train_despues = np.lib.stride_tricks.sliding_window_view(b, T+1)[:-1]
    test_despues = b[T+1:]

    svr_antes = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
    svr_antes.fit(train_antes, test_antes)
    svr_despues = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
    svr_despues.fit(train_despues, test_despues)

    # Roll both models into the gap. Each window ends at the most recent value
    # (known or already predicted) and the prediction is appended right after it.
    # The "before" side previously used len(a) - 1 as its anchor, which shifted
    # every window by one row and returned the known boundary value as the first
    # "prediction"; it now mirrors the "after" side.
    for step in range(T):
        window_before = a[n_before-T-1+step:n_before+step]
        a = np.append(a, svr_antes.predict(window_before.reshape(1,-1)))

        window_after = b[n_after-T-1+step:n_after+step]
        b = np.append(b, svr_despues.predict(window_after.reshape(1,-1)))

    predictions_antes = a[n_before:n_before+T]
    predictions_despues = b[n_after:n_after+T]
    # The backward forecast runs from the end of the gap towards its start.
    gap = (predictions_antes+predictions_despues[::-1])/2
    return gap

In [None]:
def obtain_data(df, i):
    """Locate the runs of missing values in column ``i`` and split them by length.

    A gap is described by the position of the last valid row before it and of
    the first valid row after it; its duration is the time elapsed between
    those two rows (timestamps are read from column 0). Gaps shorter than one
    hour are "small", all others are "big".

    Only completed gaps are recorded, so the three "big" dictionaries always
    share the same keys ``0 .. n-1``. (Previously ``start_index`` also kept
    the start of a small or unfinished gap, leaving it one entry longer than
    ``end_index`` and ``tipo``.)

    As in the original scan, row 0 is compared with the last row (index -1),
    and a gap with no recorded start is measured from row 0.

    Args:
        df: DataFrame with the timestamps in column 0.
        i: Position of the column to scan.

    Returns:
        ``[(start_index, end_index, tipo), (start_index_peque, end_index_peque)]``
        where every element is a dict keyed by gap number. ``tipo`` holds,
        for each big gap, the number of whole days it spans plus one.
    """
    start_index = {}
    end_index = {}
    tipo = {}
    start_index_peque = {}
    end_index_peque = {}

    n_rows = len(df.index)
    if n_rows == 0:
        return [(start_index, end_index, tipo), (start_index_peque, end_index_peque)]

    # Read the column once instead of once per cell.
    timestamps = list(pd.to_datetime(df.iloc[:, 0]))
    is_null = df.iloc[:, i].isnull().to_numpy()

    k = 0   # number of big gaps found so far
    k2 = 0  # number of small gaps found so far
    start_gap_index = 0
    start_gap_dt = timestamps[0]
    for j in range(n_rows):
        if is_null[j] and not is_null[j-1]:
            # Row j-1 is the last valid row before a new gap.
            start_gap_index = j-1
            start_gap_dt = timestamps[j-1]
        elif is_null[j-1] and not is_null[j]:
            # Row j is the first valid row after the gap: classify it.
            elapsed = timestamps[j]-start_gap_dt
            if abs(elapsed).total_seconds() < 3600:
                start_index_peque[k2] = start_gap_index
                end_index_peque[k2] = j
                k2 = k2+1
            else:
                start_index[k] = start_gap_index
                end_index[k] = j
                tipo[k] = abs(elapsed.days)+1
                k = k+1
    return [(start_index, end_index, tipo), (start_index_peque, end_index_peque)]
    



In [None]:
def fill_small_gaps(df, start_index_peque, end_index_peque, i):
    """Linearly interpolate, in place, every small gap of column ``i``.

    ``start_index_peque[m]`` and ``end_index_peque[m]`` are the positions of
    the valid rows that bound gap ``m``; the rows between them are replaced
    by a linear interpolation of those two values.
    """
    for m in range(len(start_index_peque)):
        first = start_index_peque[m]
        last = end_index_peque[m]
        # Both bounding rows are included so the interpolation has its anchors.
        segment = df.iloc[first:last+1, i]
        df.iloc[first:last+1, i] = segment.interpolate(method='linear')

In [None]:
def mix_big_gaps(start_index, end_index, tipo, freq=4):
    """Merge big gaps that lie too close together to be filled independently.

    Starting from a gap, the next gap is merged into it while the distance
    (in rows) between the end of the merged group and the start of the next
    gap is smaller than the length of the group accumulated so far.

    Args:
        start_index: dict ``gap number -> last valid row before the gap``.
        end_index: dict ``gap number -> first valid row after the gap``.
        tipo: dict ``gap number -> number of days assigned to the gap``.
        freq: Unused; kept for backward compatibility.

    Returns:
        ``(gaps, days_interpolation)``, both dicts keyed by group number.
        ``gaps[t]`` is a list with one entry
        ``[start, end, local_start, local_end]`` per original gap of the group,
        where ``local_*`` are offsets relative to the start of the group;
        ``days_interpolation[t]`` is the sum of ``tipo`` over the group.
        Two empty dicts are returned when there are no gaps. (Previously the
        function returned ``None`` in that case, which broke tuple unpacking
        in the caller.)
    """
    # Only completed gaps have an end; iterating over them also tolerates a
    # `start_index` that carries an extra entry for an unfinished gap.
    total = len(end_index)
    gaps = {}
    days_interpolation = {}

    m = 0
    while m < total:
        group_start = start_index[m]
        group_end = end_index[m]
        merged = [[group_start, group_end, 0, group_end-group_start]]
        days = tipo[m]
        span = abs(group_end-group_start)

        nxt = m+1
        while nxt < total and abs(group_end-start_index[nxt]) < span:
            # The next gap is close enough: append it with offsets relative to the group.
            local_start = merged[-1][3]+(start_index[nxt]-group_end)
            local_end = local_start+(end_index[nxt]-start_index[nxt])
            merged.append([start_index[nxt], end_index[nxt], local_start, local_end])

            days = days+tipo[nxt]
            group_end = end_index[nxt]
            span = abs(group_end-group_start)
            nxt = nxt+1

        group_number = len(gaps)
        gaps[group_number] = merged
        days_interpolation[group_number] = days
        m = nxt
    return (gaps, days_interpolation)


In [None]:
def fill_big_gaps(df, gaps, days_interpolation, i, freq=4):
    """Fill every merged group of big gaps of column ``i``, in place.

    A group made of more than two gaps is handed to ``interpolate_Prophet``.
    Any other group is handed to ``interpolateGAP`` together with a window of
    surrounding rows: ``days*7*freq*24`` rows before the group (clipped at row
    0) and ``days*2*freq*24`` rows after it (clipped at the start of the next
    group, or at the end of the frame for the last group).

    Args:
        df: DataFrame modified in place.
        gaps: Groups as produced by ``mix_big_gaps``; each entry is
            ``[start, end, local_start, local_end]``.
        days_interpolation: Day count of each group, same keys as ``gaps``.
        i: Positional index of the column to fill.
        freq: Multiplier used to size the windows; presumably samples per
            hour (4 = 15-minute data) -- TODO confirm.
    """
    total_groups = len(gaps)
    for m in range(total_groups):
        group = gaps[m]
        days = days_interpolation[m]
        start_i = group[0][0]
        end_i = group[-1][1]
        print("We are in gap: ", m, "\nMixed gaps: ", group)
        print("Gap of " , days, 'days')

        if len(group) > 2:
            # Too many gaps merged together: fall back to Prophet.
            print("Demasiados huecos, usamos Prophet")
            results = interpolate_Prophet(df, i, start_i, end_i)
        else:
            if m + 1 != total_groups:
                next_group_start = gaps[m + 1][0][0]
            else:
                next_group_start = len(df.index)

            rows_before = days * 7 * freq * 24
            rows_after = days * 2 * freq * 24

            # NOTE(review): the end of the previous group is not consulted
            # here, so the leading window may overlap rows of an earlier
            # group -- confirm this is intended.
            if start_i - rows_before >= 0:
                start_df = start_i - rows_before
            else:
                start_df = 0

            # Never let the trailing window run into the next group.
            if next_group_start - end_i > rows_after:
                end_df = end_i + rows_after
            else:
                end_df = next_group_start

            results = interpolateGAP(df, i, start_df, end_df, start_i, end_i)

        # Write back only the rows strictly between each gap's start and end;
        # ``results`` is addressed with the group-relative offsets.
        for entry in group:
            first_row, last_row, local_first, local_last = entry[:4]
            df.iloc[first_row + 1:last_row, i] = results[local_first:local_last - 1]
            
                
        
        

In [None]:
def clean_df_var(df, i, show=False):
    """Run the whole gap-filling pipeline on column ``i`` of ``df``, in place.

    Steps: locate the gaps with ``obtain_data``, interpolate the small ones,
    merge the big ones into groups and fill each group.

    Args:
        df: DataFrame with a ``date`` column; modified in place.
        i: Positional index of the column to clean.
        show: When true, plot the column before ("Bruto") and after ("Final")
            cleaning in two stacked panels.
    """
    column = df.columns[i]
    # Snapshot taken before any filling so the untouched series can be plotted.
    raw = df[['date', column]].copy()

    big_gaps, small_gaps = obtain_data(df, i)
    start_index, end_index, tipo = big_gaps
    start_index_peque, end_index_peque = small_gaps

    fill_small_gaps(df, start_index_peque, end_index_peque, i)
    gaps, days_interpolation = mix_big_gaps(start_index, end_index, tipo)
    fill_big_gaps(df, gaps, days_interpolation, i)
    print("Finished: ", column)

    if not show:
        return

    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        subplot_titles=("Bruto", "Final"),
    )
    fig.add_trace(go.Scatter(x=raw.date, y=raw[column]), row=1, col=1)
    fig.add_trace(go.Scatter(x=df.date, y=df[column]), row=2, col=1)
    fig.update_layout(
        legend_orientation="h",
        xaxis2_rangeslider_visible=True,
        xaxis2_rangeslider_thickness=0.1,
        height=600,
        width=800,
        title_text=column,
    )
    fig.show()


## 1. Energy Use
Este fichero csv tiene 5 variables distintas, y casi todas fallan en el mismo intervalo, por lo que vamos a usar el algoritmo desarrollado.


In [4]:
# Load the raw electricity file and align it to its regular time grid.
file = 'ele.csv'
df = pd.read_csv(path+'/'+file)
path_2 = '../data_postprocess/data_done'
print(path_2+'/'+file)

# Time window and sampling frequency configured for this file.
start, end, freq = starts[file], ends[file], freqs[file]

df['date'] = pd.to_datetime(df['date'])
df = df[df.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')

df.sort_values(by='date', inplace=True)
summary2(df)

../data_postprocess/data_done/ele.csv
date
> 0, Missing: 0 (0.0%)
mels_S
> 1, Missing: 3728 (3.5%)
lig_S
> 2, Missing: 3724 (3.5%)
mels_N
> 3, Missing: 3714 (3.5%)
hvac_N
> 4, Missing: 5232 (4.9%)
hvac_S
> 5, Missing: 5232 (4.9%)


In [None]:
# Show the rows whose timestamp lies more than `umbral` after the previous
# row, i.e. the places where the time grid is broken.
umbral = '00:15:00'
# Only the time step is needed, so difference the date column alone instead of
# every column of the frame.
a = df[['date']].diff()
df[a['date'] > pd.Timedelta(umbral)]

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['mels_S'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [5]:
df=df[df.date<'2020-03-01']

In [None]:
copia = df.copy()

In [None]:
for i in range(1,df.columns.size):
    clean_df_var(df, i, show=True)


In [None]:
df.to_csv('../data_postprocess/data_nuevo/ele_def.csv', sep=',', header=True, index=False)

'/Users/maguado/Desktop/TFG/time-series-building59'

In [25]:
# Importamos csv para comparar 
energy_use = pd.read_csv('../data_postprocess/data_nuevo/ele_def.csv')
energy_use['date']= pd.to_datetime(energy_use['date'])

In [27]:
# Position of the column to inspect.
i = 4

fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    subplot_titles=("Final", "Bruto"),
)

# Top panel: energy_use (reloaded from ele_def.csv); bottom panel: df.
for panel, frame in enumerate((energy_use, df), start=1):
    fig.add_trace(
        go.Scatter(x=frame.date, y=frame[frame.columns[i]]),
        row=panel, col=1
    )

fig.update_layout(
    legend_orientation="h",
    xaxis2_rangeslider_visible=True,
    xaxis2_rangeslider_thickness=0.1,
    height=600,
    width=800,
    title_text=df.columns[i],
)
fig.show()

## Site Weather
Este fichero csv tiene 5 variables distintas, vamos a comprobar en qué intervalo fallan:


In [38]:
# Load the raw site-weather file and align it to its regular time grid.
file = 'site_weather.csv'
df = pd.read_csv(path+'/'+file)
path_2 = '../data_postprocess/data_done'
print(path+'/'+file)

# Time window and sampling frequency configured for this file.
start, end, freq = starts[file], ends[file], freqs[file]

df['date'] = pd.to_datetime(df['date'])
df = df[df.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')

df.sort_values(by='date', inplace=True)
summary2(df)

../data/site_weather.csv
date
> 0, Missing: 0 (0.0%)
air_temp_set_1
> 1, Missing: 0 (0.0%)
air_temp_set_2
> 2, Missing: 0 (0.0%)
dew_point_temperature_set_1d
> 3, Missing: 0 (0.0%)
relative_humidity_set_1
> 4, Missing: 0 (0.0%)
solar_radiation_set_1
> 5, Missing: 0 (0.0%)


In [None]:
import missingno as msno
missingdata_df = df.columns[df.isnull().any()].tolist()


In [None]:
missingdata_df

In [40]:
df.to_csv('../data_postprocess/data_nuevo/site_weather_def.csv', sep=',', header=True, index=False)

## Zone Temp Exterior


In [None]:
# Load the raw exterior-zone temperature file and align it to its time grid.
file = 'zone_temp_exterior.csv'
df = pd.read_csv(path+'/'+file)
path_2 = '../data_postprocess/data_done'
print(path+'/'+file)

# Time window and sampling frequency configured for this file.
start, end, freq = starts[file], ends[file], freqs[file]

df['date'] = pd.to_datetime(df['date'])
df = df[df.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')

df.sort_values(by='date', inplace=True)
summary2(df)

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['zone_016_temp'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
import missingno as msno
missingdata_df = df.columns[df.isnull().any()].tolist()
msno.matrix(df[missingdata_df])

Más o menos todas las variables fallan en un mismo intervalo, por lo que vamos a utilizar el algoritmo anterior. Sin embargo, como tenemos una densidad mayor de huecos en el principio de la serie temporal, vamos a proceder a la inversa, es decir, aplicando el algoritmo de atrás hacia delante. 
Tal y como hemos visto en otras variables, el año 2020 a partir de marzo no sigue los valores esperados, por lo que no lo vamos a utilizar para imputar los datos.

Para comprobar cuánto tarda y poder ajustar si hace falta el criterio de días en el hueco-semanas de datos, vamos a ejecutar por separado el algoritmo en la primera variable.

In [None]:
df = df[df.date <= '2020-03-01']

In [None]:
df =df.iloc[::-1].reset_index(drop=True)

#### Downsampling

Vamos a calcular la desviación estándar dentro de cada intervalo de 15 minutos para comprobar que no perdemos demasiada información al agregar:

In [None]:
# Helper keys for the dispersion check: quarter of the hour (0-3) plus the
# calendar fields. The quarter is computed with a vectorised integer division
# instead of a per-row lambda, and `assign` builds a new frame rather than
# writing columns into the existing one.
df = df.assign(
    group=df.date.dt.minute // 15,
    day=df.date.dt.day,
    month=df.date.dt.month,
    year=df.date.dt.year,
    hour=df.date.dt.hour,
)

In [None]:
std =df.groupby(['group', 'day', 'month', 'year', 'hour']).std()

In [None]:
std.head()

In [None]:
i=2
sns.displot(std[std.columns[i]])

In [None]:
# Downsample to 15-minute means, newest row first (this section processes the
# series backwards). The helper columns created for the dispersion check are
# dropped first; otherwise they are averaged here and later imputed as if they
# were sensor variables. '15min' replaces the deprecated '15T' alias.
df = (
    df.drop(columns=['group', 'day', 'month', 'year', 'hour'], errors='ignore')
    .resample('15min', on='date')
    .mean()
    .reset_index()
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)

In [None]:
df.head()

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['zone_016_temp'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
i=1
clean_df_var(df, i)


In [None]:
for i in range(2, df.columns.size):
    clean_df_var(df, i)

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['zone_016_temp'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
#df.to_csv('../data_postprocess/data_nuevo/zone_temp_exterior_def.csv', index=False, header=True)

In [31]:
df=pd.read_csv('../data_postprocess/data_nuevo/zone_temp_exterior_def.csv')
df['date']=pd.to_datetime(df['date'])

In [None]:
# Rebuild the un-imputed series from the raw file so that each completed
# variable can be compared against it one by one.
file = 'zone_temp_exterior.csv'
df_2 = pd.read_csv(path+'/'+file)
start, end, freq = starts[file], ends[file], freqs[file]

df_2['date'] = pd.to_datetime(df_2['date'])
df_2 = df_2[df_2.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df_2 = pd.merge(df_2, helper, on='date', how='outer').sort_values('date')

df_2.sort_values(by='date', inplace=True)
# Cut at 2020-03-01, reverse the rows, then resample to 15-minute means
# ordered newest first.
df_2 = df_2[df_2.date <= '2020-03-01'].iloc[::-1].reset_index(drop=True)
df_2 = (
    df_2.resample('15T', on='date')
    .mean()
    .reset_index()
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)


In [None]:
# Position of the column to inspect.
i=2

# Row 1 plots df_2 (raw file re-processed without imputation) and row 2 plots
# df (reloaded from the imputed CSV), so the titles must read "Bruto" then
# "Final"; they were previously swapped.
fig = make_subplots(rows=2, cols=1,  shared_xaxes=True, subplot_titles=("Bruto",  "Final") )

fig.add_trace(
    go.Scatter(x=df_2.date , y=df_2[df_2.columns[i]]),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=df.date , y=df[df.columns[i]]),
    row=2, col=1
)

fig.update_layout(legend_orientation="h", 
             xaxis2_rangeslider_visible=True, xaxis2_rangeslider_thickness=0.1, height=600, width=800, title_text=df.columns[i] )
fig.show()

## Zone Temp Interior


In [None]:
# Load the raw interior-zone temperature file and align it to its time grid.
file = 'zone_temp_interior.csv'
df = pd.read_csv(path+'/'+file)
path_2 = '../data_postprocess/data_done'
print(path+'/'+file)

# Time window and sampling frequency configured for this file.
start, end, freq = starts[file], ends[file], freqs[file]

df['date'] = pd.to_datetime(df['date'])
df = df[df.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')

df.sort_values(by='date', inplace=True)
summary2(df)

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['cerc_templogger_1'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
import missingno as msno
missingdata_df = df.columns[df.isnull().any()].tolist()
msno.matrix(df[missingdata_df])

In [None]:
df = df[df.date <= '2020-03-01']

In [None]:
#df =df.iloc[::-1].reset_index(drop=True)

#### Downsampling

Vamos a calcular la desviación estándar dentro de cada intervalo de 15 minutos para comprobar que no perdemos demasiada información al agregar:

In [None]:
# Helper keys for the dispersion check: quarter of the hour (0-3) plus the
# calendar fields. The quarter is computed with a vectorised integer division
# instead of a per-row lambda, and `assign` builds a new frame rather than
# writing columns into a frame that was obtained by boolean filtering.
df = df.assign(
    group=df.date.dt.minute // 15,
    day=df.date.dt.day,
    month=df.date.dt.month,
    year=df.date.dt.year,
    hour=df.date.dt.hour,
)

In [None]:
std =df.groupby(['group', 'day', 'month', 'year', 'hour']).std()

In [None]:
std.head()

In [None]:
i=2
sns.displot(std[std.columns[i]])

### Outlier detection

Si jugamos un poco con el gráfico de las variables de este csv, vemos que hay algunos valores que no encajan con la tendencia de la serie temporal. Estos picos son candidatos a ser outliers. Para confirmar que son outliers, estudiamos el histograma:

In [None]:
values = df['cerc_templogger_1'].unique()
values.sort()
values[::-1][:3]

In [None]:
df['cerc_templogger_13'].describe()

In [None]:
sns.displot(df['cerc_templogger_1'], kde=True)

Claramente, los valores de 0 y 85 son valores no válidos, es decir, suponemos que son los valores máximos y mínimos de los sensores (CONFIRMAR). Los eliminamos.

In [None]:
df.iloc[:,1:] = df.iloc[:,1:].mask(np.greater(df.iloc[:,1:].values, 53.00))

In [None]:
df.iloc[:,1:] = df.iloc[:,1:].mask(np.less(df.iloc[:,1:].values, 1.00))

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['cerc_templogger_1'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
# Downsample to 15-minute means. The helper columns created for the dispersion
# check are dropped first; otherwise they are averaged here and picked up by
# the cleaning loop as if they were sensor variables. '15min' replaces the
# deprecated '15T' alias.
df = (
    df.drop(columns=['group', 'day', 'month', 'year', 'hour'], errors='ignore')
    .resample('15min', on='date')
    .mean()
    .reset_index()
)

In [None]:
df.head()

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['cerc_templogger_10'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
for i in range(11, df.columns.size):
    clean_df_var(df, i)


In [None]:
clean_df_var(df,10)

In [None]:
df.columns

In [None]:
# index=False keeps the positional index out of the file (as in the ele and
# site_weather exports), so the frame read back has no extra leading column
# and its column positions match the original frame.
df.to_csv('../data_postprocess/data_nuevo/zone_temp_interior_def.csv', index=False)

In [33]:
df=pd.read_csv('../data_postprocess/data_nuevo/zone_temp_interior_def.csv')
df['date']=pd.to_datetime(df['date'])

In [None]:

# Rebuild the un-imputed series from the raw file so that each completed
# variable can be compared against it one by one.
file = 'zone_temp_interior.csv'
df_2 = pd.read_csv(path+'/'+file)
start, end, freq = starts[file], ends[file], freqs[file]

df_2['date'] = pd.to_datetime(df_2['date'])
df_2 = df_2[df_2.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df_2 = pd.merge(df_2, helper, on='date', how='outer').sort_values('date')

df_2.sort_values(by='date', inplace=True)
# Cut at 2020-03-01, reverse the rows, then resample to 15-minute means.
df_2 = df_2[df_2.date <= '2020-03-01'].iloc[::-1].reset_index(drop=True)
df_2 = df_2.resample('15T', on='date').mean().reset_index()

# Blank out readings above 53 and then readings below 1 in every column
# after the date.
readings = df_2.iloc[:, 1:]
df_2.iloc[:, 1:] = readings.mask(readings.values > 53.00)

readings = df_2.iloc[:, 1:]
df_2.iloc[:, 1:] = readings.mask(readings.values < 1.00)


In [None]:
# Position of the column to inspect.
i = 11

fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    subplot_titles=("Bruto", "Final"),
)

# Top panel: df_2 (raw file re-processed); bottom panel: df (imputed CSV).
for panel, frame in enumerate((df_2, df), start=1):
    fig.add_trace(
        go.Scatter(x=frame.date, y=frame[frame.columns[i]]),
        row=panel, col=1
    )

fig.update_layout(
    legend_orientation="h",
    xaxis2_rangeslider_visible=True,
    xaxis2_rangeslider_thickness=0.1,
    height=600,
    width=800,
    title_text=df.columns[i],
)
fig.show()

## Indoor environmental data- Heating


In [None]:
# Load the raw heating set-point file and align it to its regular time grid.
file = 'zone_temp_sp_h.csv'
df = pd.read_csv(path+'/'+file)
path_2 = '../data_postprocess/data_done'
print(path+'/'+file)

# Time window and sampling frequency configured for this file.
start, end, freq = starts[file], ends[file], freqs[file]

df['date'] = pd.to_datetime(df['date'])
df = df[df.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')

df.sort_values(by='date', inplace=True)
summary2(df)

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['zone_064_heating_sp'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
import missingno as msno
missingdata_df = df.columns[df.isnull().any()].tolist()
msno.matrix(df[missingdata_df])

In [None]:
df = df[df.date <= '2020-03-01']

In [None]:
#df =df.iloc[::-1].reset_index(drop=True)

#### Downsampling

Vamos a calcular la desviación estándar dentro de cada intervalo de 15 minutos para comprobar que no perdemos demasiada información al agregar:

In [None]:
# Helper keys for the dispersion check: quarter of the hour (0-3) plus the
# calendar fields. The quarter is computed with a vectorised integer division
# instead of a per-row lambda, and `assign` builds a new frame rather than
# writing columns into a frame that was obtained by boolean filtering.
df = df.assign(
    group=df.date.dt.minute // 15,
    day=df.date.dt.day,
    month=df.date.dt.month,
    year=df.date.dt.year,
    hour=df.date.dt.hour,
)

In [None]:
std =df.groupby(['group', 'day', 'month', 'year', 'hour']).std()

In [None]:
std.head()

In [None]:
i=2
sns.displot(std[std.columns[i]])

In [None]:
# Downsample to 15-minute means. '15min' is the non-deprecated spelling of the
# '15T' alias and matches the frequency string used elsewhere in this file.
df = df.resample('15min', on='date').mean().reset_index()

In [None]:
df.head()

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['zone_063_heating_sp'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
df.drop(['group', 'day', 'month', 'year', 'hour'], axis=1, inplace=True)

In [None]:
for i in range(1, df.columns.size):
    clean_df_var(df, i)


In [None]:
df.columns

In [None]:
# index=False keeps the positional index out of the file (as in the ele and
# site_weather exports), so the frame read back has no extra leading column
# and its column positions match the original frame.
df.to_csv('../data_postprocess/data_nuevo/zone_temp_sp_h_def.csv', index=False)

In [35]:
df=pd.read_csv('../data_postprocess/data_nuevo/zone_temp_sp_h_def.csv')
df['date']=pd.to_datetime(df['date'])

In [None]:

# Rebuild the un-imputed series from the raw file so that each completed
# variable can be compared against it one by one.
file = 'zone_temp_sp_h.csv'
df_2 = pd.read_csv(path+'/'+file)
start, end, freq = starts[file], ends[file], freqs[file]

df_2['date'] = pd.to_datetime(df_2['date'])
df_2 = df_2[df_2.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df_2 = pd.merge(df_2, helper, on='date', how='outer').sort_values('date')

df_2.sort_values(by='date', inplace=True)
# Cut at 2020-03-01, reverse the rows, then resample to 15-minute means.
df_2 = df_2[df_2.date <= '2020-03-01'].iloc[::-1].reset_index(drop=True)
df_2 = df_2.resample('15T', on='date').mean().reset_index()



In [None]:
# Position of the column to inspect.
i = 5

fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    subplot_titles=("Bruto", "Final"),
)

# Top panel: df_2 (raw file re-processed); bottom panel: df (imputed CSV).
for panel, frame in enumerate((df_2, df), start=1):
    fig.add_trace(
        go.Scatter(x=frame.date, y=frame[frame.columns[i]]),
        row=panel, col=1
    )

fig.update_layout(
    legend_orientation="h",
    xaxis2_rangeslider_visible=True,
    xaxis2_rangeslider_thickness=0.1,
    height=600,
    width=800,
    title_text=df.columns[i],
)
fig.show()

## Indoor environmental data- Cooling


In [None]:
# Load the raw cooling set-point file and align it to its regular time grid.
file = 'zone_temp_sp_c.csv'
df = pd.read_csv(path+'/'+file)
path_2 = '../data_postprocess/data_done'
print(path+'/'+file)

# Time window and sampling frequency configured for this file.
start, end, freq = starts[file], ends[file], freqs[file]

df['date'] = pd.to_datetime(df['date'])
df = df[df.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df = pd.merge(df, helper, on='date', how='outer').sort_values('date')

df.sort_values(by='date', inplace=True)
summary2(df)

In [None]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=df.date , y=df['zone_064_cooling_sp'])
)
fig.update_layout(legend_orientation="v", 
             xaxis_rangeslider_visible=True, xaxis_rangeslider_thickness=0.1, height=600, width=800 )
fig.show()

In [None]:
import missingno as msno
missingdata_df = df.columns[df.isnull().any()].tolist()
msno.matrix(df[missingdata_df])

In [None]:
df = df[df.date <= '2020-03-01']

In [None]:
#df =df.iloc[::-1].reset_index(drop=True)

#### Downsampling

Vamos a calcular la desviación estándar dentro de cada intervalo de 15 minutos para comprobar que no perdemos demasiada información al agregar:

In [None]:
# Helper keys for the dispersion check: quarter of the hour (0-3) plus the
# calendar fields. The quarter is computed with a vectorised integer division
# instead of a per-row lambda, and `assign` builds a new frame rather than
# writing columns into a frame that was obtained by boolean filtering.
df = df.assign(
    group=df.date.dt.minute // 15,
    day=df.date.dt.day,
    month=df.date.dt.month,
    year=df.date.dt.year,
    hour=df.date.dt.hour,
)

In [None]:
std =df.groupby(['group', 'day', 'month', 'year', 'hour']).std()

In [None]:
std.head()

In [None]:
i=2
sns.displot(std[std.columns[i]])

In [None]:
# Downsample to 15-minute means. '15min' is the non-deprecated spelling of the
# '15T' alias and matches the frequency string used elsewhere in this file.
df = df.resample('15min', on='date').mean().reset_index()

In [None]:
df.head()

In [None]:
df.drop(['group', 'day', 'month', 'year', 'hour'], axis=1, inplace=True)

In [None]:
for i in range(1, df.columns.size):
    clean_df_var(df, i)


In [None]:
# index=False keeps the positional index out of the file (as in the ele and
# site_weather exports), so the frame read back has no extra leading column
# and its column positions match the original frame.
df.to_csv('../data_postprocess/data_nuevo/zone_temp_sp_c_def.csv', index=False)

In [37]:
df=pd.read_csv('../data_postprocess/data_nuevo/zone_temp_sp_c_def.csv')
df['date']=pd.to_datetime(df['date'])

FileNotFoundError: [Errno 2] No such file or directory: '../data_postprocess/data_nuevo/zone_temp_sp_c_def.csv'

In [None]:

# Rebuild the un-imputed series from the raw file so that each completed
# variable can be compared against it one by one.
file = 'zone_temp_sp_c.csv'
df_2 = pd.read_csv(path+'/'+file)
start, end, freq = starts[file], ends[file], freqs[file]

df_2['date'] = pd.to_datetime(df_2['date'])
df_2 = df_2[df_2.date.between(start, end)]

# The outer merge adds a NaN row for every grid timestamp missing from the file.
helper = pd.DataFrame({'date': pd.date_range(start, end, freq=freq)})
df_2 = pd.merge(df_2, helper, on='date', how='outer').sort_values('date')

df_2.sort_values(by='date', inplace=True)
# Cut at 2020-03-01, reverse the rows, then resample to 15-minute means.
df_2 = df_2[df_2.date <= '2020-03-01'].iloc[::-1].reset_index(drop=True)
df_2 = df_2.resample('15T', on='date').mean().reset_index()



In [None]:
# Position of the column to inspect.
i = 5

fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    subplot_titles=("Bruto", "Final"),
)

# Top panel: df_2 (raw file re-processed); bottom panel: df (imputed CSV).
for panel, frame in enumerate((df_2, df), start=1):
    fig.add_trace(
        go.Scatter(x=frame.date, y=frame[frame.columns[i]]),
        row=panel, col=1
    )

fig.update_layout(
    legend_orientation="h",
    xaxis2_rangeslider_visible=True,
    xaxis2_rangeslider_thickness=0.1,
    height=600,
    width=800,
    title_text=df.columns[i],
)
fig.show()