In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn.neighbors
import sys
# NOTE(review): absolute, user-specific Windows paths; they only resolve on the
# original author's machine. The three project-local imports below presumably
# live in these directories -- confirm before relocating the notebook.
sys.path.append('C:/Users/petera/Documents/Envirodual/paths')
sys.path.append('C:/Users/petera/Documents/Envirodual/values')
sys.path.append('C:/Users/petera/Documents/Envirodual/utils')
# Project-local modules (not part of pandas/numpy/sklearn).
from path_definition import all_cities, city_paths, model
from preprocessing import one_hot_encode_column, decode_column
from values import OB_MID_dict, area_use_codes, area_use_columns, heated_area_codes
import warnings
# Suppresses every warning for the rest of the session, which also hides pandas
# deprecation/chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

## Read REN data

In [2]:
# Columns read from the buildings ("stavbe") table; STA_SID is the building id.
cols_stavbe = [
    'STA_SID', 'OB_MID',
    'ST_ETAZ', 'ST_PRIT_ETAZE',
    'ST_STANOVANJ', 'ST_POSLOVNIH_PROSTOROV',
    'DEJANSKA_RABA', 'ID_TIP_STAVBE',
    'LETO_IZG_STA', 'LETO_OBN_STREHE', 'LETO_OBN_FASADE',
    'ID_KONSTRUKCIJE', 'ID_OGREVANJE',
]

In [3]:
# Load the buildings table (';'-separated), keeping only the selected columns
# and indexing it by the building id STA_SID.
stavbe_path = all_cities['root'] / all_cities['stavbe']
stavbe = pd.read_csv(stavbe_path, sep=';', usecols=cols_stavbe, index_col='STA_SID')

In [4]:
# Columns read from the building-parts ("delistavb") table.
cols_delistavb = [
    'STA_SID',
    'DEJANSKA_RABA',
    'UPOR_POV_STAN',
    'NETO_TLORIS_POV_DST',
    'LETO_OBN_OKEN',
    'ID_POCIT_RABA',
]

# Load the building parts; the file uses ';' as separator and ',' as decimal mark.
delistavb_path = all_cities['root'] / all_cities['delistavb']
delistavb = pd.read_csv(delistavb_path, sep=';', decimal=',', usecols=cols_delistavb)

# Keep only the parts that belong to a building present in `stavbe`.
delistavb = delistavb.loc[delistavb['STA_SID'].isin(stavbe.index.unique())]

In [5]:
# Load the code-list table ("sifranti"); later cells use its ID, IME and
# POLJE_PK columns to translate codes into names.
sifranti_path = all_cities['root'] / all_cities['sifranti']
sifranti = pd.read_csv(sifranti_path, sep=';')

# Temperature deficit

In [6]:
# Load the cleaned temperature deficit per building, indexed by STA_SID.
file_path_temp_deficit = all_cities['root'] / all_cities['temperature_deficit_cleaned']
temp_deficit = pd.read_csv(
    file_path_temp_deficit,
    usecols=['TEMP_DEFICIT', 'STA_SID'],
    index_col='STA_SID',
)

# Read audits

In [7]:
# Every listed audit column is parsed as an integer.
dtype = dict.fromkeys(
    ['AREA_AUDIT', 'TOTAL_HEAT', 'HEAT_m2', 'YEAR_MEASURED', 'STA_SID', 'Y_C', 'X_C'],
    'int',
)

# Load the cleaned energy audits, indexed by the building id STA_SID.
file_path_audits = all_cities['root'] / all_cities['audits_cleaned_for_model']
audits = pd.read_csv(file_path_audits, dtype=dtype, index_col='STA_SID')

In [8]:
# Audits that have a matching building record (inner join on the STA_SID index),
# extended with the temperature deficit where one exists (left join).
df = (
    audits
    .join(stavbe, how='inner')
    .join(temp_deficit, how='left')
)

In [9]:
# Compare row/column counts before and after the joins.
(audits.shape, df.shape)

((2826, 6), (2826, 19))

# Missing data REN

In [10]:
# Replace every missing value in the building-parts table with 0
# (no rows are dropped here; missing areas simply count as 0).
delistavb = delistavb.fillna(0)

In [11]:
# Total areas per building: sum the part-level areas by STA_SID and attach them
# to `df` (assignment aligns on the STA_SID index).
for total_column, part_column in (
    ('UPORABNA_POVRSINA', 'UPOR_POV_STAN'),
    ('NETO_TLORIS', 'NETO_TLORIS_POV_DST'),
):
    df[total_column] = delistavb.groupby('STA_SID')[part_column].sum()

In [12]:
# Discard buildings for which no area total could be computed.
df = df.dropna(subset=['UPORABNA_POVRSINA', 'NETO_TLORIS'])

In [13]:
# MISSING DATA: percentage of missing values per column, largest first;
# columns without any missing value are left out.
all_data_na = df.isna().sum().div(len(df)).mul(100)
fully_present = all_data_na.index[all_data_na.eq(0)]
all_data_na = all_data_na.drop(fully_present).sort_values(ascending=False)
missing_data = all_data_na.to_frame(name='Missing Ratio')
missing_data.head(20)

Unnamed: 0,Missing Ratio
LETO_OBN_FASADE,71.868365
LETO_OBN_STREHE,56.298655
TEMP_DEFICIT,14.968153
ST_PRIT_ETAZE,0.424628
ID_OGREVANJE,0.353857
ST_ETAZ,0.353857
ID_KONSTRUKCIJE,0.318471
ID_TIP_STAVBE,0.318471
LETO_IZG_STA,0.141543
DEJANSKA_RABA,0.106157


In [14]:
# Window renovation year per building: median over the parts that report a
# year (> 0), truncated to an integer before being aligned onto `df`.
df['LETO_OBN_OKEN'] = (
    delistavb.loc[delistavb['LETO_OBN_OKEN'] > 0]
    .groupby('STA_SID')['LETO_OBN_OKEN']
    .median()
    .astype(int)
)

In [15]:
# Preview the first 15 buildings, transposed so that columns read as rows.
df.iloc[:15].T

STA_SID,10001395,10001395.1,10001527,10001528,10001557,10001557.1,10001677,10001679,10001688,10001710,10001901,10001980,10001986,10001986.1,10002001
AREA_AUDIT,385.0,385.0,2411.0,416.0,979.0,979.0,1771.0,378.0,6635.0,2948.0,650.0,3621.0,4152.0,4152.0,604.0
TOTAL_HEAT,47020.0,47020.0,126354.0,20875.0,47161.0,47161.0,78073.0,19265.0,885431.0,194138.0,74132.0,292799.0,149418.0,149418.0,31438.0
HEAT_m2,122.0,122.0,52.0,50.0,48.0,48.0,44.0,51.0,133.0,66.0,114.0,81.0,36.0,36.0,52.0
YEAR_MEASURED,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2014.0,2013.0,2014.0,2013.0,2014.0,2013.0,2013.0,2014.0
Y_C,401277.0,401275.0,401188.0,401181.0,401042.0,401035.0,401272.0,401312.0,401373.0,401389.0,401342.0,401473.0,401350.0,401361.0,401029.0
X_C,45364.0,45376.0,45772.0,45735.0,45680.0,45674.0,45684.0,45684.0,45687.0,45658.0,45222.0,44958.0,45065.0,45047.0,45424.0
OB_MID,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0,11027776.0
ST_ETAZ,6.0,6.0,3.0,3.0,4.0,4.0,5.0,3.0,6.0,3.0,1.0,2.0,2.0,2.0,3.0
ST_PRIT_ETAZE,3.0,3.0,3.0,3.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
ST_STANOVANJ,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
def fill_and_mark(df, column, value=0):
    '''Fill missing values of one column and flag which rows were filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column whose missing values are filled.
    value : scalar, default 0
        Replacement for the missing values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing values of ``column`` replaced by
        ``value`` and an extra integer column ``'NA_' + column`` that is 1 on
        the rows that were filled and 0 elsewhere.
    '''
    df_ = df.copy()
    # Flag by position (boolean mask) rather than by index label: with repeated
    # index labels a label lookup would also flag rows that were not missing.
    missing = df_[column].isna()
    df_['NA_' + column] = missing.astype('int64')
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column, which does not update the frame under Copy-on-Write.
    df_[column] = df_[column].fillna(value)
    return df_

In [17]:
# A missing construction year (LETO_IZG_STA) is replaced by the median
# construction year; filled rows are flagged in NA_LETO_IZG_STA.
v = df['LETO_IZG_STA'].median()
df = fill_and_mark(df, column='LETO_IZG_STA', value=v)

In [18]:
# Where a renovation year is missing or 0, fall back to the construction year
# and flag the row in NA_<column>.
for c in ('LETO_OBN_OKEN', 'LETO_OBN_STREHE', 'LETO_OBN_FASADE'):
    idx = df[c].isna() | df[c].eq(0)
    df['NA_' + c] = idx.astype('int64')
    df.loc[idx, c] = df.loc[idx, 'LETO_IZG_STA']

In [19]:
# Fill the remaining gaps and flag the filled rows in NA_<column>.
# The first four columns hold nominal codes (they are decoded and one-hot
# encoded further down), so they are filled with the most frequent code: a
# median of codes can land between two codes and produce a value that is not a
# valid code. TEMP_DEFICIT is numeric and keeps the median.
for c in ['ID_TIP_STAVBE', 'ID_OGREVANJE', 'ID_KONSTRUKCIJE', 'DEJANSKA_RABA']:
    most_frequent = df[c].mode()
    # mode() is empty when the whole column is missing; nothing can be filled then.
    v = most_frequent.iloc[0] if len(most_frequent) else np.nan
    df = fill_and_mark(df, c, v)

v = df['TEMP_DEFICIT'].median()
df = fill_and_mark(df, 'TEMP_DEFICIT', v)

In [20]:
# Impute remaining missing values
def imputer_kNeighbors(df, attrib_to_impute, reference, n_neighbors=3):
    '''Predict the missing values of one column from a single reference column.

    A k-nearest-neighbours regressor is fitted on the rows where
    ``attrib_to_impute`` is greater than 0, with ``reference`` as the only
    feature, and evaluated on the rows where ``attrib_to_impute`` is NaN.
    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    attrib_to_impute : str
        Column with missing values to predict.
    reference : str
        Column used as the single predictor.
    n_neighbors : int, default 3
        Number of neighbours used by the regressor.

    Returns
    -------
    numpy.ndarray
        One predicted value per row whose ``attrib_to_impute`` is NaN, in row
        order; an empty array when nothing is missing.
    '''
    missing = df[attrib_to_impute].isna()
    if not missing.any():
        # predict() rejects an empty feature matrix; returning early also keeps
        # the calling cell re-runnable once the gaps have been filled.
        return np.empty(0)

    known = df[attrib_to_impute] > 0
    y = df.loc[known, attrib_to_impute]
    # scikit-learn expects a 2-D feature matrix, hence the (n, 1) reshape.
    x = df.loc[known, reference].values.reshape(-1, 1)

    neigh = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)
    neigh.fit(x, y)

    x_missing = df.loc[missing, reference].values.reshape(-1, 1)
    return neigh.predict(x_missing)

In [21]:
# Fill the remaining gaps in ST_ETAZ and ST_PRIT_ETAZE with k-NN predictions
# based on the net floor area (NETO_TLORIS), one column after the other.
for target_column in ('ST_ETAZ', 'ST_PRIT_ETAZE'):
    df.loc[df[target_column].isna(), target_column] = imputer_kNeighbors(
        df,
        attrib_to_impute=target_column,
        reference='NETO_TLORIS',
        n_neighbors=3,
    )

In [22]:
# Re-check after imputation: percentage of missing values per column,
# largest first, skipping columns that are complete.
all_data_na = df.isna().sum().div(len(df)).mul(100)
fully_present = all_data_na.index[all_data_na.eq(0)]
all_data_na = all_data_na.drop(fully_present).sort_values(ascending=False)
missing_data = all_data_na.to_frame(name='Missing Ratio')
missing_data.head(20)

Unnamed: 0,Missing Ratio
OB_MID,0.035386


## Engineer new variables

In [23]:
# Share of each building's usable area dedicated to second (holiday)
# houses/apartments, i.e. parts whose ID_POCIT_RABA equals 1622.
df['SHARE_HOLIDAY'] = delistavb[delistavb['ID_POCIT_RABA'] == 1622].groupby('STA_SID')['UPOR_POV_STAN'].sum()
# Buildings without such parts (or with zero usable area) end up as NaN -> 0.
# The result is assigned back: fillna(inplace=True) on a selected column does
# not update the frame under pandas Copy-on-Write.
df['SHARE_HOLIDAY'] = (df['SHARE_HOLIDAY'] / df['UPORABNA_POVRSINA']).fillna(0)

In [24]:
columns_one_hot_encode = ['ID_TIP_STAVBE',
                          'DEJANSKA_RABA',
                          'ID_KONSTRUKCIJE',
                          'ID_OGREVANJE']

# decode columns: translate each code into its name (IME) from sifranti
for col in columns_one_hot_encode:
    codes = df[col].unique()
    # Take ID and IME from the same sifranti row so every code gets its own
    # name. Zipping `codes` (order of first appearance in df) with the IME
    # values (sifranti row order) would pair unrelated entries.
    matched = sifranti.loc[sifranti.ID.isin(codes), ['ID', 'IME']]
    name_by_id = dict(zip(matched['ID'], matched['IME']))
    # Keys stay the code values exactly as they occur in df.
    # NOTE(review): codes absent from sifranti (e.g. NaN) get no entry; how
    # decode_column treats unmapped values is not visible here -- confirm.
    # NOTE(review): the lookup is by ID only, not restricted by POLJE_PK;
    # this assumes IDs are unique across code lists -- confirm.
    d = {code: name_by_id[code] for code in codes if code in name_by_id}
    df = decode_column(df, col, d)

for c in columns_one_hot_encode:
    df = one_hot_encode_column(df, column=c)

## Primary energy per m2

The measured sources mostly report gas consumption. In the model development script a fuel efficiency factor is applied
to obtain the HEAT value. Here we create PRIMARY (TOTAL_HEAT divided by the efficiency factor) so that it can go through the same procedure as the other sources.

In [25]:
# Efficiency factor used to convert the audited heat into primary energy.
efficiency = 0.909
df['PRIMARY'] = df['TOTAL_HEAT'].div(efficiency)

In [26]:
# Remove the raw audit columns that are no longer needed, then show the shape.
df = df.drop(columns=['AREA_AUDIT', 'HEAT_m2', 'TOTAL_HEAT', 'X_C', 'Y_C'])
df.shape

(2826, 41)

# Create area use variables

In [27]:
# One zero-initialised column per DEJANSKA_RABA name listed in sifranti.
area_use_cols = sifranti.loc[sifranti['POLJE_PK'] == 'DEJANSKA_RABA', 'IME']
for use_name in area_use_cols:
    df[use_name] = 0

In [28]:
# Keep only the building parts whose STA_SID is still present in the final table.
delistavb = delistavb.loc[delistavb['STA_SID'].isin(df.index)]

In [29]:
# For every (use code, target column) pair, sum the usable area of the parts
# with that use per building and write it into the building's row.
for use_code, use_column in zip(area_use_codes, area_use_columns):
    area_by_building = (
        delistavb.loc[delistavb['DEJANSKA_RABA'] == use_code, ['STA_SID', 'UPOR_POV_STAN']]
        .groupby('STA_SID')['UPOR_POV_STAN']
        .sum()
    )
    df.loc[area_by_building.index, use_column] = area_by_building

In [30]:
# sum up heated area: usable area of the parts whose use code is in heated_area_codes
ds = delistavb.loc[delistavb.DEJANSKA_RABA.isin(heated_area_codes)]
df['HEATED_AREA'] = ds.groupby('STA_SID')['UPOR_POV_STAN'].sum()
# Buildings without any heated part get 0. The result is assigned back because
# fillna(inplace=True) on a selected column does not update the frame under
# pandas Copy-on-Write.
df['HEATED_AREA'] = df['HEATED_AREA'].fillna(0)

# Save result

In [31]:
# Final check before saving: percentage of missing values per column,
# largest first, skipping columns that are complete.
all_data_na = df.isna().sum().div(len(df)).mul(100)
fully_present = all_data_na.index[all_data_na.eq(0)]
all_data_na = all_data_na.drop(fully_present).sort_values(ascending=False)
missing_data = all_data_na.to_frame(name='Missing Ratio')
missing_data.head(20)

Unnamed: 0,Missing Ratio
OB_MID,0.035386


In [32]:
# Write the prepared table (index included) to the model input location.
file_path = all_cities['root'] / model['audits_cleaned_for_model']
df.to_csv(file_path)

In [33]:
#df[df.HEAT_m2.between(5, 1300)].HEAT_m2.mean() # Ljubljana 158.64 kWh

In [37]:
# Number of rows whose heating type (ID_OGREVANJE) had to be filled in.
df['NA_ID_OGREVANJE'].sum()

10