In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn.neighbors
import sys
sys.path.append('C:/Users/petera/Documents/Envirodual/paths')
sys.path.append('C:/Users/petera/Documents/Envirodual/values')
from path_definition import all_cities, city_paths, model
from values import OB_MID_dict, area_use_codes, area_use_columns, heated_area_codes
import warnings
warnings.filterwarnings('ignore')

In [10]:
city_name = 'kranj'
OB_MID = OB_MID_dict[city_name]
paths = city_paths[city_name]

## Read REN data

In [11]:
# columns to load from tables
cols_stavbe = ['STA_SID',
               'OB_MID',
               'ST_ETAZ',
               'ST_PRIT_ETAZE',
               'ST_STANOVANJ',
               'ST_POSLOVNIH_PROSTOROV',
               'DEJANSKA_RABA',
               'ID_TIP_STAVBE',
               'LETO_IZG_STA',
               'LETO_OBN_STREHE',
               'LETO_OBN_FASADE',
               'ID_KONSTRUKCIJE',
               'ID_OGREVANJE']

In [12]:
# read stavbe
stavbe_path = all_cities['root'] / all_cities['stavbe']
stavbe = pd.read_csv(
    stavbe_path,
    usecols=cols_stavbe,
    index_col='STA_SID',
    # encoding='cp1250',
    sep=';'
)

# keep only current municipality
stavbe = stavbe[stavbe.OB_MID.eq(OB_MID)]

In [13]:
cols_delistavb = ['STA_SID',
                  'DEJANSKA_RABA',
                  'UPOR_POV_STAN',
                  'NETO_TLORIS_POV_DST',
                  'LETO_OBN_OKEN',
                  'ID_POCIT_RABA']

# read delistavb
delistavb_path = all_cities['root'] / all_cities['delistavb']
delistavb = pd.read_csv(
    delistavb_path,
    usecols=cols_delistavb,
    # encoding='cp1250',
    decimal=',',
    sep=';'
)
delistavb = delistavb[delistavb.STA_SID.isin(stavbe.index.unique())]

In [14]:
# read sifranti
sifranti_path = all_cities['root'] / all_cities['sifranti']
sifranti = pd.read_csv(
    sifranti_path,
    # encoding='cp1250',
    sep=';'
)

# Temperature deficit

In [15]:
file_path_temp_deficit = all_cities['root'] / all_cities['temperature_deficit_cleaned']
temp_deficit = pd.read_csv(
    file_path_temp_deficit,
    index_col='STA_SID',
    usecols=['TEMP_DEFICIT','STA_SID']
)

# Read gas data

In [16]:
dtype = {'ZP_gas': 'int', 'STA_SID': 'int'}
file_path_gas = all_cities['root'] / paths['gas_cleaned']
gas = pd.read_csv(
    file_path_gas,
    usecols=['ZP_gas', 'STA_SID'],
    dtype=dtype,
    index_col='STA_SID'
)

In [17]:
df = gas.join(stavbe, how='inner').join(temp_deficit, how='left')

In [18]:
gas.shape, df.shape

((11667, 1), (11665, 14))

# Missing data REN

In [19]:
# drop entries where areas m2 not given
delistavb.fillna(0, inplace=True)

In [20]:
# collect total areas of building
df['UPORABNA_POVRSINA'] = delistavb.groupby('STA_SID')['UPOR_POV_STAN'].sum()
df['NETO_TLORIS'] = delistavb.groupby('STA_SID')['NETO_TLORIS_POV_DST'].sum()

In [21]:
# drop nans
df.dropna(subset=['UPORABNA_POVRSINA', 'NETO_TLORIS'], inplace=True)

In [22]:
# get mean windows age
df['LETO_OBN_OKEN'] = delistavb.groupby('STA_SID')['LETO_OBN_OKEN'].median().astype(int)

In [23]:
# MISSING DATA
all_data_na = (df.isnull().sum() / len(df)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio
LETO_OBN_FASADE,69.898834
LETO_OBN_STREHE,42.189643
TEMP_DEFICIT,15.972222
ST_PRIT_ETAZE,0.342936
ID_OGREVANJE,0.248628
ST_ETAZ,0.240055
ID_KONSTRUKCIJE,0.214335
ID_TIP_STAVBE,0.205761
LETO_IZG_STA,0.120027
DEJANSKA_RABA,0.042867


In [24]:
def fill_and_mark(df, column, value=0):
    '''fills missing values and creates new column marking which entries have been filled'''
    df_ = df.copy()
    df_['NA_'+column] = 0
    idx = df_[df_[column].isna()].index.astype('int32')
    df_.loc[idx,'NA_'+column] = 1
    df_[column].fillna(value, inplace=True)
    return df_

In [25]:
# If building rennovation year not given, set rennovation to construction year
v = df['LETO_IZG_STA'].median()

df = fill_and_mark(df, 'LETO_IZG_STA', v)
df = fill_and_mark(df, 'LETO_OBN_OKEN', v)
df = fill_and_mark(df, 'LETO_OBN_STREHE', v)
df = fill_and_mark(df, 'LETO_OBN_FASADE', v)

In [26]:
for c in ['ID_TIP_STAVBE', 'ID_OGREVANJE', 'ID_KONSTRUKCIJE', 'DEJANSKA_RABA', 'TEMP_DEFICIT']:
    v = df[c].median()
    df = fill_and_mark(df, c, v)

In [27]:
# Impute remaining missing values
def imputer_kNeighbors(df, attrib_to_impute, reference, n_neighbors=3):

    neigh = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)

    y = df[df[attrib_to_impute] > 0][attrib_to_impute]
    x = df[df[attrib_to_impute] > 0][reference].values.reshape(-1, 1)

    neigh.fit(x, y)

    x_missing = df[df[attrib_to_impute].isna()][reference].values.reshape(-1, 1)
    imputed_values = neigh.predict(x_missing)
    # df.loc[df['attrib_to_impute'].isna(), 'attrib_to_impute'] = imputed_values
    return imputed_values

In [28]:
df.loc[df['ST_ETAZ'].isna(), 'ST_ETAZ'] = (
    imputer_kNeighbors(df, attrib_to_impute='ST_ETAZ', reference='NETO_TLORIS', n_neighbors=3)
)

df.loc[df['ST_PRIT_ETAZE'].isna(), 'ST_PRIT_ETAZE'] = (
    imputer_kNeighbors(df, attrib_to_impute='ST_PRIT_ETAZE', reference='NETO_TLORIS', n_neighbors=3)
)

In [29]:
all_data_na = (df.isnull().sum() / len(df)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


## Engineer new variables

In [30]:
# add variable to represent the share of the area dedicated to second holiday houses/apartments
df['SHARE_HOLIDAY'] = delistavb[delistavb['ID_POCIT_RABA'] == 1622].groupby('STA_SID')['UPOR_POV_STAN'].sum()
df['SHARE_HOLIDAY'] = df['SHARE_HOLIDAY'] / df['UPORABNA_POVRSINA']
df['SHARE_HOLIDAY'].fillna(0, inplace=True)

In [31]:
ORIGINAL_ATTRIBUTES = ['DEJANSKA_RABA',
                       'ID_KONSTRUKCIJE',
                       'ID_OGREVANJE']

NEW_ATTRIBUTES = ['NAMENSKA_RABA',
                  'MATERIAL_NOSILNE_KONSTRUKCIJE',
                  'VRSTA_OGREVANJA']


def decode_attributes(df):
    ''' takes original categorical attributes from stavbe and produces new attributes
    based on sifranti '''
    for attrib, new_attrib in zip(ORIGINAL_ATTRIBUTES, NEW_ATTRIBUTES):
        for i in df[attrib].dropna().unique():
            df.at[df[attrib] == i, new_attrib] = sifranti[sifranti.ID == i]['IME'].values[0]

    return df.drop(ORIGINAL_ATTRIBUTES, axis=1)

df = decode_attributes(df)

In [32]:
# make categorical values with one hot encoder
def categorize(df, column):
    df_ = df.copy()
    cats = df_[column].unique()
    for c, v in enumerate(cats):
        new_col = column + '_' + str(c)
        df_[new_col] = 0
        df_.loc[df_[column] == v, new_col] = 1
    return df_.drop(column, axis=1)

In [33]:
# one hot encode category attributes
columns_one_hot_encode = ['NAMENSKA_RABA',
                          'MATERIAL_NOSILNE_KONSTRUKCIJE',
                          'VRSTA_OGREVANJA',
                          'ID_TIP_STAVBE']

for c in columns_one_hot_encode:
    df = categorize(df, column=c)

# Create area use variables

In [34]:
area_use_cols = sifranti.loc[sifranti.POLJE_PK == 'DEJANSKA_RABA']['IME']
for c in area_use_cols:
    df[c] = 0

In [35]:
#areas_use = sifranti.loc[sifranti.POLJE_PK == 'DEJANSKA_RABA'][['ID', 'IME']]

# drop STA_SID that are not in the final table
delistavb = delistavb[delistavb.STA_SID.isin(df.index)]

In [36]:
for r, c in zip(area_use_codes, area_use_columns):
    tmp = delistavb.loc[delistavb.DEJANSKA_RABA == r][['STA_SID', 'UPOR_POV_STAN']]
    tmp = tmp.groupby('STA_SID')['UPOR_POV_STAN'].sum()
    df.loc[tmp.index, c] = tmp.values
    
    # leaves out some building parts - for some reason results in higher R2
    #tmp = delistavb.loc[delistavb.DEJANSKA_RABA == r][['STA_SID', 'UPOR_POV_STAN']]
    #df.loc[tmp.STA_SID.values, c] = tmp['UPOR_POV_STAN'].values


In [37]:
# sum up heated area
ds = delistavb.loc[delistavb.DEJANSKA_RABA.isin(heated_area_codes)]
df['HEATED_AREA'] = ds.groupby('STA_SID')['UPOR_POV_STAN'].sum()
df.HEATED_AREA.fillna(0, inplace=True)

# Save result

In [38]:
all_data_na = (df.isnull().sum() / len(df)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


In [39]:
file_path = all_cities['root'] / model['gas_' + city_name]
df.to_csv(
    file_path,
)

In [40]:
d = df['ZP_gas']/df['HEATED_AREA']

In [41]:
d[(d>5) & (d<1400)].mean()

48.146366664633526

In [44]:
df.ZP_gas.sum()

306473577

In [45]:
306491872/306473577

1.0000596951951912