# Data preparation

 **Methodology**

For each postal code:

1. Cross-check the postal code (PLZ) against the OSM data dump from Geofabrik to populate the building object info with:
    1. Region name from OSM dump
    1. Geometry info (polygon coords)
1. Calculate total area for all objects
-------------
To do list:

1. Classify buildings into types (manual)
1. Group buildings by type (residential, industrial, etc.) to get, for each type:
    1. Rectangularity (area of polygon / area of minimum bounding box of polygon)
    1. Total area

# Initialization

In [None]:
import warnings
# Ignore DeprecationWarning messages from here on so they do not clutter the cell output
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import pandas as pd
import numpy as np
import sys
import os

from pyrosm import OSM

## Load custom modules

In [None]:
import data_preparation as dp
import gemeindeverz
import helpers

In [None]:
# Reload the gemeindeverz module (in case it was updated after the first import)
import importlib
importlib.reload(gemeindeverz)

In [None]:
# Pandas display settings for this notebook: max_rows is raised to 1000,
# the column count, display width and column width options are set to None.
display_options = (
    ('display.max_rows', 1000),
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', None),
)
for option_name, option_value in display_options:
    pd.set_option(option_name, option_value)

# Input

In [None]:
# Community directory (Gemeindeverzeichnis) file, parsed below with gemeindeverz.einlesen.
# Obtain from https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/_inhalt.html
GV_path = '../data/01_raw/GV/GV100AD_301120.asc'
# CSV mapping table; its 'plz' and 'ags' columns are read as text below
plz_ags_csv = '../data/01_raw/zuordnung_plz_ort_landkreis.csv'

In [None]:
# Folder with the raw per-boundary building CSV files (names contain 'buildings')
buildings_data_location = '../data/01_raw/buildings_data/'
# Folder with the Geofabrik extracts; it is walked recursively to list the available files
geofabrik_location = '../data/01_raw/geofabrik/'
# Intermediate folder: created if missing and scanned to find already enhanced boundary ids
buildings_int_location = '../data/02_intermediate/buildings_data/'

In [None]:
# Create ags code dict for each state / region
region_ags_dict = {
    # BE_BB
    'brandenburg-latest.osm.pbf': ['11','12'],
    # BW
    'freiburg-regbez-latest.osm.pbf': ['083'],
    'karlsruhe-regbez-latest.osm.pbf': ['082'],
    'stuttgart-regbez-latest.osm.pbf': ['081'],
    'tuebingen-regbez-latest.osm.pbf': ['084'],
    # BY
    'mittelfranken-latest.osm.pbf': ['095'],
    'niederbayern-latest.osm.pbf': ['092'],
    'oberbayern-latest.osm.pbf': ['091'],
    'oberfranken-latest.osm.pbf': ['094'],
    'oberpfalz-latest.osm.pbf': ['093'],
    'schwaben-latest.osm.pbf': ['097'],
    'unterfranken-latest.osm.pbf': ['096'],
    # HB
    'bremen-latest.osm.pbf': ['04'],
    # HE
    'hessen-latest.osm.pbf': ['06'],
    # HH
    'hamburg-latest.osm.pbf': ['02'],
    # MV
    'mecklenburg-vorpommern-latest.osm.pbf': ['13'],
    # NI
    'niedersachsen-latest.osm.pbf': ['03'],
    # NW
    'arnsberg-regbez-latest.osm.pbf': ['059'],
    'detmold-regbez-latest.osm.pbf': ['057'],
    'duesseldorf-regbez-latest.osm.pbf': ['051'],
    'koeln-regbez-latest.osm.pbf': ['053'],
    'muenster-regbez-latest.osm.pbf': ['055'],
    # RP
    'rheinland-pfalz-latest.osm.pbf': ['07'],
    # SH
    'schleswig-holstein-latest.osm.pbf': ['01'],
    # SL
    'saarland-latest.osm.pbf': ['10'],
    # SN
    'sachsen-latest.osm.pbf': ['14'],
    # ST
    'sachsen-anhalt-latest.osm.pbf': ['15'],
    # TH
    'thueringen-latest.osm.pbf': ['16']
}

In [None]:
# Load the plz/ags mapping table; both code columns are read as text
# so the codes are kept exactly as written in the file
plz_ags = pd.read_csv(plz_ags_csv, dtype=dict.fromkeys(('plz', 'ags'), str))

In [None]:
# Number of distinct postal codes in the mapping table
plz_ags['plz'].nunique()

In [None]:
# Number of distinct ags codes in the mapping table
plz_ags['ags'].nunique()

In [None]:
# Column of plz_ags used as the boundary identifier; it is also part of the
# building CSV file names (buildings_<boundary_type>_<id>.csv)
boundary_type = 'ags'

In [None]:
# Community directory dataframe, read from GV_path by the project-local gemeindeverz module.
# Use this frame to manually look up the ags code of a region available on Geofabrik (inside a state);
# the filter below displays the rows whose gemeinde_bez contains 'thüringen' (case-insensitive).
com_dir_df = gemeindeverz.einlesen(GV_path)
com_dir_df[com_dir_df.gemeinde_bez.str.contains('thüringen', case = False)]

# Process

## Get buildings in region

In [None]:
# Collect the boundary ids encoded in the raw building file names:
# the third '_'-separated token of the name part before the first '.'
raw_file_names = os.listdir(buildings_data_location)
id_list = [
    name.partition('.')[0].split('_')[2]
    for name in raw_file_names
    if 'buildings' in name
]
# Preview the first ten ids
id_list[0:10]

In [None]:
# Walk the Geofabrik folder once, keep only .pbf extracts and sort them by file name.
# os.walk yields entries in an arbitrary, filesystem-dependent order; because the
# target region is picked by position further below, sorting makes that choice
# reproducible. The suffix filter keeps stray files (checksums, hidden files)
# out of the list so they cannot be handed to the OSM parser by accident.
pbf_entries = sorted(
    (name, os.path.join(path, name))
    for path, _subdirs, files in os.walk(geofabrik_location)
    for name in files
    if name.endswith('.pbf')
)
# full path pbf (same order as pbf_list)
region_list_path = [full_path for _name, full_path in pbf_entries]
# pbf name
pbf_list = [name for name, _full_path in pbf_entries]
pbf_list

In [None]:
# Position (in pbf_list / region_list_path) of the Geofabrik extract to process in this run
i = 8

In [None]:
# Select the extract at position i: its file name and its full path
target_region, target_region_path = pbf_list[i], region_list_path[i]
target_region

In [None]:
# Get the ags prefixes that belong to the target region.
# Fail right here with a descriptive message when the extract has no entry in
# region_ags_dict; a silent None would only surface later as an opaque TypeError.
if target_region not in region_ags_dict:
    raise KeyError(f'No ags codes configured in region_ags_dict for {target_region!r}')
target_ags_list = region_ags_dict[target_region]
target_ags_list

In [None]:
# Prefix length used when comparing ags codes; taken from the first entry only,
# so all entries of target_ags_list are assumed to share that length
ags_len = len(target_ags_list[0])
ags_len

In [None]:
# Initialize the OSM parser object (pyrosm) on the selected Geofabrik extract
osm = OSM(target_region_path)

In [None]:
%%time
# Load the building objects of the extract; their 'id', 'geometry' and 'timestamp'
# columns are merged into the per-boundary building data further below
buildings = osm.get_buildings()

## Get boundary_ids in region

In [None]:
# Boundary ids (column `boundary_type`) of the target region: keep the rows whose
# ags prefix of length ags_len is listed in target_ags_list
# (dp.left presumably returns the leading characters - confirm in data_preparation)
ags_prefix = dp.left(plz_ags.ags.str, ags_len)
in_target_region = ags_prefix.isin(target_ags_list)
region_id_list = (
    plz_ags[in_target_region][[boundary_type]]
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f'Number of {boundary_type}(s) in region of {target_region} is {region_id_list.shape[0]}')

## Read in boundary_type file

In [None]:
# Make sure the intermediate folder exists. exist_ok=True replaces the separate
# existence check, so there is no gap between checking and creating the folder.
os.makedirs(buildings_int_location, exist_ok=True)

In [None]:
# Check for progress of already enhanced areas: the boundary id is the third
# '_'-separated token of each 'buildings' file name (before the first '.')
name_list = os.listdir(buildings_int_location)
id_list = [x.split('.')[0].split('_')[2] for x in name_list if 'buildings' in x]

# Get to-be-enhanced list: region ids without an intermediate file yet
# (np.setdiff1d returns the sorted unique values)
region_id_list = pd.DataFrame(np.setdiff1d(region_id_list, id_list), columns = [boundary_type])
# Report with print, like the region summary earlier in the notebook: `logging` is not
# imported in the cells above, so the previous logging.info call raised a NameError
print(f'Total of {len(region_id_list)} {boundary_type}(s) in the region')

In [None]:
# Take the first boundary id from the to-be-enhanced list
boundary_id = region_id_list[boundary_type].iloc[0]
boundary_id

In [None]:
# Path of the raw building CSV for this boundary. It is built from the configured
# input folder (buildings_data_location) instead of repeating the folder literal,
# so changing the input location in one place is enough.
buildings_boundary_path = os.path.join(buildings_data_location, f'buildings_{boundary_type}_{boundary_id}.csv')
buildings_boundary_path

In [None]:
# Load the raw building objects of the current boundary.
# The tag columns listed in `dtype` are kept as text; the "nodes" column is a
# bracketed, comma-separated string that is split into a Python list.
df = pd.read_csv(
    buildings_boundary_path,
    dtype={
        'tags.addr:suburb': 'object',
        'tags.building:levels': 'object',
        'tags.source': str,
        'tags.addr:postcode': str,
    },
    converters={"nodes": lambda x: x.strip("[]").split(", ")},
)

# Drop elements without coordinates (missing center.lat)
has_center = df['center.lat'].notna()
df = df[has_center].reset_index(drop=True)

# Shorter names for the levels and postcode tag columns
df = df.rename(columns={'tags.building:levels': 'building_levels',
                        'tags.addr:postcode': 'postcode'})

# Buildings without a level count get the value 1
df.building_levels = df.building_levels.fillna(1)

f'Total of {len(df)} buildings in {boundary_type} {boundary_id}'

## Populate data into PLZ building objects

In [None]:
# Left-join the OSM geometry and timestamp onto the boundary's buildings by 'id';
# buildings without a match in the extract keep missing geometry/timestamp
osm_columns = buildings[['id', 'geometry', 'timestamp']]
df_res = pd.merge(df, osm_columns, how='left', on='id')

In [None]:
# NOTE(review): filling missing geometries with NaN looks like a no-op apart from
# possibly normalising None to NaN - confirm whether this line is still needed
df_res.geometry = df_res.geometry.fillna(np.nan)

In [None]:
# Calculate total area for all building objects.
# surface_area: result of dp.calculate_surface_area for each geometry, scaled by 10**10
# (presumably a unit conversion - confirm in data_preparation)
df_res['surface_area'] = df_res.geometry.apply(lambda x: dp.calculate_surface_area(x) * 10**10)
# total_area = number of levels * surface area. The levels are read from df_res
# (not from the pre-merge df) so both operands come from the same merged frame and
# stay row-aligned even if the merge changed the number or order of rows.
df_res['total_area'] = df_res['building_levels'].astype(int) * df_res['surface_area']

In [None]:
# Derive a building type for every row by passing its 'tags.building' value
# to dp.manual_classify_building
df_res['building_types'] = df_res['tags.building'].apply(dp.manual_classify_building)

In [None]:
# # Save result to 02_intermediate/buildings_plz/buildings_<plz>.csv
# output_path = '../data/02_intermediate/buildings_plz/'

# # create saving location folder if not exists
# if not os.path.exists(output_path):
#     os.makedirs(output_path)
# df_res.to_csv(output_path + f'buildings_{plz}.csv', index = False)