# 1_Data Acquisition
Aim to get all buildings data from OpenStreetMap with correct administrative tag (postalcodes, regional-key, area-key):

1. Access [Postleitzahl](https://www.suche-postleitzahl.org/) to get list of postal codes in Germany (**manual**)
1. Access [Regional Statistics](https://www.regionalstatistik.de/) to get demographics data for all regional-key (regionalschluessel represents municipal) (**manual**)
1. Access Overpass API (OpenStreetMap) to get building footprints for all postal codes (**automate in this notebook**)

## Initialization

In [1]:
import pandas as pd
import numpy as np
import sys
import io

import logging 

from datetime import datetime
timestamp = datetime.now().strftime("%d%m%y_%H%M")

### Load custom modules

In [2]:
pkg_path = '../src/'

sys.path.append(pkg_path)
import data_acquisition as da

In [3]:
# Reload module (incase new update)
import importlib
importlib.reload(da)

<module 'data_acquisition' from '../src\\data_acquisition.py'>

In [4]:
# Define logging module
logging.basicConfig(filename=f'../log/{timestamp}_OSM_crawler.log', 
                    filemode='w',
                    format='%(asctime)s %(message)s', 
                    datefmt='%m/%d/%Y %I:%M:%S%p',
                    level=logging.INFO)

logging.info('Start logging...')

## Inputs / Outputs

In [5]:
plz_csv = '../data/01_raw/plz_einwohner.csv'

In [6]:
# Postal code list with population data (2011)
plz_de = pd.read_csv(plz_csv,
                    dtype={'plz': str, 'einwhoner': int}) # specify column types

In [7]:
# Get all existing PLZs
import os
name_list = os.listdir('../data/01_raw/buildings_plz/')

done_plz = [x.split('.')[0].split('_')[1] for x in name_list if 'buildings' in x]

In [8]:
# Update to get to-be-crawled list
plz_de = plz_de[plz_de.plz.isin(done_plz) == False].reset_index(drop = True)

In [9]:
start = 0
end = len(plz_de)

logging.info('Start from postal code {0} at {1}/{2}'.format(plz_de.plz[start], start, end))

## Scan OSM for buildings per postal codes

Save result to each postal code csv

In [11]:
while start <= end:
    # Get all the building foot prints in target postal code
    target_plz = plz_de.plz[start]
    
    start = start + 1
    
    # Extract buildings
    results_df = da.get_buildings_plz(target_plz)
    
    plz_path = f'../data/01_raw/buildings_plz/buildings_{target_plz}.csv'
    
    # Save results
    if results_df.empty == True:
        logging.error('Can not extract buildings for postal code {0}'.format(target_plz))
    else:
        # Saving files
        da.save_building_result(results_df, target_plz, plz_path)
        logging.info('Complete extraction for postal code {0} at position {1}/{2}'.format(target_plz, start, end))

KeyError: 354

In [None]:
logging.shutdown()