# Regionalne Izby Obrachunkowe

In [1]:
import numpy as np
import pandas as pd

# Load the spreadsheet and discard the row labelled 0, which holds no address.
df_data = pd.read_excel('../data/raw/rio/Izby-obrachunkowe.xlsx').drop([0])

# Seed the global NumPy generator so the random ordering below is repeatable.
np.random.seed(seed=21)

# The "Adres" column is read in pairs of consecutive rows: the first row of a
# pair is street + number, the second is post code + city. A trailing unpaired
# row (if any) is ignored.
address_column = df_data["Adres"]
addresses = []
for street_and_number, post_code_and_city in zip(address_column.iloc[0::2],
                                                 address_column.iloc[1::2]):
    # make data a bit dirty: a coin flip decides which half comes first
    street_first = np.random.randint(low=0, high=2) == 0
    if street_first:
        ordered_parts = (street_and_number, post_code_and_city)
    else:
        ordered_parts = (post_code_and_city, street_and_number)
    addresses.append("{} {}".format(*ordered_parts))

# Write one address per line, without header or index column.
df_addresses1 = pd.DataFrame(addresses)
df_addresses1.to_csv('../data/processed/rio.csv', header=False, index=False, encoding='utf-8')
print('done!')

done!


# Poczta Polska

In [2]:
import xml.etree.ElementTree as ET
import os

root_directory = '../data/raw/poczta/'

addresses = []

# Process the files in sorted order: os.listdir() returns entries in arbitrary
# order, and the random shuffling below is only repeatable when the records
# are always visited in the same sequence.
for filename in sorted(os.listdir(root_directory)):
    full_filename = os.path.join(root_directory, filename)
    print('reading file: ', filename)

    # read and parse xml file
    doc_tree = ET.parse(full_filename)
    doc_root = doc_tree.getroot()

    # every <r> element is one record; its attributes carry the address parts
    for record in doc_root.iter('r'):
        city = record.get('m')
        post_code = record.get('k')
        street = record.get('u')

        # make data a bit dirty: randomly put the street first or last.
        # The draw is made for every record, so the random sequence consumed
        # by later cells does not depend on which attributes are present.
        if np.random.randint(low=0, high=2) == 0:
            parts = (street, post_code, city)
        else:
            parts = (post_code, city, street)

        # Element.get() returns None for a missing attribute; leave such parts
        # out instead of writing the literal text "None" into the address.
        addresses.append(" ".join(part for part in parts if part is not None))

# save data to file (one address per line, no header or index column)
df_addresses2 = pd.DataFrame(addresses)
df_addresses2.to_csv('../data/processed/poczta.csv', header=False, index=False, encoding='utf-8')
print('Saved all data. Done!')

reading file:  02.xml
reading file:  04.xml
reading file:  06.xml
reading file:  08.xml
reading file:  10.xml
reading file:  12.xml
reading file:  14.xml
reading file:  16.xml
reading file:  18.xml
reading file:  20.xml
reading file:  22.xml
reading file:  24.xml
reading file:  26.xml
reading file:  28.xml
reading file:  30.xml
reading file:  32.xml
Saved all data. Done!


# Panorama Firm

In [3]:
# Panorama Firm listings carry no post codes, so they are filled in from a
# separate post-code table, loaded here from a semicolon-separated CSV file.
df_postcodes = pd.read_csv(
    '../data/raw/kody_pocztowe/kody.csv',
    sep=';',
)

def get_postcode(city, default='00-000'):
    """Return a randomly chosen post code for ``city``.

    Selects the rows of the module-level ``df_postcodes`` frame whose
    "MIEJSCOWOŚĆ" column equals ``city`` exactly, samples one of them at
    random and returns its "KOD POCZTOWY" value.

    Parameters
    ----------
    city : str
        City name to look up (exact, case-sensitive comparison).
    default : str, optional
        Value returned when no row matches ``city``. Defaults to '00-000',
        the placeholder the notebook has always used.

    Returns
    -------
    The sampled "KOD POCZTOWY" value, or ``default`` if the city is unknown.
    """
    postcodes_in_city = df_postcodes[df_postcodes["MIEJSCOWOŚĆ"] == city]
    if postcodes_in_city.shape[0] == 0:
        return default
    # sample(1) picks one matching row at random when a city has several codes
    return postcodes_in_city.sample(1).iloc[0]["KOD POCZTOWY"]

In [4]:
from bs4 import BeautifulSoup
import sys

root_directory = '../data/raw/panorama-firm/'

addresses = []

# Process the files in sorted order: os.listdir() returns entries in arbitrary
# order, and the random choices below are only repeatable when the pages are
# always visited in the same sequence.
for filename in sorted(os.listdir(root_directory)):
    full_filename = os.path.join(root_directory, filename)
    print('reading file: ', full_filename)

    # read file content; the context manager closes the file even if read() fails
    with open(full_filename, 'r', encoding="utf-8") as file:
        html_doc = file.read()

    # parse HTML document and get what interests us most:
    # the <span> siblings that follow an <i class="icon-location"> element
    soup = BeautifulSoup(html_doc, 'html.parser')
    for span in soup.select("i.icon-location ~ span"):
        # get rid of whitespaces from HTML file
        address = span.text.strip()

        if "," in address:
            # text before the first comma is street + number; everything after
            # it (with any further commas removed) is treated as the city
            comma_at = address.index(",")
            street_and_number = address[:comma_at].strip()
            city = address[comma_at + 1:].replace(",", "").strip()
            postcode = get_postcode(city)

            # make data a bit dirty: randomly put the street first or last
            if np.random.randint(low=0, high=2) == 0:
                addresses.append("{} {} {}".format(street_and_number, city, postcode))
            else:
                addresses.append("{} {} {}".format(postcode, city, street_and_number))
        else:
            # no comma to split on: keep the address text as it is
            addresses.append(address)

# save data to file (one address per line, no header or index column)
df_addresses3 = pd.DataFrame(addresses)
df_addresses3.to_csv('../data/processed/panorama-firm.csv', header=False, index=False, encoding='utf-8')
print('Saved all data. Done!')

reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-1.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-10.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-11.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-12.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-13.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-14.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-15.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-16.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-17.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-18.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-19.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-2.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wielkopolskie-20.html
reading file:  ../data/raw/panorama-firm/fryzjerzy-wi

#### Merged addresses

In [5]:
# Stack the three per-source address frames into one. ignore_index=True gives
# the result a fresh 0..n-1 index; without it each source's own 0-based labels
# would repeat, so a label lookup such as .loc[0] would return several rows.
frames = [df_addresses1, df_addresses2, df_addresses3]
df_addresses_all = pd.concat(frames, ignore_index=True)

# save data to file (one address per line, no header or index column)
df_addresses_all.to_csv('../data/processed/addresses.csv', header=False, index=False, encoding='utf-8')
print('Saved all data. Done!')

# sanity check: (total number of addresses, 1)
print(df_addresses_all.shape)

Saved all data. Done!
(7570, 1)
