# Demographics
This script is used for parsing demographics data and converting GeoJson to required form.

Source:https://www.czso.cz/csu/czso/databaze-demografickych-udaju-za-obce-cr

In [2]:
import time
import pandas as pd
import numpy as np
import re
import json
import warnings
import json
import os
import unidecode

warnings.filterwarnings('ignore')

In [3]:
PATH_INTERMEDIATE = '../../data/intermediate/'
PATH_RAW = '../../data/raw/'
PATH_FINAL = '../../data/final/'
PATH_POPULATION = PATH_RAW+'population/'

## Demographics

In [4]:
## NUTS x name of the distict
# https://apl.czso.cz/irso4/export4.jsp?kodcis=101&vazcis=0&vazid=0&expvaz=1768&expatr=43&next=Dal%C5%A1%C3%AD
nuts = pd.read_excel(PATH_RAW + 'NUTS.xlsx')
nuts = nuts[['OKRES_LAUT,C,100', 'CZNUTS,C,15']]
nuts.columns = ['Okres', 'NUTS']
nuts = pd.Series(nuts.Okres.values,index=nuts.NUTS).to_dict()

In [5]:
demo = pd.DataFrame()
for name in os.listdir(PATH_POPULATION):
    district = pd.read_excel(PATH_POPULATION + name)
    district['Okres'] = nuts[name[:-5]]
    district['NUTS'] = name[:-5]
    demo = demo.append(district)

In [6]:
demo = demo[demo.Rok > 2009]

demo = demo[['Číslo\nobce', 'Název obce', 'Okres', 'NUTS', 'Rok', 'Stav 31.12.', 'Narození',
             'Zemřelí', 'Přírůstek přirozený', 'Přistě-\nhovalí',
             'Vystě-\nhovalí', 'Přírůstek migrační', 'Přírůstek celkový']]

demo.columns = ['village_num', 'village_name', 'district', 'NUTS', 'year',
                'population', 'born', 'deceased', 'born_deceased',
                'immigrants', 'emigrants', 'migration', 'change_total']

In [7]:
demo = demo[demo.population !='-']
demo = demo.applymap(lambda x: 0 if '-' == str(x) else x)

nums = demo.columns[4:]
demo[nums] = demo[nums].applymap(int)
demo

Unnamed: 0,village_num,village_name,district,NUTS,year,population,born,deceased,born_deceased,immigrants,emigrants,migration,change_total
39,539104,Bojanovice,Praha-západ,CZ020A,2010,425,5,8,-3,24,12,12,9
40,539104,Bojanovice,Praha-západ,CZ020A,2011,420,6,12,-6,17,10,7,1
41,539104,Bojanovice,Praha-západ,CZ020A,2012,417,1,4,-3,18,18,0,-3
42,539104,Bojanovice,Praha-západ,CZ020A,2013,432,5,0,5,23,13,10,15
43,539104,Bojanovice,Praha-západ,CZ020A,2014,447,2,6,-4,25,6,19,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3144,585050,Zaječí,Břeclav,CZ0644,2017,1442,18,11,7,27,29,-2,5
3145,585050,Zaječí,Břeclav,CZ0644,2018,1432,19,20,-1,17,26,-9,-10
3146,585050,Zaječí,Břeclav,CZ0644,2019,1424,13,14,-1,34,41,-7,-8
3147,585050,Zaječí,Břeclav,CZ0644,2020,1454,15,13,2,61,33,28,30


In [170]:
demo.to_csv(PATH_INTERMEDIATE+'demographics.csv',index=False)

In [8]:
demo = pd.read_csv(PATH_INTERMEDIATE+'demographics.csv')

In [9]:
praha = demo[demo.district == 'Praha']

In [10]:
# for year 2021 before update on 30/09/2022
age_categories = pd.read_excel(PATH_RAW+'vek_okres_2021.xlsx')
age_categories.columns = ['district', 'population_total', 'population_0_14', 'population_15_64', 'population_65_more',
                         'avg_age_total', 'male_avg_age', 'female_avg_age']
age_categories['year'] = 2021
age_categories

Unnamed: 0,district,population_total,population_0_14,population_15_64,population_65_more,avg_age_total,male_avg_age,female_avg_age,year
0,Česká republika,10524167,1691760,6684359,2148048,42.6861,41.2297,44.1012,2021
1,Praha,1301432,199369,863086,238977,41.3984,39.9391,42.7823,2021
2,Středočeský kraj,1415463,253371,895370,266722,41.5787,40.3684,42.7713,2021
3,Benešov,100347,17084,61929,21334,42.9294,41.7049,44.1339,2021
4,Beroun,98073,17754,62038,18281,41.4569,40.2803,42.6175,2021
...,...,...,...,...,...,...,...,...,...
86,Frýdek-Místek,209322,34038,132508,42776,42.9797,41.4839,44.4318,2021
87,Karviná,235579,34537,150361,50681,44.1635,42.4072,45.8441,2021
88,Nový Jičín,146366,24117,92376,29873,42.7262,41.2037,44.2110,2021
89,Opava,171502,27681,108487,35334,43.0780,41.4610,44.6246,2021


In [45]:
praha = pd.merge(praha, age_categories[age_categories.district == 'Praha'], on=['district', 'year'], how='left')

In [46]:
# Updated version for

district_ages = pd.DataFrame()

# population_age_2010-2021
for file_name in os.listdir(PATH_RAW+'population_age_2012-2021/'):
    
    tmp = pd.read_excel(PATH_RAW+ 'population_age_2012-2021/'+ file_name, index_col=[0,1], header=[0,1,2])
    village_name = tmp.columns.get_level_values(0)[1].split(' - ')[0]
    nuts_num = tmp.columns.get_level_values(0)[-1].split()[-1]
    # print(nuts_num, file_name)
    tmp.columns = [c[2] for c in tmp.columns]
    tmp = tmp.drop(columns=tmp.columns[-2:])
    idx_first = np.where(tmp.index.get_loc('Stav obyvatel k 31.12.') == True)[0][0]
    idx_last = np.where(tmp.index.get_loc('Sňatky celkem') == True)[0][0]
    tmp = tmp[idx_first:idx_last].T.reset_index()
    tmp.columns = ['year', 'population_total', 'population_0_14', 'population_15_64', 'population_65_more', 'avg_age_total', 'age_index_total',
                            'female_total', 'female_0_14', 'female_15_64', 'female_65_more', 'female_avg_age', 'female_age_index',
                            'male_total', 'male_0_14', 'male_15_64', 'male_65_more', 'male_avg_age', 'male_age_index']

    tmp['district'] = nuts[nuts_num]
    # tmp['village_name'] = village_name
    district_ages = district_ages.append(tmp)

# district_ages['district'] = district_ages['district_num'].apply(lambda x: nuts[x])
district_ages = district_ages.groupby(['district', 'year']).sum().reset_index()

In [47]:
district_ages_old = pd.DataFrame()

# population_age_2011-2020
for file_name in os.listdir(PATH_RAW+'population_age_2011-2020/'):
    
    tmp = pd.read_excel(PATH_RAW+ 'population_age_2011-2020/'+ file_name, index_col=[0,1], header=[0,1,2])
    village_name = tmp.columns.get_level_values(0)[1].split(' - ')[0]
    nuts_num = tmp.columns.get_level_values(0)[-1].split()[-1]
    # print(nuts_num, file_name)
    tmp.columns = [c[2] for c in tmp.columns]
    tmp = tmp.drop(columns=tmp.columns[-2:])
    idx_first = np.where(tmp.index.get_loc('Stav obyvatel k 31.12.') == True)[0][0]
    idx_last = np.where(tmp.index.get_loc('Sňatky') == True)[0][0]
    tmp = tmp[idx_first:idx_last].T.reset_index()
    tmp.columns = ['year', 'population_total', 'population_0_14', 'population_15_64', 'population_65_more', 'avg_age_total', 'age_index_total',
                            'female_total', 'female_0_14', 'female_15_64', 'female_65_more', 'female_avg_age', 'female_age_index',
                            'male_total', 'male_0_14', 'male_15_64', 'male_65_more', 'male_avg_age', 'male_age_index']

    tmp['district'] = nuts[nuts_num]
    # tmp['village_name'] = village_name
    district_ages_old = district_ages_old.append(tmp)

# district_ages_old['district'] = district_ages_old['district_num'].apply(lambda x: nuts[x])
district_ages_old = district_ages_old.groupby(['district', 'year']).sum().reset_index()
district_ages_old = district_ages_old[district_ages_old.year == 2011]

In [48]:
# merge all years together (irrelevant after recent update)
# total_age = pd.concat([age_categories, district_ages])

# append year 2011 to others
total_age = district_ages.append(district_ages_old)

In [49]:
districts_df = demo.groupby(['district', 'year']).sum().drop(columns='village_num').reset_index()
inv_nuts = {v: k for k, v in nuts.items()}
demographics_total = pd.merge(districts_df, total_age, on=['district', 'year'])
demographics_total = demographics_total.append(praha.drop(columns=['village_num', 'village_name']))
demographics_total['NUTS'] = demographics_total['district'].apply(lambda x: inv_nuts[x])
demographics_total

Unnamed: 0,district,year,population,born,deceased,born_deceased,immigrants,emigrants,migration,change_total,...,female_65_more,female_avg_age,female_age_index,male_total,male_0_14,male_15_64,male_65_more,male_avg_age,male_age_index,NUTS
0,Benešov,2011,95445,981,1023,-42,2550,1916,634,592,...,6422.0,120.043379,276.228688,48256.0,7024.0,32199.0,9033.0,128.037796,404.330835,CZ0201
1,Benešov,2012,95883,1059,1029,30,2547,2139,408,438,...,6736.0,120.587670,283.821274,48510.0,7183.0,31988.0,9339.0,128.639155,409.751593,CZ0201
2,Benešov,2013,96273,1015,977,38,2471,2119,352,390,...,7098.0,121.347742,297.743346,48693.0,7372.0,31653.0,9668.0,129.204642,416.108141,CZ0201
3,Benešov,2014,96718,1001,962,39,2737,2331,406,445,...,7421.0,122.069190,306.742200,48948.0,7523.0,31320.0,10105.0,129.883716,424.938527,CZ0201
4,Benešov,2015,97085,1086,1073,13,2554,2200,354,367,...,7695.0,122.635325,309.965347,49075.0,7649.0,30995.0,10431.0,130.295637,428.361639,CZ0201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,Praha,2017,1294513,15324,12199,3125,37976,27096,10880,14005,...,,,,,,,,,,CZ0100
8,Praha,2018,1308632,15460,12417,3043,40503,29427,11076,14119,...,,,,,,,,,,CZ0100
9,Praha,2019,1324277,14933,12178,2755,42328,29438,12890,15645,...,,,,,,,,,,CZ0100
10,Praha,2020,1335084,14713,13621,1092,40582,30867,9715,10807,...,,,,,,,,,,CZ0100


In [8]:
demographics_total['normalized'] = demographics_total['district'].apply(unidecode.unidecode)
demographics_total['normalized'] = demographics_total['normalized'].apply(lambda x: re.sub('[^a-z]', '', x.lower()))
demographics_total = demographics_total.drop(columns=['population_total'])

In [7]:
demographics_total.to_csv(PATH_FINAL+'demographics.csv')

## GeoJson

In [73]:
from random import randrange

regions = []
regions_sample = []

region_names = os.listdir(PATH_RAW+'geojson/')
for i, region_name in enumerate(region_names):
    region = dict()

    with open(PATH_RAW+f'geojson/{region_name}') as dataFile:
        data = dataFile.read()
        obj = data[data.find('[') : data.rfind(']')+1]
        # print(obj)
        obj_jsonify = obj.replace('lng', '"lng"').replace('lat', '"lat"').replace(', "', '", "').replace(': ', ':').replace(':', ':"').replace('}', '"}')
        json_coord = json.loads(obj_jsonify)

        coordinates = []
        for point in json_coord:
            coordinates.append([float(point['lng'][:9]), float(point['lat'][:9])])

        region['type'] = 'Feature'
        region['id'] = str(i)

        region['properties'] = {
            'name' : region_name[:-3],
            'value': randrange(9)
        }
        
        region['geometry'] = {
            'type' : 'Polygon',
            'coordinates': [coordinates]
        }

        regions.append(region)
        if i < 5:
            regions_sample.append(region)

In [74]:
final_regions = {
    "type":"FeatureCollection",
    "features":regions
}

final_regions_sample = {
    "type":"FeatureCollection",
    "features":regions_sample
}

In [75]:
with open(PATH_FINAL+'regions.json', 'w') as f:
    json.dump(final_regions, f)
with open(PATH_FINAL+'regions_sample.json', 'w') as f:
    json.dump(final_regions_sample, f)

In [1]:
demographics_total

NameError: name 'demographics_total' is not defined