# Demographics
This script is used for parsing demographics data.

Source: https://www.czso.cz/csu/czso/databaze-demografickych-udaju-za-obce-cr

In [2]:
import time
import pandas as pd
import numpy as np
import re
import json
import warnings
import json
import os

warnings.filterwarnings('ignore')

In [3]:
PATH_INTERMEDIATE = '../../data/intermediate/'
PATH_RAW = '../../data/raw/'
PATH_FINAL = '../../data/final/'
PATH_POPULATION = PATH_RAW+'population/'

In [64]:
## NUTS x name of the distict
# https://apl.czso.cz/irso4/export4.jsp?kodcis=101&vazcis=0&vazid=0&expvaz=1768&expatr=43&next=Dal%C5%A1%C3%AD
nuts = pd.read_excel(PATH_RAW + 'NUTS.xlsx')
nuts = nuts[['OKRES_LAUT,C,100', 'CZNUTS,C,15']]
nuts.columns = ['Okres', 'NUTS']
nuts = pd.Series(nuts.Okres.values,index=nuts.NUTS).to_dict()

In [65]:
demo = pd.DataFrame()
for name in os.listdir(PATH_POPULATION):
    district = pd.read_excel(PATH_POPULATION + name)
    district['Okres'] = nuts[name[:-5]]
    demo = demo.append(district)

In [66]:
demo = demo[demo.Rok > 2009]

demo = demo[['Číslo\nobce', 'Název obce', 'Okres', 'Rok', 'Stav 31.12.', 'Narození',
             'Zemřelí', 'Přírůstek přirozený', 'Přistě-\nhovalí',
             'Vystě-\nhovalí', 'Přírůstek migrační', 'Přírůstek celkový']]

demo.columns = ['village_num', 'village_name', 'district', 'year',
                'population', 'born', 'deceased', 'born_deceased',
                'immigrants', 'emigrants', 'migration', 'change_total']

In [141]:
demo[demo.village_name=='Otice']

Unnamed: 0,village_num,village_name,district,year,population,born,deceased,born_deceased,immigrants,emigrants,migration,change_total
36720,509612,Otice,Opava,2010,1340,7,12,-5,18,28,-10,-15
36721,509612,Otice,Opava,2011,1343,11,2,9,45,37,8,17
36722,509612,Otice,Opava,2012,1367,16,15,1,45,22,23,24
36723,509612,Otice,Opava,2013,1394,18,11,7,42,22,20,27
36724,509612,Otice,Opava,2014,1407,13,18,-5,45,27,18,13
36725,509612,Otice,Opava,2015,1418,18,10,8,50,47,3,11
36726,509612,Otice,Opava,2016,1421,17,14,3,28,28,0,3
36727,509612,Otice,Opava,2017,1407,19,14,5,27,46,-19,-14
36728,509612,Otice,Opava,2018,1436,16,13,3,53,27,26,29
36729,509612,Otice,Opava,2019,1441,18,10,8,45,48,-3,5


In [68]:
demo = demo[demo.population !='-']
demo = demo.applymap(lambda x: 0 if '-' == str(x) else x)

nums = demo.columns[3:]
demo[nums] = demo[nums].applymap(int)
demo

Unnamed: 0,village_num,village_name,district,year,population,born,deceased,born_deceased,immigrants,emigrants,migration,change_total
39,539104,Bojanovice,Praha-západ,2010,425,5,8,-3,24,12,12,9
40,539104,Bojanovice,Praha-západ,2011,420,6,12,-6,17,10,7,1
41,539104,Bojanovice,Praha-západ,2012,417,1,4,-3,18,18,0,-3
42,539104,Bojanovice,Praha-západ,2013,432,5,0,5,23,13,10,15
43,539104,Bojanovice,Praha-západ,2014,447,2,6,-4,25,6,19,15
...,...,...,...,...,...,...,...,...,...,...,...,...
3144,585050,Zaječí,Břeclav,2017,1442,18,11,7,27,29,-2,5
3145,585050,Zaječí,Břeclav,2018,1432,19,20,-1,17,26,-9,-10
3146,585050,Zaječí,Břeclav,2019,1424,13,14,-1,34,41,-7,-8
3147,585050,Zaječí,Břeclav,2020,1454,15,13,2,61,33,28,30


In [71]:
demo.to_csv(PATH_INTERMEDIATE+'demographics.csv', index=False)

In [77]:
demo = pd.read_csv(PATH_INTERMEDIATE+'demographics.csv')

In [108]:
# for year 2021 before update on 30/09/2022
age_categories = pd.read_excel(PATH_RAW+'vek_okres_2021.xlsx')
age_categories.columns = ['district', 'population_total', 'population_under_15', 'population_15_64', 'population_over_64',
                         'avg_age_total', 'avg_age_male', 'avg_age_female']
age_categories['year'] = 2021
age_categories

Unnamed: 0,district,population_total,population_under_15,population_15_64,population_over_64,avg_age_total,avg_age_male,avg_age_female,year
0,Česká republika,10524167,1691760,6684359,2148048,42.6861,41.2297,44.1012,2021
1,Praha,1301432,199369,863086,238977,41.3984,39.9391,42.7823,2021
2,Středočeský kraj,1415463,253371,895370,266722,41.5787,40.3684,42.7713,2021
3,Benešov,100347,17084,61929,21334,42.9294,41.7049,44.1339,2021
4,Beroun,98073,17754,62038,18281,41.4569,40.2803,42.6175,2021
...,...,...,...,...,...,...,...,...,...
86,Frýdek-Místek,209322,34038,132508,42776,42.9797,41.4839,44.4318,2021
87,Karviná,235579,34537,150361,50681,44.1635,42.4072,45.8441,2021
88,Nový Jičín,146366,24117,92376,29873,42.7262,41.2037,44.2110,2021
89,Opava,171502,27681,108487,35334,43.0780,41.4610,44.6246,2021


In [154]:
# Updated version for 2021

district_ages = pd.DataFrame()

# population_age_2010-2021
for file_name in os.listdir(PATH_RAW+'population_age_2012-2021/'):
    
    tmp = pd.read_excel(PATH_RAW+ 'population_age_2012-2021/'+ file_name, index_col=[0,1], header=[0,1,2])
    village_name = tmp.columns.get_level_values(0)[1].split(' - ')[0]
    nuts_num = tmp.columns.get_level_values(0)[-1].split()[-1]
    # print(nuts_num, file_name)
    tmp.columns = [c[2] for c in tmp.columns]
    tmp = tmp.drop(columns=tmp.columns[-2:])
    idx_first = np.where(tmp.index.get_loc('Stav obyvatel k 31.12.') == True)[0][0]
    idx_last = np.where(tmp.index.get_loc('Sňatky celkem') == True)[0][0]
    tmp = tmp[idx_first:idx_last].T.reset_index()
    tmp.columns = ['year', 'population_total', 'population_under_15', 'population_15_64', 'population_over_64', 'avg_age_total', 'age_index_total',
                            'female_total', 'female_under_15', 'female_15_64', 'female_over_64', 'female_avg_age', 'female_age_index',
                            'male_total', 'male_under_15', 'male_15_64', 'male_over_64', 'male_avg_age', 'male_age_index']

    tmp['district'] = nuts[nuts_num]
    # tmp['village_name'] = village_name
    district_ages = district_ages.append(tmp)

# district_ages['district'] = district_ages['district_num'].apply(lambda x: nuts[x])
district_ages = district_ages.groupby(['district', 'year']).sum().reset_index()

In [155]:
district_ages_old = pd.DataFrame()

# population_age_2011-2020
for file_name in os.listdir(PATH_RAW+'population_age_2011-2020/'):
    
    tmp = pd.read_excel(PATH_RAW+ 'population_age_2011-2020/'+ file_name, index_col=[0,1], header=[0,1,2])
    village_name = tmp.columns.get_level_values(0)[1].split(' - ')[0]
    nuts_num = tmp.columns.get_level_values(0)[-1].split()[-1]
    # print(nuts_num, file_name)
    tmp.columns = [c[2] for c in tmp.columns]
    tmp = tmp.drop(columns=tmp.columns[-2:])
    idx_first = np.where(tmp.index.get_loc('Stav obyvatel k 31.12.') == True)[0][0]
    idx_last = np.where(tmp.index.get_loc('Sňatky') == True)[0][0]
    tmp = tmp[idx_first:idx_last].T.reset_index()
    tmp.columns = ['year', 'population_total', 'population_under_15', 'population_15_64', 'population_over_64', 'avg_age_total', 'age_index_total',
                            'female_total', 'female_under_15', 'female_15_64', 'female_over_64', 'female_avg_age', 'female_age_index',
                            'male_total', 'male_under_15', 'male_15_64', 'male_over_64', 'male_avg_age', 'male_age_index']

    tmp['district'] = nuts[nuts_num]
    # tmp['village_name'] = village_name
    district_ages_old = district_ages_old.append(tmp)

# district_ages_old['district'] = district_ages_old['district_num'].apply(lambda x: nuts[x])
district_ages_old = district_ages_old.groupby(['district', 'year']).sum().reset_index()
district_ages_old = district_ages_old[district_ages_old.year == 2011]

In [156]:
# merge all years together (irrelevant after update)
# total_age = pd.concat([age_categories, district_ages])

# append year 2011 to others
total_age = district_ages.append(district_ages_old)

In [158]:
districts_df = demo.groupby(['district', 'year']).sum().drop(columns='village_num').reset_index()
demographics_total = pd.merge(districts_df, total_age, on=['district', 'year'])
demographics_total

Unnamed: 0,district,year,population,born,deceased,born_deceased,immigrants,emigrants,migration,change_total,...,female_15_64,female_over_64,female_avg_age,female_age_index,male_total,male_under_15,male_15_64,male_over_64,male_avg_age,male_age_index
0,Benešov,2011,95445,981,1023,-42,2550,1916,634,592,...,33412.0,6422.0,120.043379,276.228688,48256.0,7024.0,32199.0,9033.0,128.037796,404.330835
1,Benešov,2012,95883,1059,1029,30,2547,2139,408,438,...,33109.0,6736.0,120.587670,283.821274,48510.0,7183.0,31988.0,9339.0,128.639155,409.751593
2,Benešov,2013,96273,1015,977,38,2471,2119,352,390,...,32863.0,7098.0,121.347742,297.743346,48693.0,7372.0,31653.0,9668.0,129.204642,416.108141
3,Benešov,2014,96718,1001,962,39,2737,2331,406,445,...,32602.0,7421.0,122.069190,306.742200,48948.0,7523.0,31320.0,10105.0,129.883716,424.938527
4,Benešov,2015,97085,1086,1073,13,2554,2200,354,367,...,32427.0,7695.0,122.635325,309.965347,49075.0,7649.0,30995.0,10431.0,130.295637,428.361639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
831,Žďár nad Sázavou,2017,117931,1281,1225,56,2110,2331,-221,-165,...,39552.0,9663.0,163.462826,421.269673,59227.0,8910.0,37290.0,13027.0,174.277300,594.543603
832,Žďár nad Sázavou,2018,118094,1435,1135,300,2237,2374,-137,163,...,39208.0,9928.0,164.163666,424.205591,59270.0,9009.0,36983.0,13278.0,175.044553,601.643160
833,Žďár nad Sázavou,2019,118158,1367,1241,126,2312,2374,-62,64,...,39089.0,10073.0,165.053276,429.508924,59278.0,9139.0,36693.0,13446.0,175.513748,601.066937
834,Žďár nad Sázavou,2020,117941,1277,1415,-138,2113,2192,-79,-217,...,38782.0,10177.0,165.514147,430.612987,59200.0,9166.0,36402.0,13632.0,176.095886,609.055387


In [160]:
demographics_total.to_csv(PATH_FINAL+'demographics.csv')