# Database 1995 - 2019

Work with the database of all the data, possibility to extract more economy data

In [1]:
import pandas as pd
import geopandas as gpd
import json
import os
import googlemaps
from collections import defaultdict, OrderedDict
from tqdm import tqdm
import cbsodata

In [2]:
os.chdir('D:/bachelors/datasets/')

In [9]:
meta = pd.DataFrame(cbsodata.get_meta('70072ned', 'DataProperties'))
meta.to_excel('territory_bases_v3/meta.xlsx', index = None)

In [141]:
df = pd.DataFrame(cbsodata.get_data('70072ned', typed = True))
df = df[(df.KoppelvariabeleRegioCode_306.str.contains('GM|gm')) & (df.Perioden.astype(int) >= 2004)]

In [142]:
meta = pd.read_excel('D:/Google Drive/Projects/bachelors/territory_bases\meta.xlsx')
meta = meta[meta.selected == 1]

In [143]:
df = df[meta['Key'].tolist()]
df = df.set_index('RegioS').apply(lambda column: pd.to_numeric(column, errors = 'coerce'))

df['Perioden'] = pd.to_datetime(df.Perioden, format = '%Y')
df = df[df.Perioden < pd.Timestamp('2018-01-01')]
df = df.reset_index()

In [144]:

communes_mapper = pd.read_excel('FINAL COMMUNES MERGE_EDITED.xlsx').set_index('index')['communes_from_geo'].to_dict()
communes_list = list(pd.read_excel('FINAL COMMUNES MERGE_EDITED.xlsx')['communes_from_geo'].unique())

In [145]:
df['commune_ok'] = df['RegioS'].str.replace(
    ' \(gemeente\)', ''
).map(communes_mapper)

df.set_index('RegioS', inplace = True)
df.loc['Beek (L.)', 'commune_ok'] = 'Beek'
df.loc['Laren (NH.)', 'commune_ok'] = 'Laren'
df.loc['Middelburg (Z.)', 'commune_ok'] = 'Middelburg'
df.loc['Rijswijk (ZH.)', 'commune_ok'] = 'Rijswijk'
df.loc['Stein (L.)', 'commune_ok'] = 'Stein'
df.loc['Valkenburg (ZH.)', 'commune_ok'] = 'Katwijk'

In [146]:
df = df.reset_index().drop(columns = ['RegioS']).set_index(['commune_ok', 'Perioden'])

In [147]:
filter_series = pd.isna(df).sum()

summarisable_variables = df[(filter_series[filter_series == 0].index)]
non_summarisable_variables = df[(filter_series[filter_series != 0].index)]

In [148]:
summarisable_variables.reset_index(inplace = True)
summarisable_variables = summarisable_variables.groupby(['commune_ok', 'Perioden']).sum()

## Non-summarizable variables

### Employment

In [149]:
def replace_zeros(series):
    series.loc[series == 0] = None
    return series

In [150]:
employment = non_summarisable_variables[['TotaalAantalBanen_112',
       'ALandbouwBosbouwEnVisserij_113', 'BFNijverheidEnEnergie_114',
       'GNCommercieleDienstverlening_115',
       'OUNietCommercieleDienstverlening_116']]

In [151]:
def precompute(df, agg_func = 'sum'):
    df = df.reset_index().groupby(
        ['commune_ok', 'Perioden']
    ).agg(agg_func).apply(
        lambda column: replace_zeros(column)
    )
    
    return df

In [152]:
employment = employment.reset_index().groupby(['commune_ok', 'Perioden']).sum().apply(lambda column: replace_zeros(column))

In [154]:
def interpolate_all(df):
    df_list = []
    for commune in communes_list:
        try:
            df_list.append(df.loc[commune].interpolate(
                method = 'spline', order = 1,
                limit_direction = 'both'
            ).assign(commune_ok = commune))
        except Exception as e:
            print(e)

    df = pd.concat(df_list)
    df = df.reset_index().set_index(['commune_ok', 'Perioden'])
    return df

In [155]:
employment = interpolate_all(employment)

(m>k) failed for hidden m: fpcurf0:m=1
(m>k) failed for hidden m: fpcurf0:m=1
(m>k) failed for hidden m: fpcurf0:m=1
(m>k) failed for hidden m: fpcurf0:m=1
(m>k) failed for hidden m: fpcurf0:m=1
(m>k) failed for hidden m: fpcurf0:m=1


### Income

In [185]:
incomes = non_summarisable_variables[['ParticuliereHuishoudensExclStudenten_122',
       'ParticuliereHuishoudensExclStudenten_132']]

In [186]:
incomes = incomes.reset_index().groupby(['commune_ok', 'Perioden']).mean().apply(lambda column: replace_zeros(column))

In [158]:
incomes = interpolate_all(incomes)

### Business

In [159]:
business = non_summarisable_variables[['BedrijfsvestigingenTotaal_164', 'ALandbouwBosbouwEnVisserij_165',
       'BFNijverheidEnEnergie_166', 'GIHandelEnHoreca_167',
       'HJVervoerInformatieEnCommunicatie_168',
       'KLFinancieleDienstenOnroerendGoed_169',
       'MNZakelijkeDienstverlening_170',
       'RUCultuurRecreatieOverigeDiensten_171']]

In [160]:
business = precompute(business, agg_func = 'sum')

In [161]:
business = interpolate_all(business)

The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.


In [162]:
business = business.apply(lambda column: round(column, 0))

### Distances

In [163]:
distances = non_summarisable_variables[['AfstandTotHuisartsenpraktijk_209', 'AfstandTotZiekenhuis_212',
       'AfstandTotKinderdagverblijf_214', 'AfstandTotSchoolBasisonderwijs_216',
       'AfstandTotRestaurant_224', 'AfstandTotBibliotheek_226',
       'AfstandTotBioscoop_227', 'AfstandTotZwembad_229',
       'AfstandTotSportterrein_230', 'AfstandTotOpenbaarGroen_231',
       'AfstandTotOpritHoofdverkeersweg_232', 'AfstandTotTreinstation_233']]

In [164]:
distances = precompute(distances, agg_func = 'mean')

In [165]:
distances = interpolate_all(distances)

# Rename columns to english

In [92]:
columns_map = meta.set_index('Key')['eng_desc'].to_dict()

In [167]:
business.rename(columns = columns_map, inplace = True)
distances.rename(columns = columns_map, inplace = True)
incomes.rename(columns = columns_map, inplace = True)
employment.rename(columns = columns_map, inplace = True)
summarisable_variables.rename(columns = columns_map, inplace = True)

In [118]:
coeff_func_def = (distances.unstack().apply(
    lambda column: column / column.mean()
).stack().agg(
    lambda x: x.mean(), axis = 1
).agg(
    lambda x: x / x.max()
) * -1 + 1).rename('func_access').reset_index()

In [129]:
total_business = business['total business establishments'].copy()

In [134]:
business_shares = business.drop(columns = 'total business establishments').apply(lambda column: column / total_business * 100)

In [135]:
os.mkdir('territory_bases_v3/datatables/')

In [138]:
incomes.to_csv('territory_bases_v3/datatables/incomes.csv', encoding = 'utf-8')
business.to_csv('territory_bases_v3/datatables/business_totals.csv', encoding = 'utf-8')
distances.to_csv('territory_bases_v3/datatables/distances_to_objects.csv', encoding = 'utf-8')
incomes.to_csv('territory_bases_v3/datatables/incomes.csv', encoding = 'utf-8')
employment.to_csv('territory_bases_v3/datatables/employment.csv', encoding = 'utf-8')
business_shares.to_csv('territory_bases_v3/datatables/business_shares.csv', encoding = 'utf-8')
coeff_func_def.to_csv('territory_bases_v3/datatables/coeff_func_def.csv', index = None, encoding = 'utf-8')

In [197]:
summarisable_variables.rename(columns = columns_map, inplace = True)

In [199]:
summarisable_variables.columns

Index(['Total population', 'Men', 'Women', 'Dutch background',
       'Total migration background', 'Western immigrant background',
       'Total non-Western immigrant background', 'Morocco',
       '(Former) Netherlands Antilles, Aruba', 'Suriname', 'Turkey',
       'Other non-Western immigrant background', 'Highly urbanized',
       'strong urban', 'Poor urban', 'few urban', 'not urban',
       'Population density', 'Total number of private households',
       'Households without children', 'Households with children', 'New Build',
       'Primary education', 'University education', 'Bachelor's Degree',
       'cars'],
      dtype='object')

In [201]:
summarisable_variables = summarisable_variables[['Total population', 'Total migration background', 'Total number of private households',
                        'Households with children', 'University education', "Bachelor's Degree"]]

In [202]:
summarisable_variables.columns

Index(['Total population', 'Total migration background',
       'Total number of private households', 'Households with children',
       'University education', 'Bachelor's Degree'],
      dtype='object')

In [203]:
summarisable_variables['Доля мигрантов на населении'] = summarisable_variables['Total migration background'] / summarisable_variables['Total population'] * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [204]:
summarisable_variables['Доля домохозяйств с детьми'] = summarisable_variables['Households with children'] / summarisable_variables['Total number of private households'] * 100
summarisable_variables['Доля людей с высшим образованием'] = summarisable_variables['University education'] / summarisable_variables['Total population'] * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [207]:
summarisable_variables = summarisable_variables.drop(
    columns = ['Total migration background', 'Households with children', 
               'University education', "Bachelor's Degree"]
)

In [212]:
summarisable_variables.сolumns = ['pop', 'hholds', 'migr_share', 'hh_with_children_share', 'high_edu_share']

In [215]:
summarisable_variables.columns = ['pop', 'hholds', 'migr_share', 'hh_with_children_share', 'high_edu_share']

In [217]:
summarisable_variables.to_csv('territory_bases_v3/datatables/other_variables.csv', encoding = 'utf-8')

In [12]:
geo_communes = gpd.read_file('../qgis/base/Netherlands_communes.gpkg')
centrality_index = gpd.read_file('D:/Google Drive/Projects/bachelors/nether_cent_clust_16.geojson')

In [13]:
centrality_join = gpd.sjoin(centrality_index, geo_communes)
centrality_join = centrality_join.groupby('NAME')['cent_idx_sum'].sum().reset_index()
centrality_join.rename(columns = {'NAME' : 'commune_ok'}, inplace = True)

In [233]:
centrality_join = pd.concat(
    [centrality_join.assign(Perioden = pd.Timestamp(f'{year}-01-01')) for year in range(2004, 2018)]
)

In [238]:
centrality_join.to_csv('territory_bases_v3/datatables/zent_idx.csv', index = None, encoding = 'utf-8')

In [14]:
gdf_centrality = geo_communes.merge(centrality_join, left_on = 'NAME', right_on = 'commune_ok', how = 'left')

In [22]:
gdf_centrality.to_file('centrality_by_municipality.gpkg', driver = 'GPKG')

ERROR:fiona._env:sqlite3_exec(CREATE VIRTUAL TABLE "rtree_centrality_by_municipality_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree


CPLE_AppDefinedError: b'sqlite3_exec(CREATE VIRTUAL TABLE "rtree_centrality_by_municipality_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree'

Exception ignored in: 'fiona._shim.gdal_flush_cache'
Traceback (most recent call last):
  File "fiona/_err.pyx", line 201, in fiona._err.GDALErrCtxManager.__exit__
fiona._err.CPLE_AppDefinedError: b'sqlite3_exec(CREATE VIRTUAL TABLE "rtree_centrality_by_municipality_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree'


# Population increase

In [28]:
summarisable_variables = pd.read_csv('territory_bases_v3/datatables/other_variables.csv', encoding = 'utf-8')
summarisable_variables.set_index('commune_ok', inplace = True)

In [29]:
pop_increase = summarisable_variables.loc[summarisable_variables.Perioden == '2017-01-01', 'pop']/ summarisable_variables.loc[summarisable_variables.Perioden == '2004-01-01', 'pop'] * 100

In [30]:
pop_increase = pop_increase.reset_index().rename(columns = {'commune_ok' : 'NAME', 'pop' : 'pop_increase'})

In [31]:
geo_communes = gpd.read_file('../qgis/base/Netherlands_communes.gpkg')

In [32]:
pop_increase_geo = geo_communes.merge(pop_increase, on = 'NAME', how = 'left')

In [33]:
pop_increase_geo['pop_increase'] = pop_increase_geo['pop_increase'] - 100

In [34]:
pop_increase_geo.to_file('analysis/Population_increase.gpkg', driver = 'GPKG')

ERROR:fiona._env:sqlite3_exec(CREATE VIRTUAL TABLE "rtree_Population_increase_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree


CPLE_AppDefinedError: b'sqlite3_exec(CREATE VIRTUAL TABLE "rtree_Population_increase_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree'

Exception ignored in: 'fiona._shim.gdal_flush_cache'
Traceback (most recent call last):
  File "fiona/_err.pyx", line 201, in fiona._err.GDALErrCtxManager.__exit__
fiona._err.CPLE_AppDefinedError: b'sqlite3_exec(CREATE VIRTUAL TABLE "rtree_Population_increase_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree'
