### This notebook processes income, education and demographic data for Amsterdam
It reads the Excel files, translates the field names to English and exports the results to GeoJSON.

In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import openpyxl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# City code; it prefixes the "<city>_Projectdata" folder names below.
city = "ams"

# "__file__" is a quoted literal (notebooks define no __file__), so abspath()
# resolves it against the current working directory. Stripping three path
# components therefore yields the directory two levels above the working dir.
base_dir = os.path.abspath("__file__")
base_dir = os.path.dirname(os.path.dirname(os.path.dirname(base_dir)))
print(base_dir)

# Paths for the population data ----------------------------------------------
# Folder with the ancillary (administrative boundary) layers.
ancillary_data_folder_path = "/".join([base_dir, city + "_Projectdata", "AncillaryData"])
# Folder with the population data.
ancillary_POPdata_folder_path = "/".join([base_dir, city + "_Projectdata", "PopData"])
# Folder with the neighbourhood / district tables used by every section.
neighPath = "/".join([ancillary_POPdata_folder_path, "Neighborhood&districtData"])
# Currently unused output locations, kept for reference:
#cor_path= base_dir + "/data_prep/{}_data_scripts_pop/correlations/corImages".format(city)
#image_path= base_dir + "/data_prep/{}_Projectdata/PopData/images".format(city)

C:\FUME\PopNetV2\data_prep


In [3]:
# Create a function to write Excel files
def save_EXCEL(frame,output):
    """Write ``frame`` to sheet ``Sheet1`` of the Excel workbook ``output``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to export. The header row and the index are both written; the
        index ends up in the first spreadsheet column.
    output : str or path-like
        Destination of the workbook.

    Returns
    -------
    None
    """
    # ``index``/``header`` are options of ``DataFrame.to_excel``, not of
    # ``ExcelWriter``, and ``ExcelWriter.save()`` no longer exists in
    # pandas >= 2.0. The context manager closes (and thereby writes) the
    # workbook even if ``to_excel`` raises.
    with pd.ExcelWriter(output) as writer:
        # The index is deliberately still written: the income read-back in
        # this notebook selects spreadsheet columns by position starting at 1.
        frame.to_excel(writer, sheet_name='Sheet1', header=True)

## 1. Income

In [None]:
# Dutch headers of the raw income workbooks -> English field names.
# Keys are matched literally, so the trailing spaces in some of them matter.
income_field_names = {
    'Inkomen|Inkomen van personen|Aantal inkomensontvangers  ': 'Number of income recipients',
    'Inkomen|Inkomen van personen|Gemiddeld inkomen per inkomensontvanger ': 'Average income per income recipient (x 1 000 euro)',
    'Inkomen|Inkomen van personen|Gemiddeld inkomen per inwoner ': 'Average income per inhabitant (x 1 000 euro)',
    'Inkomen|Inkomen van personen|40% personen met laagste inkomen': '40% persons with lowest income (%)',
    'Inkomen|Inkomen van personen|20% personen met hoogste inkomen': '20% of persons with highest income(%)',
    'Inkomen|Inkomen van personen|Actieven 15-75 jaar': 'Active 15-75 years (%)',
    'Inkomen|Inkomen van huishoudens|40% huishoudens met laagste inkomen': '40% households with the lowest income (%)',
    'Inkomen|Inkomen van huishoudens|20% huishoudens met hoogste inkomen': '20% households with the highest income (%)',
    'Inkomen|Inkomen van huishoudens|Huishoudens met een laag inkomen': 'Low Income Households (%)',
    'Inkomen|Inkomen van huishoudens|Huish. onder of rond sociaal minimum': 'Household. below or around the social minimum (%)',
    'Unnamed: 0': 'Districts and neighborhoods',
}

yearlist = [2014, 2016, 2018]
for year in yearlist:
    # Raw workbook for this year; the first three rows are skipped on read.
    path = neighPath + "/rawDataNeighborhoods/income/Inc_Amsterdam_{}.xlsx".format(year)
    df = pd.read_excel(path, header=0, skiprows=3)
    df = df.rename(columns=income_field_names)

    # Write everything except the first data row to the processed folder.
    output = neighPath + "/incomeProcess/income_{}.xlsx".format(year)
    save_EXCEL(df.iloc[1:], output)

In [None]:
# Read the processed 2018 income workbook back in, indexed by neighbourhood name.
# usecols keeps spreadsheet columns 1-10 only; column 0 is skipped.
# NOTE(review): presumably column 0 is the row index written by save_EXCEL -
# confirm, and check that every translated indicator falls inside columns 1-10.
path = neighPath + "/incomeProcess/income_2018.xlsx"
df = pd.read_excel(path, header=0, index_col='Districts and neighborhoods' , usecols=range(1,11))

In [None]:
# Blank out cells holding only "." as well as missing values, in that order.
df = df.replace(".", "").replace(np.nan, "")

In [None]:
# Convert every column to a numeric dtype (to_numeric raises on values it
# cannot parse, since no `errors` policy is given).
for column_name in df.columns:
    label = '{}'.format(column_name)
    df[label] = pd.to_numeric(df[label])

In [None]:
# Display the dtype of each column to check the numeric conversion.
df.dtypes

In [5]:
# Load the small-neighbourhood boundary layer from the ancillary data folder
# and display it.
gdf = gpd.read_file(
    ancillary_data_folder_path + '/adm/nld_c03_smallneighborhoods.gpkg'
)
gdf

Unnamed: 0,BU_CODE,BU_NAAM,WK_CODE,GM_CODE,GM_NAAM,WATER,POSTCODE,Shape_Leng,Shape_Area,geometry
0,BU00109998,,WK001099,GM0010,Delfzijl,JA,,128728.007061,9.129235e+07,"MULTIPOLYGON (((4127225.805 3359186.242, 41272..."
1,BU00349997,,WK003499,GM0034,Almere,JA,,122665.358634,1.095623e+08,"MULTIPOLYGON (((4001814.393 3254037.601, 40016..."
2,BU00509997,,WK005099,GM0050,Zeewolde,JA,,83290.773617,1.633050e+07,"MULTIPOLYGON (((4006229.897 3249222.055, 40062..."
3,BU00609998,,WK006099,GM0060,Ameland,JA,,156406.894023,2.085914e+08,"MULTIPOLYGON (((4025876.409 3382805.375, 40300..."
4,BU00729998,,WK007299,GM0072,Harlingen,JA,,104274.683036,3.618323e+08,"MULTIPOLYGON (((4016414.777 3354200.836, 40164..."
...,...,...,...,...,...,...,...,...,...,...
13684,BU19781901,Waal-Dorp,WK197819,GM1978,Molenlanden,NEE,2968,2380.324142,5.334670e+04,"MULTIPOLYGON (((3970390.219 3215386.771, 39703..."
13685,BU19782001,Wijngaarden-Buitengebied,WK197820,GM1978,Molenlanden,NEE,3366,14157.217901,5.795862e+06,"MULTIPOLYGON (((3958607.101 3206653.865, 39586..."
13686,BU19782002,Kern-Dorp,WK197820,GM1978,Molenlanden,NEE,3366,2221.741394,1.143729e+05,"MULTIPOLYGON (((3960166.916 3205777.972, 39603..."
13687,BU19782003,Lintbebouwing-Oost,WK197820,GM1978,Molenlanden,NEE,3366,6244.966167,2.782070e+05,"MULTIPOLYGON (((3960926.107 3205766.988, 39609..."


In [None]:
# Attach the income table (indexed by neighbourhood name) to the boundaries,
# matching gdf['Buurtcombinatie'] against the index of df.
# NOTE(review): the gdf preview shown above lists BU_CODE, BU_NAAM, WK_CODE, ...
# but no 'Buurtcombinatie' column, so with that layer this join would raise a
# KeyError. Presumably a different boundary layer was loaded when this cell was
# written - confirm which file is intended.
ngdf = gdf.join(df, on ='Buurtcombinatie')

In [None]:
# Reproject to EPSG:3035 and export the joined income layer as GeoJSON.
# NOTE(review): the extra crs= keyword is forwarded to the file writer and looks
# redundant after to_crs(3035) - confirm it is accepted by the installed version.
ngdf = ngdf.to_crs(3035)
ngdf.to_file(neighPath + "/incomeProcess/neighIncome_2018.geojson",driver='GeoJSON',crs="EPSG:3035")

## 2. Education

In [None]:
# Years for which an education workbook is processed.
yearlist =[2019]
# Neighbourhood boundaries; joined on their 'Buurtcombinatie' column below.
gdf = gpd.read_file(ancillary_data_folder_path + '/adm/neighborhood_orig.geojson')
for year in yearlist:
    path = neighPath + "/rawDataNeighborhoods/education/Educ_Amsterdam_{}.xlsx".format(year)
    df = pd.read_excel(path, header=0, skiprows=4 )
    # Keep column 0 (the join key, named 'Marges') plus columns 10-12, from the
    # third row on. .copy() makes this an independent frame, so the conversion
    # below does not assign into a slice of df (SettingWithCopyWarning).
    ndf = df.iloc[2: , np.r_[0, 10:13]].copy()
    # Convert the value columns to numbers; unparseable cells become NaN.
    for col in ndf.columns[1:]:
        ndf[col] = pd.to_numeric(ndf[col], errors='coerce')
    print(ndf)
    ngdf = gdf.join(ndf.set_index('Marges'), on ='Buurtcombinatie')
    print(ngdf)
    ngdf = ngdf.to_crs(3035)
    # The file name follows the loop year instead of a hard-coded 2019, so
    # adding years to yearlist no longer overwrites a single output file.
    ngdf.to_file(neighPath + "/educationProcess/neighEduc_{}.geojson".format(year),driver='GeoJSON',crs="EPSG:3035")

## 3. Other demographics

In [7]:
# Codebook mapping the Dutch variable names (variable_name_NLD) to English
# names (name); displayed for inspection.
pathCodes = "/".join([neighPath, "rawDataNeighborhoods", "demographic", "codebook.xlsx"])
codes = pd.read_excel(pathCodes, header=0)
codes

Unnamed: 0,variable_name_NLD,name
0,regio,region designation
1,gm_naam,name of municipality
2,recs,type of region
3,gwb_code,coding
4,ind_wbi,change in classification districts and neighbo...
...,...,...
100,a_wat_ha,Surface water
101,pst_mvp,Most common zip code
102,pst_dekp,Coverage percentage
103,ste_mvs,Degree of urbanity


In [11]:
# Years for which a demographic workbook is processed.
yearlist =[2017]
# Load the boundary layer this join needs explicitly: 'WK_CODE' is a column of
# the small-neighbourhood layer, and the Education section rebinds gdf to a
# different file, so relying on the leftover global breaks a top-to-bottom run.
gdf = gpd.read_file(ancillary_data_folder_path + '/adm/nld_c03_smallneighborhoods.gpkg')
for year in yearlist:
    path = neighPath + "/rawDataNeighborhoods/demographic/{}.xls".format(year)
    df = pd.read_excel(path, header=0)
    # Blank out "." placeholders and missing values. Renaming columns below
    # does not touch cell values, so one pass is enough.
    df = df.replace(".", "").replace(np.nan, "")
    # Translate each column name through the codebook; columns without a
    # codebook entry keep their original name and are reported.
    for col in df.columns:
        print(col)
        matches = codes.loc[codes.variable_name_NLD == '{}'.format(col), 'name']
        if matches.empty:
            # Explicit miss check instead of a bare `except:` so that
            # unrelated errors are no longer swallowed.
            print("An exception occurred : {}".format(col))
            continue
        newName = matches.values[0]
        df = df.rename(columns = {'{}'.format(col):'{}'.format(newName)})

    # NOTE(review): gwb_code_10 is matched against WK_CODE - confirm that both
    # hold the same kind of code.
    ngdf = gdf.join(df.set_index('gwb_code_10'), on ='WK_CODE')
    print(ngdf.head(3))
    ngdf = ngdf.to_crs(3035)

    # NOTE(review): written as smallneighDemo_<year>, while the following cell
    # reads neighDemo_<year> - confirm which file name is intended.
    ngdf.to_file(neighPath + "/demographicProcess/smallneighDemo_{}.geojson".format(year),driver='GeoJSON',crs="EPSG:3035")
    #ngdf
    
    

gwb_code_10
An exception occurred : gwb_code_10
gwb_code_8
An exception occurred : gwb_code_8
regio
gm_naam
recs
gwb_code
ind_wbi
a_inw
a_man
a_vrouw
a_00_14
a_15_24
a_25_44
a_45_64
An exception occurred : a_45_64
a_65_oo
a_ongeh
a_gehuwd
An exception occurred : a_gehuwd
a_gesch
a_verwed
a_w_all
a_nw_all
a_marok
a_antaru
a_suri
a_tur
a_ov_nw
a_geb
p_geb
a_ste
p_ste
a_hh
a_1p_hh
a_hh_z_k
a_hh_m_k
g_hhgro
bev_dich
a_woning
g_woz
p_1gezw
p_mgezw
p_bewndw
p_leegsw
p_koopw
p_huurw
p_wcorpw
p_ov_hw
p_e_o_w
p_bjj2k
p_bjo2k
g_ele
g_ele_ap
g_ele_tw
g_ele_hw
g_ele_2w
g_ele_vw
g_ele_hu
g_ele_ko
g_gas
g_gas_ap
g_gas_tw
g_gas_hw
g_gas_2w
g_gas_vw
g_gas_hu
g_gas_ko
p_stadsv
a_inkont
g_ink_po
g_ink_pi
p_ink_li
An exception occurred : p_ink_li
p_ink_hi
p_n_act
p_hh_li
p_hh_hi
p_hh_lkk
p_hh_osm
a_soz_wb
a_soz_ao
a_soz_ww
a_soz_ow
g_wodief
g_vernoo
g_gewsek
a_bedv
a_bed_a
a_bed_bf
a_bed_gi
a_bed_hj
a_bed_kl
a_bed_mn
a_bed_ru
a_pau
a_bst_b
a_bst_nb
g_pau_hh
g_pau_km
a_m2w
g_afs_hp
g_afs_gs
g_afs_kv
g_afs

In [None]:
# Re-open each exported demographic layer, turn decimal commas into decimal
# points in its text columns, convert columns 10-111 to numbers and overwrite
# the file.
# NOTE(review): this reads neighDemo_<year>, whereas the previous cell writes
# smallneighDemo_<year> - confirm the intended file name.
for year in yearlist:
    demo_path = neighPath + "/demographicProcess/neighDemo_{}.geojson".format(year)
    ngdf = gpd.read_file(demo_path)

    # Only object-typed (text) columns are touched.
    text_columns = [name for name in ngdf.columns if ngdf[name].dtype == object]
    for name in text_columns:
        ngdf[name] = ngdf[name].astype('string').str.replace(',', '.')

    # Positional slice: columns 10 up to (not including) 112.
    for name in ngdf.columns[10:112]:
        ngdf[name] = pd.to_numeric(ngdf[name])

    ngdf.to_file(demo_path, driver='GeoJSON', crs="EPSG:3035")

## Combine all of them together

In [4]:
# Load the base neighbourhood layer and the three processed layers, keeping
# only selected columns of each. All selections are positional, so they depend
# on the exact column order of the files.
gdf = gpd.read_file(ancillary_data_folder_path + '/adm/neighborhood.geojson')
# Column 1 (shown as 'Buurtcombinatie' in the preview below), columns 4-18 and
# column 34 (the last of the 35 columns, shown as 'geometry' below).
gdfDF = gdf.iloc[:, np.r_[1, 4:19, 34]]
# NOTE(review): the 2019 demographic layer is read here, while the demographic
# cells above process yearlist = [2017] - confirm this file exists and is the
# intended year.
demoDF = gpd.read_file(neighPath + "/demographicProcess/neighDemo_2019.geojson")
demoDF= demoDF.iloc[:, np.r_[1, 10:112]]
eduDF = gpd.read_file(neighPath + "/educationProcess/neighEduc_2019.geojson")
# Column 1 plus columns 4-6.
eduDF = eduDF.iloc[:, np.r_[1, 4:7]]
incDF = gpd.read_file(neighPath + "/incomeProcess/neighIncome_2018.geojson")
# Column 1 plus columns 4-12.
incDF = incDF.iloc[:, np.r_[1, 4:13]]

In [16]:
# Print the column counts of the education, income and base layers, then
# display the column selection taken from the base layer.
#print(len(demoDF.columns))
for frame in (eduDF, incDF, gdf):
    print(len(frame.columns))
gdf.iloc[:, np.r_[1, 4:19, 34]]
# Alternative previews:
#eduDF.iloc[:, np.r_[1, 4:7]]
#incDF.iloc[:, np.r_[1, 4:13]].head(2)

4
10
35


Unnamed: 0,Buurtcombinatie,Z0_totalMigmean,Z0_Oceaniamean,Z0_EuropeNotEUmean,Z0_EuropeEUnoLocalmean,Z0_Central_Asiamean,Z0_Eastern_Asiamean,Z0_Southern-Eastern_Asiamean,Z0_Southern_Asiamean,Z0_Western_Asiamean,Z0_Northern_Americamean,Z0_Latin_America_and_the_Caribbeanmean,Z0_Northern_Africamean,Z0_Sub-Saharan_Africamean,Z0_Othersmean,Z0_Coloniesmean,geometry
0,Burgwallen-Oude Zijde,39.947753,0.929245,0.388757,17.393425,0.098039,1.353986,1.544781,1.163033,1.906146,2.897742,3.677510,1.280441,1.929392,2.847948,0.411290,"POLYGON ((4.90326 52.37658, 4.90298 52.37668, ..."
1,Burgwallen-Nieuwe Zijde,25.793523,0.352035,0.444715,13.714876,0.106426,1.122117,0.846579,1.132784,1.316518,2.558255,2.254081,0.686478,0.580114,3.471729,0.083719,"POLYGON ((4.90641 52.38003, 4.90601 52.38009, ..."
2,Grachtengordel-West,32.552016,1.255617,0.165042,14.429546,0.006050,0.953645,1.159272,1.048138,1.280004,4.918566,2.173815,0.368652,0.935398,1.725428,0.627649,"POLYGON ((4.89401 52.37875, 4.89369 52.37889, ..."
3,Grachtengordel-Zuid,37.391364,0.751304,0.458965,15.750870,0.000000,0.560946,1.123838,1.335673,1.786036,4.650562,3.210037,1.506086,1.602664,2.844134,0.364422,"POLYGON ((4.90278 52.36279, 4.90156 52.36515, ..."
4,Nieuwmarkt/Lastage,21.955385,0.446231,0.147076,9.411507,0.013402,1.252798,0.777626,0.778004,1.325714,1.886912,2.672397,0.417551,0.672160,1.717421,0.306554,"POLYGON ((4.91361 52.37259, 4.91359 52.37301, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,"Bijlmer Oost (E,G,K)",32.077866,0.038007,0.079519,2.816451,0.000792,0.491629,0.728148,2.338421,0.787927,0.183693,15.150100,0.700604,5.671821,0.652551,1.178030,"POLYGON ((4.99690 52.31763, 4.99679 52.31775, ..."
95,Nellestein,3.599470,0.022268,0.019985,0.732587,0.000000,0.038969,0.187866,0.241147,0.132877,0.066027,1.085195,0.054569,0.368531,0.225623,0.150678,"POLYGON ((5.01407 52.30423, 5.01354 52.30445, ..."
96,Holendrecht/Reigersbos,32.884313,0.043639,0.051399,3.111599,0.000000,0.650323,1.009374,1.764431,0.739900,0.087831,15.589230,0.860376,4.866349,0.584134,1.142149,"POLYGON ((4.98358 52.29045, 4.98228 52.29061, ..."
97,Gein,26.716381,0.049894,0.079689,2.472885,0.000000,0.957156,0.737388,1.944307,0.462830,0.144612,11.017429,0.741630,4.236686,0.743965,1.188112,"POLYGON ((5.00796 52.30154, 5.00593 52.30154, ..."


In [5]:
# Left-join the income columns onto the base layer by neighbourhood name.
# No suffixes are given here, so the join raises if incDF shares a column name
# with gdfDF other than the key.
ngdf = gdfDF.join(incDF.set_index('Buurtcombinatie'), on ='Buurtcombinatie')

In [6]:
# Left-join the demographic columns by neighbourhood name; column names present
# on both sides get the '_left' / '_right' suffixes.
ngdf = ngdf.join(demoDF.set_index('Buurtcombinatie'), on ='Buurtcombinatie', how='left', lsuffix='_left', rsuffix='_right')

In [7]:
# Left-join the education columns by neighbourhood name; column names present
# on both sides get the '_left' / '_right' suffixes.
# NOTE(review): the info() output further down reports 160 rows for an index
# running from 0 to 98, so at least one of the three joins appears to duplicate
# rows - check that 'Buurtcombinatie' is unique in incDF, demoDF and eduDF.
ngdf = ngdf.join(eduDF.set_index('Buurtcombinatie'), on ='Buurtcombinatie', how='left', lsuffix='_left', rsuffix='_right')

In [11]:
# Convert columns 85-101 of the combined table (positional slice) to numbers.
for column_name in ngdf.columns[85:102]:
    label = '{}'.format(column_name)
    ngdf[label] = pd.to_numeric(ngdf[label])

In [14]:
# Reproject the combined layer to EPSG:3035.
# NOTE(review): the execution counters show this cell (In [14]) was run after
# the info() cell below (In [12]).
ngdf= ngdf.to_crs(3035)

In [12]:
# List every column of the combined layer with its dtype.
ngdf.info(verbose=True)

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 160 entries, 0 to 98
Data columns (total 131 columns):
 #   Column                                               Dtype   
---  ------                                               -----   
 0   Buurtcombinatie                                      object  
 1   Z0_totalMigmean                                      float64 
 2   Z0_Oceaniamean                                       float64 
 3   Z0_EuropeNotEUmean                                   float64 
 4   Z0_EuropeEUnoLocalmean                               float64 
 5   Z0_Central_Asiamean                                  float64 
 6   Z0_Eastern_Asiamean                                  float64 
 7   Z0_Southern-Eastern_Asiamean                         float64 
 8   Z0_Southern_Asiamean                                 float64 
 9   Z0_Western_Asiamean                                  float64 
 10  Z0_Northern_Americamean                              float64 
 11  Z0_Latin_Am

## Save the final combined files

In [15]:
# Export the combined layer as GeoJSON and as an Excel workbook.
# NOTE(review): the outputs are labelled 2018 although the demographic and
# education inputs loaded above are the 2019 files - confirm the naming.
# NOTE(review): ngdf still carries its geometry column when passed to
# save_EXCEL - confirm the Excel writer can serialise it.
ngdf.to_file(neighPath + "/neighAllDemo_2018.geojson",driver='GeoJSON',crs="EPSG:3035")
save_EXCEL(ngdf,neighPath + "/neighAllDemo_2018.xlsx")