## Notebook to process LAU and NUTS shapefile to parquet

In [54]:
# Load software
import os
import pathlib
import sys
import pandas as pd
from shapely import Polygon, geometry
from affine import Affine
from rasterio.features import shapes
import json
import itertools
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from dotenv import load_dotenv
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import rioxarray as rio

# Import custom functionality
from coclicodata.drive_config import p_drive

# Define (local and) remote drives
coclico_data_dir = p_drive.joinpath("11207608-coclico", "FASTTRACK_DATA")

In [55]:
# Set path to geopackage
LAU_dir = coclico_data_dir.joinpath('XX_LAU')
NUTS_dir = coclico_data_dir.joinpath('XX_NUTS')

LAU_file = LAU_dir.joinpath('LAU_RG_01M_2020_3035.shp')
NUTS_file = NUTS_dir.joinpath('NUTS_RG_01M_2021_3035.shp')

In [56]:
# Load data
LAU = gpd.read_file(LAU_file)

In [57]:
# Load data
NUTS = gpd.read_file(NUTS_file) 

In [58]:
LAU

Unnamed: 0,GISCO_ID,CNTR_CODE,LAU_ID,LAU_NAME,POP_2020,POP_DENS_2,AREA_KM2,YEAR,FID,geometry
0,CZ_579475,CZ,579475,Libňatov,363.0,62.240158,5.832247,2020,CZ_579475,"POLYGON ((4747705.585 3057545.260, 4746836.524..."
1,CZ_579483,CZ,579483,Libotov,178.0,41.774759,4.260946,2020,CZ_579483,"POLYGON ((4731662.418 3049054.578, 4732297.093..."
2,CZ_579491,CZ,579491,Vranov,181.0,37.408873,4.838424,2020,CZ_579491,"POLYGON ((4540137.623 2963864.496, 4538706.113..."
3,CZ_579505,CZ,579505,Malá Úpa,142.0,5.318510,26.699207,2020,CZ_579505,"POLYGON ((4730399.261 3087673.334, 4730914.253..."
4,AT_41004,AT,41004,Eggendorf im Traunkreis,1041.0,116.481747,8.937023,2020,AT_41004,"POLYGON ((4631543.792 2788412.230, 4630476.583..."
...,...,...,...,...,...,...,...,...,...,...
98608,SK_599816,SK,599816,Košice - mestská časť Nad jazerom,24443.0,6686.211551,3.655732,2020,SK_599816,"POLYGON ((5149120.507 2905279.350, 5150154.014..."
98609,SK_599824,SK,599824,Košice - mestská časť Juh,22692.0,2324.881955,9.760496,2020,SK_599824,"POLYGON ((5144451.048 2904986.166, 5144591.047..."
98610,SK_599841,SK,599841,Košice - mestská časť Šaca,5969.0,124.672777,47.877333,2020,SK_599841,"POLYGON ((5141041.890 2900350.394, 5144604.564..."
98611,SK_599859,SK,599859,Košice - mestská časť Poľov,1219.0,94.049511,12.961258,2020,SK_599859,"POLYGON ((5141041.890 2900350.394, 5141299.765..."


In [59]:
NUTS

Unnamed: 0,NUTS_ID,LEVL_CODE,CNTR_CODE,NAME_LATN,NUTS_NAME,MOUNT_TYPE,URBN_TYPE,COAST_TYPE,geometry
0,AL,0,AL,b'5368716970eb726961',b'5368716970eb726961',0.0,0,0,"MULTIPOLYGON (((5121233.536 2221719.441, 51208..."
1,CZ,0,CZ,?esko,?esko,0.0,0,0,"POLYGON ((4624843.654 3112209.741, 4625546.618..."
2,DE,0,DE,Deutschland,Deutschland,0.0,0,0,"MULTIPOLYGON (((4355225.365 2715902.993, 43541..."
3,DK,0,DK,Danmark,Danmark,0.0,0,0,"MULTIPOLYGON (((4650502.736 3591342.844, 46503..."
4,CY,0,CY,b'4bfd70726f73',??????,0.0,0,0,"MULTIPOLYGON (((6527040.718 1762367.593, 65267..."
...,...,...,...,...,...,...,...,...,...
2005,NO0B1,3,NO,Jan Mayen,Jan Mayen,3.0,3,1,"POLYGON ((3623747.621 5400386.841, 3624031.138..."
2006,EE009,3,EE,Kesk-Eesti,Kesk-Eesti,4.0,3,1,"MULTIPOLYGON (((5216227.688 4159212.769, 52172..."
2007,NO0,1,NO,Norge,Norge,0.0,0,0,"MULTIPOLYGON (((4961367.759 5413266.131, 49622..."
2008,NO0B,2,NO,Jan Mayen and Svalbard,Jan Mayen and Svalbard,,0,0,"MULTIPOLYGON (((4744650.828 6379141.635, 47446..."


In [None]:
# Write data as parquet file
lau_parquet_file = str(LAU_file).replace('.shp','.parquet')
LAU.to_parquet(lau_parquet_file)

nuts_parquet_file = str(NUTS_file).replace('.shp','.parquet')
NUTS.to_parquet(nuts_parquet_file)

In [60]:
# Match LAU and NUTS using this work: https://edjnet.github.io/lau_centres/lau_nuts.html

LAU_NUTS_match = pd.read_csv(r'p:\11207608-coclico\FASTTRACK_DATA\XX_NUTS\lau_2020_nuts_2021_concordance_by_geo.csv')
# Drop unneeded columns
LAU_NUTS_match = LAU_NUTS_match.drop(columns=['gisco_id','country','lau_id','lau_name','population','area_km2','year'])
LAU_NUTS_match = LAU_NUTS_match.rename(columns={'fid': 'FID'})

# Merge the two dataframes
LAU_NUTS_data = LAU.merge(LAU_NUTS_match,on='FID')

# Reorder columns
# Retrieve column names
cols = LAU_NUTS_data.columns

# Move columns
cols = cols.insert([2],cols[-2:])

# Drop the old ones
cols = cols[:-2]

# Store in final form
LAU_NUTS_data = LAU_NUTS_data[cols]

In [61]:
LAU_NUTS_data

Unnamed: 0,GISCO_ID,CNTR_CODE,nuts_2,nuts_3,LAU_ID,LAU_NAME,POP_2020,POP_DENS_2,AREA_KM2,YEAR,FID,geometry
0,CZ_579475,CZ,CZ05,CZ052,579475,Libňatov,363.0,62.240158,5.832247,2020,CZ_579475,"POLYGON ((4747705.585 3057545.260, 4746836.524..."
1,CZ_579483,CZ,CZ05,CZ052,579483,Libotov,178.0,41.774759,4.260946,2020,CZ_579483,"POLYGON ((4731662.418 3049054.578, 4732297.093..."
2,CZ_579491,CZ,CZ03,CZ032,579491,Vranov,181.0,37.408873,4.838424,2020,CZ_579491,"POLYGON ((4540137.623 2963864.496, 4538706.113..."
3,CZ_579505,CZ,CZ05,CZ052,579505,Malá Úpa,142.0,5.318510,26.699207,2020,CZ_579505,"POLYGON ((4730399.261 3087673.334, 4730914.253..."
4,AT_41004,AT,AT31,AT312,41004,Eggendorf im Traunkreis,1041.0,116.481747,8.937023,2020,AT_41004,"POLYGON ((4631543.792 2788412.230, 4630476.583..."
...,...,...,...,...,...,...,...,...,...,...,...,...
98608,SK_599816,SK,SK04,SK042,599816,Košice - mestská časť Nad jazerom,24443.0,6686.211551,3.655732,2020,SK_599816,"POLYGON ((5149120.507 2905279.350, 5150154.014..."
98609,SK_599824,SK,SK04,SK042,599824,Košice - mestská časť Juh,22692.0,2324.881955,9.760496,2020,SK_599824,"POLYGON ((5144451.048 2904986.166, 5144591.047..."
98610,SK_599841,SK,SK04,SK042,599841,Košice - mestská časť Šaca,5969.0,124.672777,47.877333,2020,SK_599841,"POLYGON ((5141041.890 2900350.394, 5144604.564..."
98611,SK_599859,SK,SK04,SK042,599859,Košice - mestská časť Poľov,1219.0,94.049511,12.961258,2020,SK_599859,"POLYGON ((5141041.890 2900350.394, 5141299.765..."


In [63]:
# After LAU and NUTS are match we will only select the LAU's that overlap with the Coastal Mask

# masking the data with the Coastal Mask (takes abou 45 min)
folder_mask = r"p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs"

# Retrieve total bounds in CRS of coastal mask
totalbbox = LAU_NUTS_data.to_crs("EPSG:4326").total_bounds

# Polygonized, empty vecotr for storing the Coastal Mask Polygons
pgnzd = []
# read tif files in the coastal mask folder
for idx, file in enumerate(os.listdir(folder_mask)):
    print(idx)
    if file.endswith(".tif"):
        img = xr.open_dataset(os.path.join(folder_mask, file), engine="rasterio", mask_and_scale=False) 
        if geometry.box(*img.rio.bounds()).intersects(geometry.box(*totalbbox)): # only do the ones in the LAU_NUTS bbox --> speed up to 17 min
            print(os.path.join(folder_mask, file))

            # get the data
            data_array = img.band_data.values
            mask = data_array == 1
            
            # Split the string into a list of values and correct the gdal affine transformation
            geotrans = list(map(float, img.spatial_ref.GeoTransform.split(' ')))
            afn = Affine.from_gdal(*geotrans)

            # Extract shapes
            results = (
                {'properties': {'value': v}, 'geometry': s}
                for i, (s, v) in enumerate(
                    shapes(data_array, mask=mask, transform=afn))
            )

            pgnzd.append(list(results))

0
1
2
3
4
5
6
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x-63.95_y-3.75.tif
7
8
9
10
11
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x-26.4_y13.32.tif
12
13
14
15
16
17
18
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x-5.92_y-20.81.tif
19
20
21
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x-33.23_y33.8.tif
22
23
24
25
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x-60.53_y40.63.tif
26
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x-9.33_y61.11.tif
27
28
29
30
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x41.87_y44.04.tif
31
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x48.69_y33.8.tif
32
33
34
35
36
37
38
39
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x-16.16_y-3.75.tif
40
41
42
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x-40.05_y40.63.tif
43
44
45
46
47
48
p:\11207608-coclico\FASTTRACK_DATA\19_coastal_mask\cogs\B01_x38.45_y44.04.tif
4

In [64]:
# flatten the list
fpgnzd = list(itertools.chain(*pgnzd))
len(fpgnzd)

167957

In [66]:
# Convert shapes to a GeoDataFrame
shapes_gdf = gpd.GeoDataFrame.from_features(fpgnzd)

CRSError: Invalid projection: ESPG:4326: (Internal Proj Error: proj_create: crs not found)

In [71]:
shapes_gdf = shapes_gdf.set_crs("EPSG:4326")
shapes_gdf = shapes_gdf.to_crs("EPSG:3035")

In [72]:
# Perform a spatial join to find intersections
LAU_NUTS_reproj = LAU_NUTS_data.set_crs("EPSG:3035")
joined_gdf = gpd.sjoin(LAU_NUTS_reproj, shapes_gdf, how='inner', predicate='intersects')

# remove columns added by shapes_gdf
joined_gdf.drop(columns=['index_right', 'value'])

# Remove duplicate rows based on the index of gdf1
LAU_NUTS_CLEANCM = joined_gdf[~joined_gdf.index.duplicated(keep='first')]

# Remove extra columns resulting from the spatial join
LAU_NUTS_CLEANCM = LAU_NUTS_CLEANCM[LAU_NUTS_CLEANCM.columns[:-2]]

LAU_NUTS_CLEANCM


Unnamed: 0,GISCO_ID,CNTR_CODE,nuts_2,nuts_3,LAU_ID,LAU_NAME,POP_2020,POP_DENS_2,AREA_KM2,YEAR,FID,geometry,index_right,value
1192,DE_03361001,DE,DE93,DE93B,03361001,"Achim, Stadt",31923.0,471.398771,67.719735,2020,DE_03361001,"POLYGON ((4258927.295 3320011.519, 4259193.583...",29151,1.0
1193,DE_03361002,DE,DE93,DE93B,03361002,Blender,2885.0,75.034640,38.448908,2020,DE_03361002,"POLYGON ((4264556.862 3317435.819, 4265677.354...",29151,1.0
1195,DE_03361003,DE,DE93,DE93B,03361003,Dörverden,9009.0,107.940908,83.462333,2020,DE_03361003,"POLYGON ((4275479.269 3295636.790, 4275328.930...",29151,1.0
1196,DE_03361004,DE,DE93,DE93B,03361004,Emtinghausen,1464.0,68.573846,21.349247,2020,DE_03361004,"POLYGON ((4250468.027 3309420.666, 4250432.180...",29151,1.0
1197,DE_03361005,DE,DE93,DE93B,03361005,Kirchlinteln,9911.0,56.675823,174.871744,2020,DE_03361005,"POLYGON ((4272425.673 3310568.711, 4274136.202...",29151,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97667,FR_97614,FR,FRY5,FRY50,97614,Ouangani,10203.0,555.838786,18.356042,2020,FR_97614,"POLYGON ((8727685.023 -2785490.190, 8727139.84...",165557,1.0
97669,FR_97616,FR,FRY5,FRY50,97616,Sada,11156.0,1012.643086,11.016715,2020,FR_97616,"POLYGON ((8727114.505 -2787670.745, 8725517.75...",165557,1.0
97670,FR_97617,FR,FRY5,FRY50,97617,Tsingoni,13934.0,406.428213,34.284037,2020,FR_97617,"POLYGON ((8730964.436 -2773602.954, 8730972.75...",165557,1.0
97660,FR_97608,FR,FRY5,FRY50,97608,Dzaoudzi,17831.0,2797.974566,6.372824,2020,FR_97608,"MULTIPOLYGON (((8745852.102 -2769772.861, 8742...",165548,1.0


In [80]:
# Check if its the same as original parquet is the same
test_parq = gpd.read_parquet(r"p:\11207608-coclico\FULLTRACK_DATA\WP4\LAU_stats\LAU_2020_NUTS_2021_01M_3035_CM_original.parquet")

if test_parq.equals(LAU_NUTS_CLEANCM):
    print('Hell yeah!')

Hell yeah!


In [79]:
# Write geodataframe to parquet
coclico_data_dir = p_drive.joinpath("11207608-coclico", "FULLTRACK_DATA")
ds_dir = coclico_data_dir.joinpath('WP4','LAU_stats')
out_file= ds_dir.joinpath('LAU_2020_NUTS_2021_01M_3035_CM.parquet')

# altered_lau_data.to_parquet(out_file_altered)
LAU_NUTS_CLEANCM.to_parquet(out_file)