## Config

In [1]:
# --- Inputs -----------------------------------------------------------------
# Raw CSV to clean, and the municipality shapefile it is compared against.
INPUT_DATA_PATH = "../../data/raw/saldo-van-wisselingen-private-verhuur-naar-koopwoning_raw.csv"
INPUT_SHAPEFILE_PATH = "../../data/gemeente_shapefiles/gemeente_shapefile.shp"

# --- Output -----------------------------------------------------------------
# Destination of the cleaned CSV; it is only written when SAVE_OUTPUT is True.
OUTPUT_DATA_PATH = "../../data/processed/saldo-wisselingen-huur-naar-koop_clean.csv"
SAVE_OUTPUT = True

In [2]:
import shutil

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

## Load data

In [3]:
# Load the raw comma-separated table and the municipality shapefile.
df = pd.read_csv(INPUT_DATA_PATH, sep = ',')
# A shapefile has no field separator: extra keywords to read_file are forwarded
# to the underlying I/O engine, so no CSV-style `sep` is passed here.
gdf = gpd.read_file(INPUT_SHAPEFILE_PATH)

In [4]:
# Reproject the municipality geometries to EPSG:4326.
gdf = gdf.to_crs(crs="EPSG:4326")

## Data management

In [5]:
# Preview the first rows of the loaded table; column names are still those of the input file.
df.head()

Unnamed: 0,Gemeente,statcode,Saldo private verhuur naar koop
0,Aa en Hunze,GM1680,0
1,Aalsmeer,GM0358,1
2,Aalten,GM0197,2
3,Achtkarspelen,GM0059,-1
4,Alblasserdam,GM0482,1


In [6]:
# Give the columns clearer names:
#   'Saldo private verhuur naar koop' -> 'perc_transitie_verhuur_koop'
#   'statcode'                        -> 'gm_code' (the key used for the merge with the shapefile)
# NOTE(review): the source column is a "saldo" (balance) while the new name
# carries a `perc_` prefix — confirm the unit of this measure.
# Reassign instead of mutating in place so the cell has no hidden side effects.
df = df.rename(
    columns = {
        'Saldo private verhuur naar koop': 'perc_transitie_verhuur_koop',
        'statcode': 'gm_code',
    }
)
df.head()

Unnamed: 0,Gemeente,gm_code,perc_transitie_verhuur_koop
0,Aa en Hunze,GM1680,0
1,Aalsmeer,GM0358,1
2,Aalten,GM0197,2
3,Achtkarspelen,GM0059,-1
4,Alblasserdam,GM0482,1


In [7]:
# Right join on the municipality code: every row of the shapefile is kept, and
# shapefile rows without a match in `df` get NaN in the columns coming from `df`.
merged_df = df.merge(right = gdf, how = 'right', on = 'gm_code')
merged_df.head()

Unnamed: 0,Gemeente,gm_code,perc_transitie_verhuur_koop,gm_naam,geometry
0,,GM0003,,Appingedam,"POLYGON ((6.80583 53.31703, 6.80588 53.31704, ..."
1,,GM0010,,Delfzijl,"MULTIPOLYGON (((7.07833 53.26658, 7.07826 53.2..."
2,Groningen,GM0014,1.0,Groningen,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
3,,GM0024,,Loppersum,"POLYGON ((6.73921 53.38812, 6.73925 53.38815, ..."
4,Almere,GM0034,0.0,Almere,"MULTIPOLYGON (((5.21106 52.33159, 5.21106 52.3..."


In [8]:
# Column overview of the merged table; the non-null counts show how many
# shapefile rows received a value from `df`.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   Gemeente                     337 non-null    object  
 1   gm_code                      355 non-null    object  
 2   perc_transitie_verhuur_koop  337 non-null    float64 
 3   gm_naam                      355 non-null    object  
 4   geometry                     355 non-null    geometry
dtypes: float64(1), geometry(1), object(3)
memory usage: 14.0+ KB


In [9]:
# Drop the 'Gemeente' name column that came from `df`; it is empty for shapefile
# rows without a match, whereas 'gm_naam' from the shapefile is filled for every row.
# Reassigning with errors='ignore' keeps the cell re-runnable: an in-place drop
# raises KeyError on a second run because the column is already gone.
merged_df = merged_df.drop(columns = ['Gemeente'], errors = 'ignore')

In [10]:
# Row count and dtypes of the tabular input, for comparison with the merged table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 3 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Gemeente                     342 non-null    object
 1   gm_code                      342 non-null    object
 2   perc_transitie_verhuur_koop  342 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 8.1+ KB


In [11]:
# Row count and dtypes of the shapefile table, for comparison with the merged table.
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   gm_code   355 non-null    object  
 1   gm_naam   355 non-null    object  
 2   geometry  355 non-null    geometry
dtypes: geometry(1), object(2)
memory usage: 8.4+ KB


In [12]:
# Select the merged rows that have no value in 'perc_transitie_verhuur_koop',
# i.e. shapefile rows for which `df` supplied no value.
missing_values = merged_df.loc[lambda frame: frame['perc_transitie_verhuur_koop'].isna()]
missing_values

Unnamed: 0,gm_code,perc_transitie_verhuur_koop,gm_naam,geometry
0,GM0003,,Appingedam,"POLYGON ((6.80583 53.31703, 6.80588 53.31704, ..."
1,GM0010,,Delfzijl,"MULTIPOLYGON (((7.07833 53.26658, 7.07826 53.2..."
3,GM0024,,Loppersum,"POLYGON ((6.73921 53.38812, 6.73925 53.38815, ..."
110,GM0370,,Beemster,"POLYGON ((4.86299 52.56828, 4.86301 52.56852, ..."
123,GM0398,,Heerhugowaard,"POLYGON ((4.82572 52.68498, 4.82572 52.68499, ..."
130,GM0416,,Langedijk,"POLYGON ((4.76743 52.69821, 4.76782 52.69827, ..."
142,GM0457,,Weesp,"POLYGON ((5.01842 52.32434, 5.01841 52.32452, ..."
149,GM0501,,Brielle,"POLYGON ((4.16027 51.88325, 4.16035 51.88335, ..."
157,GM0530,,Hellevoetsluis,"MULTIPOLYGON (((4.22012 51.82577, 4.22008 51.8..."
178,GM0614,,Westvoorne,"POLYGON ((4.05047 51.90730, 4.05061 51.90739, ..."


## Save output

In [13]:
# Write the table as a semicolon-separated CSV without the index column,
# only when saving is enabled in the config cell.
# NOTE(review): this writes `df` (the renamed input table), not `merged_df` —
# confirm that this is the intended output.
if SAVE_OUTPUT:
    df.to_csv(path_or_buf = OUTPUT_DATA_PATH, index = False, sep = ';')

## Watermark

In [14]:
!python -m pip install watermark --quiet

In [15]:
# Register the %watermark magic used in the following cells.
%load_ext watermark

In [16]:
# Record the run timestamp plus Python, IPython and machine details for reproducibility.
%watermark

Last updated: 2025-01-27T11:02:15.518615+01:00

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.17.2

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [17]:
# Record the versions of the packages imported in this notebook.
%watermark --iversions

numpy     : 1.26.1
matplotlib: 3.8.1
pandas    : 2.1.2
geopandas : 0.14.0



In [18]:
!lsb_release -a

'lsb_release' is not recognized as an internal or external command,
operable program or batch file.
