In [1]:
import pandas as pd
import yaml

In [2]:
# Read the project configuration and echo the four raw-data file paths it defines.
with open("../config.yaml", "r") as config_file:
    paths = yaml.safe_load(config_file)

raw_files = paths['data']['raw']
for file_key in ('file1', 'file2', 'file3', 'file4'):
    print(raw_files[file_key])

../data/raw/inventory_2012-2023.csv
../data/raw/value_2019-2024.csv
../data/raw/value_1997-2020.csv
../data/raw/24-uitdraai-database-aardbevingen-atabix.csv


In [3]:
# Load the raw CSV files referenced in the config into DataFrames.
# NOTE(review): df1 and df2 are both read from 'file2', and 'file1' is never
# loaded in this cell. The df1 preview further down shows the 2019-2024 value
# columns, so reading 'file2' into df1 looks deliberate for this notebook — confirm.

raw_files = paths['data']['raw']
df1 = pd.read_csv(raw_files['file2'])
df2 = pd.read_csv(raw_files['file2'])
df3 = pd.read_csv(raw_files['file3'])
df4 = pd.read_csv(raw_files['file4'])


In [4]:
# preliminary view: show the first rows of the freshly loaded frame

df1.head()

Unnamed: 0,Perioden,2019,2020,2021,2022,2023**,2024*
0,Eigendom,Totaal,Totaal,Totaal,Totaal,Totaal,Totaal
1,Regio's,1 000 euro,1 000 euro,1 000 euro,1 000 euro,1 000 euro,1 000 euro
2,Nederland,250,271,290,317,368,379
3,Groningen (PV),177,190,205,228,268,279
4,Fryslân (PV),190,202,217,241,278,290


In [5]:
# check datatypes: column names, non-null counts and dtypes of the raw frame

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Perioden  376 non-null    object
 1   2019      370 non-null    object
 2   2020      370 non-null    object
 3   2021      367 non-null    object
 4   2022      360 non-null    object
 5   2023**    357 non-null    object
 6   2024*     357 non-null    object
dtypes: object(7)
memory usage: 20.7+ KB


In [6]:
# check values row 1: the first row (position 0) of the raw frame as a plain list

df1.iloc[0].tolist()

['Eigendom', 'Totaal', 'Totaal', 'Totaal', 'Totaal', 'Totaal', 'Totaal']

In [7]:
# check values row 2: the second row (position 1) of the raw frame as a plain list

df1.iloc[1].tolist()

["Regio's",
 '1 000 euro',
 '1 000 euro',
 '1 000 euro',
 '1 000 euro',
 '1 000 euro',
 '1 000 euro']

In [8]:
 # first row again, this time as a {column name: value} mapping
 dict(df1.iloc[0])

{'Perioden': 'Eigendom',
 '2019': 'Totaal',
 '2020': 'Totaal',
 '2021': 'Totaal',
 '2022': 'Totaal',
 '2023**': 'Totaal',
 '2024*': 'Totaal'}

In [9]:
# rename column names

# Map the raw headers to clean names: 'Perioden' becomes 'region' and the
# asterisk markers are stripped from the 2023/2024 year labels. The year
# columns without markers map to themselves.
column_mappings = {
    'Perioden': 'region',
    '2019': '2019',
    '2020': '2020',
    '2021': '2021',
    '2022': '2022',
    '2023**': '2023',
    '2024*': '2024'
}

print("\nColumn Mappings Dictionary:")
print(column_mappings)

# Rebind df1 to the renamed frame rather than mutating it in place, so the
# object shown by earlier cells is not silently changed underneath them.
df1 = df1.rename(columns=column_mappings)


Column Mappings Dictionary:
{'Perioden': 'region', '2019': '2019', '2020': '2020', '2021': '2021', '2022': '2022', '2023**': '2023', '2024*': '2024'}


In [10]:
# check output: confirm the column labels after renaming

df1.head()

Unnamed: 0,region,2019,2020,2021,2022,2023,2024
0,Eigendom,Totaal,Totaal,Totaal,Totaal,Totaal,Totaal
1,Regio's,1 000 euro,1 000 euro,1 000 euro,1 000 euro,1 000 euro,1 000 euro
2,Nederland,250,271,290,317,368,379
3,Groningen (PV),177,190,205,228,268,279
4,Fryslân (PV),190,202,217,241,278,290


In [11]:
# drop first two rows

# Index labels 0 and 1 hold header metadata ('Eigendom'/'Totaal' and
# "Regio's"/'1 000 euro') rather than data, so they are removed here.
df_value = df1.drop(index=[0, 1])

In [12]:
# check output: the frame should now start at index label 2

df_value.head()

Unnamed: 0,region,2019,2020,2021,2022,2023,2024
2,Nederland,250,271,290,317,368,379
3,Groningen (PV),177,190,205,228,268,279
4,Fryslân (PV),190,202,217,241,278,290
5,Drenthe (PV),203,215,230,255,295,307
6,Overijssel (PV),221,235,251,280,326,344


In [13]:
# check unique region values

# set(df_value['region'])

In [14]:
# convert to correct datatypes

# The year columns are loaded as strings (object dtype); convert each one to a
# numeric dtype. errors='coerce' turns any value that cannot be parsed into NaN
# instead of raising, so compare the null counts afterwards.
year_columns = ['2019', '2020', '2021', '2022', '2023', '2024']
for year in year_columns:
    df_value[year] = pd.to_numeric(df_value[year], errors='coerce')

In [15]:
# check datatypes: the year columns should now be numeric

df_value.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 2 to 375
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   region  374 non-null    object 
 1   2019    368 non-null    float64
 2   2020    368 non-null    float64
 3   2021    365 non-null    float64
 4   2022    358 non-null    float64
 5   2023    355 non-null    float64
 6   2024    355 non-null    float64
dtypes: float64(6), object(1)
memory usage: 20.6+ KB


In [16]:
# check for null values: number of missing entries per column

df_value.isna().sum()

region     0
2019       6
2020       6
2021       9
2022      16
2023      19
2024      19
dtype: int64

In [17]:
# (rows, columns) of the frame at this point
df_value.shape

(374, 7)

In [18]:
# drop null values (disabled: the dropna call below is commented out, so rows with missing values are kept)

# df_value = df_value.dropna()

In [19]:
# (rows, columns) again, to compare with the shape shown before the null-drop cell
df_value.shape

(374, 7)

In [20]:
# reset index: renumber the rows from 0 and discard the old index labels (drop=True)

df_value = df_value.reset_index(drop=True)


In [21]:
# check output: the index should start at 0 again
df_value.head()

Unnamed: 0,region,2019,2020,2021,2022,2023,2024
0,Nederland,250.0,271.0,290.0,317.0,368.0,379.0
1,Groningen (PV),177.0,190.0,205.0,228.0,268.0,279.0
2,Fryslân (PV),190.0,202.0,217.0,241.0,278.0,290.0
3,Drenthe (PV),203.0,215.0,230.0,255.0,295.0,307.0
4,Overijssel (PV),221.0,235.0,251.0,280.0,326.0,344.0


In [22]:
# distinct values in the region column, to spot entries that are not regions
set(df_value['region'])

{"'s-Gravenhage (gemeente)",
 "'s-Hertogenbosch",
 'Aa en Hunze',
 'Aalsmeer',
 'Aalten',
 'Achtkarspelen',
 'Alblasserdam',
 'Albrandswaard',
 'Alkmaar',
 'Almelo',
 'Almere',
 'Alphen aan den Rijn',
 'Alphen-Chaam',
 'Altena',
 'Ameland',
 'Amersfoort',
 'Amstelveen',
 'Amsterdam',
 'Apeldoorn',
 'Appingedam',
 'Arnhem',
 'Assen',
 'Asten',
 'Baarle-Nassau',
 'Baarn',
 'Barendrecht',
 'Barneveld',
 'Beek (L.)',
 'Beekdaelen',
 'Beemster',
 'Beesel',
 'Berg en Dal',
 'Bergeijk',
 'Bergen (L.)',
 'Bergen (NH.)',
 'Bergen op Zoom',
 'Berkelland',
 'Bernheze',
 'Best',
 'Beuningen',
 'Beverwijk',
 'Bladel',
 'Blaricum',
 'Bloemendaal',
 'Bodegraven-Reeuwijk',
 'Boekel',
 'Borger-Odoorn',
 'Borne',
 'Borsele',
 'Boxmeer',
 'Boxtel',
 'Breda',
 'Brielle',
 'Bron: CBS',
 'Bronckhorst',
 'Brummen',
 'Brunssum',
 'Bunnik',
 'Bunschoten',
 'Buren',
 'Capelle aan den IJssel',
 'Castricum',
 'Coevorden',
 'Cranendonck',
 'Cuijk',
 'Culemborg',
 'Dalfsen',
 'Dantumadiel',
 'De Bilt',
 'De Fryske 

In [23]:
# export clean dataset to csv

# 'Bron: CBS' shows up among the region values listed above. It appears to be
# the raw file's source-attribution footer ("Source: CBS"), not a region, so it
# is kept out of the exported dataset. df_value itself is left unchanged.
FOOTER_LABEL = 'Bron: CBS'
df_export = df_value[df_value['region'] != FOOTER_LABEL]

# index=False: the positional index carries no information worth persisting.
df_export.to_csv('value_2019-2024_clean_new.csv', index=False)