# MPN - Data Preprocessing

In [1]:
# Imports
import pandas as pd
import pyreadstat as ps
from functools import reduce
import matplotlib.pyplot as plt

In [2]:
# Read
# Each ps.read_sav call returns a pair; the first element is bound to *_data and
# the second to *_meta. File names are bare, so they are resolved relative to the
# kernel's current working directory.
# NOTE(review): wee_data / wee_meta and all of the *_meta objects are not
# referenced by any of the later cells visible in this notebook — confirm whether
# they are still needed.
aut_data, aut_meta = ps.read_sav('MPNWAVE8_AUTOdata.sav')
dab_data, dab_meta = ps.read_sav('MPNWAVE8_DAGBOEKdata.sav')
dag_data, dag_meta = ps.read_sav('MPNWAVE8_DAGdata.sav')
hhd_data, hhd_meta = ps.read_sav('MPNWAVE8_HHdata.sav')
pda_data, pda_meta = ps.read_sav('MPNWAVE8_Pdata.sav')
pbz_data, pbz_meta = ps.read_sav('MPNWAVE8_Pdata_bijzonder.sav')
wee_data, wee_meta = ps.read_sav('MPNWAVE8_weegfactoren.sav')

In [3]:
# Clean aut
# Two sources of columns to remove from aut_data:
#   1. aut_dropcols - a hand-picked list (one row per AUTO<n> prefix below)
#   2. aut_80nan    - every column whose fraction of missing values exceeds 0.8
# NOTE(review): the AUTO4 row has no 'AUTO4_GEWMAX' entry, unlike AUTO1-3 and
# AUTO5 — confirm this is intentional.
aut_dropcols = [
    'KENTEKENS_INGEVULD',
    'AUTO1_BRANDSTOF_A_w6', 'AUTO1_BRANDSTOF_B_w6', 'AUTO1_BIJTELLING_w5', 'AUTO1_GEWMAX', 'AUTO1_GEWLEEG',
    'AUTO1_HLID5', 'AUTO1_HLID6', 'AUTO1_HLID7', 'AUTO1_HLID8', 'AUTO1_HLID9',
    'AUTO1_HLID10', 'AUTO1_HLID11', 'AUTO1_HLID12', 'AUTO1_HLID13',
    'AUTO2_BRANDSTOF_A_w6', 'AUTO2_BRANDSTOF_B_w6', 'AUTO2_BIJTELLING_w5', 'AUTO2_GEWMAX', 'AUTO2_GEWLEEG',
    'AUTO2_HLID5', 'AUTO2_HLID6', 'AUTO2_HLID7', 'AUTO2_HLID8', 'AUTO2_HLID9',
    'AUTO2_HLID10', 'AUTO2_HLID11', 'AUTO2_HLID12', 'AUTO2_HLID13',
    'AUTO3_BRANDSTOF_A_w6', 'AUTO3_BRANDSTOF_B_w6', 'AUTO3_BIJTELLING_w5', 'AUTO3_GEWMAX', 'AUTO3_GEWLEEG',
    'AUTO3_HLID5', 'AUTO3_HLID6', 'AUTO3_HLID7', 'AUTO3_HLID8', 'AUTO3_HLID9',
    'AUTO3_HLID10', 'AUTO3_HLID11', 'AUTO3_HLID12', 'AUTO3_HLID13',
    'AUTO4_GEWLEEG', 'AUTO4_BRANDSTOF_A_w6', 'AUTO4_BRANDSTOF_B_w6', 'AUTO4_BIJTELLING_w5',
    'AUTO4_HLID5', 'AUTO4_HLID6', 'AUTO4_HLID7', 'AUTO4_HLID8', 'AUTO4_HLID9',
    'AUTO4_HLID10', 'AUTO4_HLID11', 'AUTO4_HLID12', 'AUTO4_HLID13',
    'AUTO5_BRANDSTOF_A_w6', 'AUTO5_BRANDSTOF_B_w6', 'AUTO5_BIJTELLING_w5', 'AUTO5_GEWMAX', 'AUTO5_GEWLEEG',
    'AUTO5_HLID5', 'AUTO5_HLID6', 'AUTO5_HLID7', 'AUTO5_HLID8', 'AUTO5_HLID9',
    'AUTO5_HLID10', 'AUTO5_HLID11', 'AUTO5_HLID12', 'AUTO5_HLID13',
]
# isna().mean() gives the per-column share of missing values
aut_80nan = [col for col, share in aut_data.isna().mean().items() if share > 0.8]
aut_drop = [*aut_dropcols, *aut_80nan]
# drop() returns a new frame; aut_data itself is left untouched
aut = aut_data.drop(columns=aut_drop)


In [4]:
# Clean dab
# No hand-picked columns for this table; only the missing-value filter applies.
dab_dropcols = []
# Columns whose share of missing values (isna().mean()) exceeds 0.8
dab_80nan = [col for col, share in dab_data.isna().mean().items() if share > 0.8]
dab_drop = [*dab_dropcols, *dab_80nan]
# drop() returns a new frame; dab_data itself is left untouched
dab = dab_data.drop(columns=dab_drop)

In [5]:
# Clean dag
# No hand-picked columns for this table; only the missing-value filter applies.
dag_dropcols = []
# Columns whose share of missing values (isna().mean()) exceeds 0.8
dag_80nan = [col for col, share in dag_data.isna().mean().items() if share > 0.8]
dag_drop = [*dag_dropcols, *dag_80nan]
# drop() returns a new frame; dag_data itself is left untouched
dag = dag_data.drop(columns=dag_drop)

In [16]:
# Clean hhd
hhd_dropcols = ['PROV', 'HHMOTOR', 'HHBROM', 'HHSNOR', 'HHFIETS', 'HHVOUWFIETS', 'HHEBIKE', 'HHPEDEL', 'HHSCOOT', 'HHOVG', 'HHGEEN', 'KENTEKENINFO', 'woonpc2']
hhd_80nan = hhd_data.columns[hhd_data.isna().mean() > 0.8].tolist()
hhd_drop = hhd_dropcols + hhd_80nan
hhd = hhd_data.drop(columns=hhd_drop, axis=1)

%store hhd

Stored 'hhd' (DataFrame)


In [7]:
# Clean pda

# One hand-picked column, plus every column that is more than 80% missing.
pda_dropcols = ['HH_VALID']
# isna().mean() gives the per-column share of missing values
pda_80nan = [col for col, share in pda_data.isna().mean().items() if share > 0.8]
pda_drop = [*pda_dropcols, *pda_80nan]
# drop() returns a new frame; pda_data itself is left untouched
pda = pda_data.drop(columns=pda_drop)

In [8]:
# Clean pbz

# Hand-picked columns to remove, laid out one name-prefix family per row.
pbz_dropcols = [
    'OORDEEL_TREIN', 'OORDEEL_BTM', 'OORDEEL_FIETS', 'OORDEEL_BROMMER', 'OORDEEL_LOPEN', 'OORDEEL_VLIEGTUIG',
    'GEBRUIK_TREIN_STELLING1', 'GEBRUIK_TREIN_STELLING2', 'GEBRUIK_TREIN_STELLING3', 'GEBRUIK_TREIN_STELLING4',
    'GEBRUIK_TREIN_STELLING5', 'GEBRUIK_TREIN_STELLING6', 'GEBRUIK_TREIN_STELLING7',
    'GEBRUIK_BTM_STELLING1', 'GEBRUIK_BTM_STELLING2', 'GEBRUIK_BTM_STELLING3', 'GEBRUIK_BTM_STELLING4',
    'GEBRUIK_BTM_STELLING5', 'GEBRUIK_BTM_STELLING6', 'GEBRUIK_BTM_STELLING7',
    'GEBRUIK_FIETS_STELLING1', 'GEBRUIK_FIETS_STELLING2', 'GEBRUIK_FIETS_STELLING3', 'GEBRUIK_FIETS_STELLING4',
    'GEBRUIK_FIETS_STELLING5', 'GEBRUIK_FIETS_STELLING6', 'GEBRUIK_FIETS_STELLING7',
    'GEBRUIK_LOPEN_STELLING1', 'GEBRUIK_LOPEN_STELLING2', 'GEBRUIK_LOPEN_STELLING3', 'GEBRUIK_LOPEN_STELLING4',
    'GEBRUIK_LOPEN_STELLING5', 'GEBRUIK_LOPEN_STELLING6', 'GEBRUIK_LOPEN_STELLING7',
    'VVM_TOEKOMST_ANDERS_TREIN', 'VVM_TOEKOMST_ANDERS_BTM', 'VVM_TOEKOMST_ANDERS_FIETS',
    'VVM_TOEKOMST_ANDERS_BROM', 'VVM_TOEKOMST_ANDERS_LOPEN', 'VVM_TOEKOMST_ANDERS_VLIEGTUIG',
    'COR_OV_VOORHEEN_TREIN', 'COR_OV_VOORHEEN_BTM',
    'COR_BTM_SUBST_1', 'COR_BTM_SUBST_2', 'COR_BTM_SUBST_5',
    'COR_TREIN_SUBST_1', 'COR_TREIN_SUBST_2', 'COR_TREIN_SUBST_4',
    'COR_OV_SUBST_FIETS_ERV_S1', 'COR_OV_SUBST_FIETS_ERV_S2', 'COR_OV_SUBST_FIETS_ERV_S3',
    'COR_OV_SUBST_BROM_ERV_S1', 'COR_OV_SUBST_BROM_ERV_S2', 'COR_OV_SUBST_BROM_ERV_S3',
    'COR_OV_SUBST_LOOP_ERV_S1', 'COR_OV_SUBST_LOOP_ERV_S2', 'COR_OV_SUBST_LOOP_ERV_S3',
]
# Columns whose share of missing values (isna().mean()) exceeds 0.8
pbz_80nan = [col for col, share in pbz_data.isna().mean().items() if share > 0.8]
pbz_drop = [*pbz_dropcols, *pbz_80nan]
# drop() returns a new frame; pbz_data itself is left untouched
pbz = pbz_data.drop(columns=pbz_drop)

In [9]:
# All used dfs
# NOTE(review): this name shadows Python's builtin all() for the rest of the
# session; it is kept because the following cells iterate over and concatenate
# `all`. Consider renaming it together with those cells.
# NOTE(review): despite the heading, dag and pbz are not part of this list —
# confirm that is intended.
all = [aut, dab, hhd, pda]

# Movement related dfs
# NOTE(review): `movement` is not referenced by any later cell visible in this notebook.
movement = [dag, pda, pbz]

In [10]:
# Check types of 'all' elements
# Sanity check before concatenation: report every entry of `all` that is neither
# a DataFrame nor a Series. Prints nothing when every entry passes.
for i, df in enumerate(all):
    if isinstance(df, (pd.DataFrame, pd.Series)):
        continue
    print(f"Element {i} is of type {type(df)}, not a Pandas DataFrame or Series.")

In [11]:
# Merge 'all' elements
# Place the frames side by side (axis=1); join='inner' keeps only the index
# labels that occur in every frame.
# NOTE(review): no visible cell sets a key index on these frames, so rows are
# paired on whatever index read_sav produced rather than on an ID column —
# confirm this row alignment is intended.
merged = pd.concat(all, axis=1, join='inner')
# Replace every remaining missing value with 0
merged = merged.fillna(0)
# Drop duplicates
# (column labels that occur more than once: only the first occurrence is kept)
merged = merged.loc[:, ~merged.columns.duplicated(keep='first')]


In [12]:
# Inner-join dag with pda on PERSID, then replace every missing value with 0.
# A later cell sorts on 'HHID_x', i.e. the copy of HHID coming from dag (left side).
move = pd.merge(dag, pda, how='inner', on='PERSID').fillna(0)
# Drop duplicates
# (column labels that occur more than once: only the first occurrence is kept)
# NOTE(review): merge already renames overlapping non-key columns with _x/_y
# suffixes, so this step likely removes nothing — confirm.
move = move.loc[:, ~move.columns.duplicated(keep='first')]

In [13]:
# Sort move by HHID_x
# Row labels are kept as they were before sorting, so after this line the index
# labels no longer match row positions.
move = move.sort_values(by='HHID_x')

# Store DataFrames split by 'HHID_x'
# Maps each distinct HHID_x value to the sub-frame holding all of its rows.
# groupby selects rows by value. The earlier hand-written loop passed the index
# labels yielded by iterrows() to .iloc as if they were positions, which is wrong
# once the frame has been sorted, so the stored slices covered the wrong rows.
# sort=False keeps the groups in the (already sorted) order in which they appear in `move`.
split_dfs = {hhid: rows for hhid, rows in move.groupby('HHID_x', sort=False)}

# Access a specific DataFrame from the dictionary
# Raises KeyError if this HHID_x value does not occur in `move`.
specific_HHID_df = split_dfs[30000715.0]

In [14]:
# Make pickles
# Write both result frames to the current working directory; existing files with
# these names are overwritten.
# `move` is written in the row order it has at this point (an earlier cell sorts it by HHID_x).
merged.to_pickle('merged_20.pkl')
move.to_pickle('move_20.pkl')