# Master Prepare Notebook

Dit notebook wordt gebruikt om alle data uit de datasets in te laden en verder te verwerken, zodat deze klaarstaat om modellen te trainen.

In [None]:
# Load public modules.
from sklearn.pipeline import Pipeline
from pathlib import Path
import os, sys
import pickle
import time

# Resolve the current user's home directory; the username is its last path component.
HOME = Path.home()
USERNAME = HOME.name

# Candidate locations of the project codebase, one per environment.
CODEBASE_PATH_OLD = os.path.join(HOME, 'Documents/woonfraude/codebase/')  # old VAO
CODEBASE_PATH_NEW = os.path.join('/data', USERNAME, 'Documents/woonfraude/codebase/')  # new VAO
CODEBASE_PATH_WONEN = os.path.abspath('E:\\Jasmine\\woonfraude\\codebase')  # wonen server

# Register all candidates on the module search path. Every insert targets index 1,
# so a later entry ends up in front of an earlier one: WONEN, then NEW, then OLD.
for _codebase_path in (CODEBASE_PATH_OLD, CODEBASE_PATH_NEW, CODEBASE_PATH_WONEN):
    sys.path.insert(1, _codebase_path)

# Import own modules.
from datasets import *
from clean import *
from extract_features import *

In [None]:
# Set global variables.
# When True, the download cell calls download(force=True) on the dataset objects.
FORCE_DOWNLOAD: bool = False
# When True, the dataset-specific pre-processing steps are re-run on the downloaded data.
FORCE_DATASET_SPECIFIC_PREPROCESSING: bool = False

# Load all datasets in memory

In [None]:
##############################
# Initialize dataset objects #
##############################

# Create one dataset object per data source. The classes are not defined in this
# notebook; presumably they come from the star-imported `datasets` module.
adresDataset = AdresDataset()
zakenDataset = ZakenDataset()
stadiaDataset = StadiaDataset()
personenDataset = PersonenDataset()
bagDataset = BagDataset()
hotlineDataset = HotlineDataset()

In [None]:
#######################################################################################
# Download data and perform dataset-specific pre-processing steps for dataset objects #
#######################################################################################

# Optionally force a fresh download, one dataset after the other.
# NOTE(review): bagDataset is initialized in this notebook but is not downloaded
# here — confirm that this is intended.
if FORCE_DOWNLOAD:
    for _dataset in (adresDataset, zakenDataset, stadiaDataset, personenDataset, hotlineDataset):
        _dataset.download(force=True)


# Optionally redo the dataset-specific pre-processing of the downloaded data.
# NOTE(review): no explicit save() follows these steps; presumably the called
# methods store their own result — verify against the dataset classes.
if FORCE_DATASET_SPECIFIC_PREPROCESSING:

    # Adres dataset: start from the raw download, then extract leegstand and add the woning id.
    adresDataset.load('download')
    adresDataset.extract_leegstand()
    adresDataset.enrich_with_woning_id()

    # Zaken dataset: start from the raw download, then add and filter categories.
    zakenDataset.load('download')
    zakenDataset.add_categories()
    # Remove reports with the categories "woningkwaliteit" and "afdeling vergunningen en beheer".
    zakenDataset.filter_categories()

    # Stadia dataset: start from the raw download, then add the zaak/stadium ids.
    stadiaDataset.load('download')
    stadiaDataset.add_zaak_stadium_ids()

# Clean all datasets

In [None]:
#########################
## Clean zaken dataset ##
#########################

# Load the 'download_categories_filterCategories' version of the dataset from cache.
zakenDataset.load('download_categories_filterCategories')

# Configure the cleaning step; the pipeline consists of this single step.
zaken_cleaner = CleanTransformer(
    id_column=zakenDataset.id_column,
    drop_duplicates=True,
    fix_date_columns=['begindatum', 'einddatum', 'wzs_update_datumtijd'],
    clean_dates=True,
    lower_string_columns=True,
    impute_missing_values=True,
    impute_missing_values_custom={'categorie': 'missing'},
)
zakenPipeline = Pipeline(steps=[('clean', zaken_cleaner)])

# Run the pipeline and store the result back on the dataset object.
zakenDataset.data = zakenPipeline.fit_transform(zakenDataset.data)

# Save the cleaned data under a new version name.
zakenDataset.version = 'download_categories_filterCategories_cleaned'
zakenDataset.save()

# Drop the reference to the dataset object; it is re-created later in the notebook.
del zakenDataset

In [None]:
##########################
## Clean stadia dataset ##
##########################

# Load the 'download_ids' version of the dataset from cache.
stadiaDataset.load('download_ids')

# Date columns handed to the cleaning step.
stadia_date_columns = [
    'begindatum',
    'peildatum',
    'einddatum',
    'date_created',
    'date_modified',
    'wzs_update_datumtijd',
]

# Configure the cleaning step; the pipeline consists of this single step.
stadia_cleaner = CleanTransformer(
    id_column=stadiaDataset.id_column,
    drop_duplicates=True,
    fix_date_columns=stadia_date_columns,
    clean_dates=True,
    lower_string_columns=True,
    impute_missing_values=True,
)
stadiaPipeline = Pipeline(steps=[('clean', stadia_cleaner)])

# Run the pipeline and store the result back on the dataset object.
stadiaDataset.data = stadiaPipeline.fit_transform(stadiaDataset.data)

# Save the cleaned data under a new version name.
stadiaDataset.version = 'download_ids_cleaned'
stadiaDataset.save()

# Drop the reference to the dataset object; it is re-created later in the notebook.
del stadiaDataset

In [None]:
############################
## Clean personen dataset ##
############################

# Load the 'download' version of the dataset from cache.
personenDataset.load('download')

# Configure the cleaning step; the pipeline consists of this single step.
personen_cleaner = CleanTransformer(
    id_column=personenDataset.id_column,
    drop_duplicates=True,
    fix_date_columns=['geboortedatum'],
    lower_string_columns=True,
)
personenPipeline = Pipeline(steps=[('clean', personen_cleaner)])

# Run the pipeline and store the result back on the dataset object.
personenDataset.data = personenPipeline.fit_transform(personenDataset.data)

# Save the cleaned data under a new version name.
personenDataset.version = 'download_cleaned'
personenDataset.save()

# Drop the reference to the dataset object; it is re-created later in the notebook.
del personenDataset

In [None]:
#######################
## Clean BAG dataset ##
#######################

# Load the 'download_columnFix' version of the dataset from cache.
bagDataset.load('download_columnFix')

# Per-column values passed as `fillna_columns` (presumably the replacement for
# missing entries — confirm against CleanTransformer). Note that 'None' is a string.
bag_fill_values = {
    '_huisnummer_verblijfsobject': 0,
    '_huisletter_verblijfsobject': 'None',
    '_openbare_ruimte_naam_verblijfsobject': 'None',
    '_huisnummer_toevoeging_verblijfsobject': 'None',
    'type_woonobject_omschrijving': 'None',
    'eigendomsverhouding_id': 'None',
    'financieringswijze_id': -1,
    'gebruik_id': -1,
    'reden_opvoer_id': -1,
    'status_id_verblijfsobject': -1,
    'toegang_id': 'None',
}

# Configure the cleaning step; the pipeline consists of this single step.
bag_cleaner = CleanTransformer(
    id_column=bagDataset.id_column,
    drop_duplicates=True,
    fix_date_columns=[],
    drop_columns=['indicatie_geconstateerd', 'indicatie_in_onderzoek', 'woningvoorraad'],
    lower_string_columns=True,
    impute_missing_values=True,
    impute_missing_values_mode=['status_coordinaat_code'],
    fillna_columns=bag_fill_values,
)
bagPipeline = Pipeline(steps=[('clean', bag_cleaner)])

# Run the pipeline and store the result back on the dataset object.
bagDataset.data = bagPipeline.fit_transform(bagDataset.data)

# Save the cleaned data under a new version name.
bagDataset.version = 'download_columnFix_cleaned'
bagDataset.save()

# Drop the reference to the dataset object; it is re-created later in the notebook.
del bagDataset

In [None]:
###########################
## Clean hotline dataset ##
###########################

# Load the 'download' version of the dataset from cache.
hotlineDataset.load('download')

# Configure the cleaning step; the pipeline consists of this single step.
hotline_cleaner = CleanTransformer(
    id_column=hotlineDataset.id_column,
    drop_duplicates=True,
    lower_string_columns=True,
    impute_missing_values=True,
)
hotlinePipeline = Pipeline(steps=[('clean', hotline_cleaner)])

# Run the pipeline and store the result back on the dataset object.
hotlineDataset.data = hotlinePipeline.fit_transform(hotlineDataset.data)

# Save the cleaned data under a new version name.
hotlineDataset.version = 'download_cleaned'
hotlineDataset.save()

# Drop the reference to the dataset object; it is re-created later in the notebook.
del hotlineDataset

In [None]:
#########################
## Clean adres dataset ##
#########################

# Load the 'download_leegstand_woningId' version of the dataset from cache.
adresDataset.load('download_leegstand_woningId')

# TODO(review): an earlier note asked whether the extract step should be removed
# here, since it probably fits better after combining the datasets. The pipeline
# below contains only the 'clean' step.
adres_cleaner = CleanTransformer(
    id_column=adresDataset.id_column,
    drop_duplicates=True,
    fix_date_columns=['hvv_dag_tek', 'max_vestig_dtm', 'wzs_update_datumtijd'],
    lower_string_columns=True,
    impute_missing_values=True,
    fillna_columns={
        'hsnr': 0,
        'sttnaam': 'None',
        'hsltr': 'None',
        'toev': 'None',
    },
)
adresPipeline = Pipeline(steps=[('clean', adres_cleaner)])

# Run the pipeline and store the result back on the dataset object.
adresDataset.data = adresPipeline.fit_transform(adresDataset.data)

# Save the cleaned data under a new version name.
adresDataset.version = 'download_leegstand_woningId_cleaned'
adresDataset.save()

# Drop the reference to the dataset object; it is re-created later in the notebook.
del adresDataset

# Combine the datasets

In [None]:
##############################
# Initialize dataset objects #
##############################

# Re-create the dataset objects: each cleaning cell deletes its object (`del ...`)
# after saving, so fresh instances are needed before the cleaned versions are loaded.
adresDataset = AdresDataset()
zakenDataset = ZakenDataset()
stadiaDataset = StadiaDataset()
personenDataset = PersonenDataset()
bagDataset = BagDataset()
hotlineDataset = HotlineDataset()

In [None]:
###################
## Load datasets ##
###################
# Load datasets from cache (when download and pre-processing steps in previous block have been done).
# The version names are the ones assigned to `.version` before `save()` in the cleaning cells.

zakenDataset.load('download_categories_filterCategories_cleaned')
adresDataset.load('download_leegstand_woningId_cleaned')
stadiaDataset.load('download_ids_cleaned')
personenDataset.load('download_cleaned')
bagDataset.load('download_columnFix_cleaned')
hotlineDataset.load('download_cleaned')

In [None]:
##########################
## Enrich adres dataset ##
##########################

# Enrich the adres dataset with information from the bag, personen and hotline datasets.
# Each call receives the other dataset's `.data`; no return value is used, so the
# adres object is presumably updated in place — TODO confirm.
adresDataset.enrich_with_bag(bagDataset.data)
adresDataset.enrich_with_personen_features(personenDataset.data)
adresDataset.add_hotline_features(hotlineDataset.data)


##########################
## Enrich zaken dataset ##
##########################

# NOTE: both steps below are disabled (commented out) in this cell, so the zaken
# dataset is neither restricted to finished cases nor labelled here.

# Only keep the finished cases in the zaken dataset (remove all unfinished cases).
# zakenDataset.keep_finished_cases(stadiaDataset.data)

# Add a label to indicate woonfraude.
# zakenDataset.add_binary_label_zaken(stadiaDataset.data)

In [None]:
###############################################################################
## Remove implicit label columns and superfluous columns  from adres dataset ##
###############################################################################

# Columns originating from the adres data that are to be removed.
adres_remove = [
    # Removed because the column does not exist yet when a report (melding) is received.
    'wzs_update_datumtijd',
    # Removed because the columns do not add extra information.
    'kmrs',
    'straatcode',
    'xref',
    'yref',
    'postcode',
    'wzs_buurtcode_os_2015',
    'wzs_buurtcombinatiecode_os_2015',
    'wzs_stadsdeelcode_os_2015',
    # Empty columns.
    'hvv_dag_tek',
    'max_vestig_dtm',
    'wzs_22gebiedencode_os_2015',
    'wzs_22gebiedennaam_os_2015',
    # Remaining columns without extra information.
    'pvh_cd',
    'sbv_code',
    'sbw_code',
    'wzs_wijze_verrijking_geo',
    'wzs_22gebiedencode_2015',
    'brt_naam',
    'wzs_buurtnaam_os_2015',
    'wzs_buurtcombinatienaam_os_2015',
    'wzs_rayonnaam_os_2015',
    'wzs_rayoncode_os_2015',
    'wzs_stadsdeelnaam_os_2015',
    'wzs_alternatieve_buurtennaam_os_2015',
    'wzs_alternatieve_buurtencode_os_2015',
    'wzs_geom',
    'brt_code',
    'brtcombi_code',
    'brtcombi_naam',
    'sdl_code',
    'wzs_22gebiedennaam_2015',
    'wzs_id',
    'a_dam_bag',
    'landelijk_bag',
]

# Columns originating from the BAG data that are to be removed.
bag_remove = [
    'einde_geldigheid',  # Only 2 entries in column.
    'verhuurbare_eenheden',  # Only ~2k entries in column.
    'geometrie_ligplaats',  # Needs a lot of processing before being useful.
    'bron_id_verblijfsobject',  # Only 2 entries in column.
    'locatie_ingang_id',  # Only 2 entries in column.
    'reden_afvoer_id',  # Only a few entries in column.
    '_gebiedsgerichtwerken_id',  # Superfluous (gebied).
    '_grootstedelijkgebied_id',  # Superfluous (grootstedelijkgebied).
    'buurt_id',  # Superfluous (buurt).
    # Original note: the 4 columns below could not be removed earlier because they
    # were needed to match with the adres dataframe. They do have to go now, which
    # is why everything is removed here.
    '_openbare_ruimte_naam_nummeraanduiding',  # Superfluous (straatnaam).
    'vervallen_nummeraanduiding',
    'vervallen_ligplaats',
    'vervallen_standplaats',
    'vervallen_verblijfsobject',
    'document_mutatie',  # Not available at time of signal.
    'date_modified_nummeraanduiding',  # Not available at time of signal.
    'document_nummer',  # Not needed? (Swaan?)
    'status_coordinaat_omschrijving',  # Not needed? (Swaan?)
    'type_woonobject_code',  # Not needed? (Swaan?)
    'id_ligplaats',  # Not needed.
    'landelijk_id_ligplaats',  # Not needed.
    'id_standplaats',  # Not needed.
    'landelijk_id_standplaats',  # Not needed.
    'id_verblijfsobject',  # Not needed.
    'landelijk_id_verblijfsobject',  # Not needed.
]

# Remove the columns that are defined above from the dataset (in place).
# Assumes adresDataset.data is a pandas DataFrame — TODO confirm. With pandas'
# default errors='raise', a KeyError is raised if any listed column is absent.
adresDataset.data.drop(columns=adres_remove + bag_remove, inplace=True)

In [None]:
####################################################
## Merge the adres dataset onto the zaken dataset ##
####################################################

# Merge the adres dataset onto the zaken dataset.
# how='left' keeps every row of the zaken data; adres columns are matched on 'adres_id'.
# NOTE(review): if 'adres_id' is not unique in the adres data, zaken rows would be
# duplicated by this join — confirm uniqueness.
zakenDataset.data = zakenDataset.data.merge(adresDataset.data, on='adres_id', how='left')

# Perform Feature Extraction

In [None]:
#################################################
## Perform feature extraction on zaken dataset ##
#################################################

# Categorical columns per source, handed to the feature extraction step as
# `categorical_cols_hot`.
categorical_col_hot_zaken = [
    'afg_code_beh',
    'beh_code',
    'eigenaar',
    'categorie',
]
categorical_cols_hot_adres = [
    'toev',
    'pvh_omschr',
    'sbw_omschr',
    'sbv_omschr',
]
categorical_cols_hot_bag = [
    'status_coordinaat_code',
    'type_woonobject_omschrijving',
    'eigendomsverhouding_id',
    'financieringswijze_id',
    'gebruik_id',
    'ligging_id',
    'reden_opvoer_id',
    'status_id_nummeraanduiding',
    'toegang_id',
]

# Concatenate the categorical columns of all three sources, in zaken / adres / bag order.
all_categorical_cols_hot = [
    *categorical_col_hot_zaken,
    *categorical_cols_hot_adres,
    *categorical_cols_hot_bag,
]

# Single-step pipeline that runs the feature extraction transformer.
feature_extractor = FeatureExtractionTransformer(categorical_cols_hot=all_categorical_cols_hot)
zakenPipeline = Pipeline(steps=[('extract', feature_extractor)])

# Run the pipeline and store the result back on the dataset object.
zakenDataset.data = zakenPipeline.fit_transform(zakenDataset.data)

# Save Finalized Dataset

In [None]:
# Save the finalized zaken dataset under the version name 'final_all_cases'.
zakenDataset.version = 'final_all_cases'
zakenDataset.save()