In [8]:
# Load public modules.
import os, sys
from sklearn.pipeline import Pipeline
from pathlib import Path

# Resolve the current user's home directory; the username is taken to be
# the last component of that path.
HOME = str(Path.home())
USERNAME = os.path.basename(HOME)

# Two candidate codebase locations: one below the home directory (old VAO)
# and one below /data/<username> (new VAO).
CODEBASE_PATH_OLD = os.path.join(HOME, 'Documents/woonfraude/codebase')
CODEBASE_PATH_NEW = os.path.join('/data', USERNAME, 'Documents/woonfraude/codebase')

# Register both locations directly after the first sys.path entry. The new
# VAO path is placed ahead of the old one, so it is searched first.
sys.path[1:1] = [CODEBASE_PATH_NEW, CODEBASE_PATH_OLD]

# Import own modules.
from datasets_oo import *
from clean_oo import *
from extract_features_oo import *

In [3]:
# Global switches, both disabled by default:
#   FORCE_DOWNLOAD                       - gates the forced re-download of the datasets.
#   FORCE_DATASET_SPECIFIC_PREPROCESSING - gates the dataset-specific pre-processing steps.
FORCE_DOWNLOAD = FORCE_DATASET_SPECIFIC_PREPROCESSING = False

# Load all datasets in memory

In [4]:
##############################
# Initialize dataset objects #
##############################
# Create one object per dataset. Nothing is loaded in this cell; later
# cells call download()/load() on these objects.

adresDataset, zakenDataset, stadiaDataset = AdresDataset(), ZakenDataset(), StadiaDataset()
personenDataset, bagDataset, hotlineDataset = PersonenDataset(), BagDataset(), HotlineDataset()

In [5]:
#######################################################################################
# Download data and perform dataset-specific pre-processing steps for dataset objects #
#######################################################################################

# With FORCE_DOWNLOAD set, request a forced download for each dataset listed below.
# NOTE(review): hotlineDataset is not in this list; its forced download sits in the
# pre-processing branch further down - confirm that this is intended.
if FORCE_DOWNLOAD:
    for _dataset in (adresDataset, zakenDataset, stadiaDataset, personenDataset, bagDataset):
        _dataset.download(force=True)


# With FORCE_DATASET_SPECIFIC_PREPROCESSING set, start from the 'download' version of
# each dataset and apply its dataset-specific steps in the order given.
if FORCE_DATASET_SPECIFIC_PREPROCESSING:

    # Adres: extract leegstand, then enrich with woning id.
    adresDataset.load('download')
    adresDataset.extract_leegstand()
    adresDataset.enrich_with_woning_id()

    # Zaken: add categories, then filter on them.
    zakenDataset.load('download')
    zakenDataset.add_categories()
    zakenDataset.filter_categories()

    # Stadia: add zaak/stadium ids, then add labels.
    stadiaDataset.load('download')
    stadiaDataset.add_zaak_stadium_ids()
    stadiaDataset.add_labels()

    # Personen: no dataset-specific steps are performed here.

    # BAG: forced download, then load and apply the bag fix.
    # NOTE(review): this download is forced even when FORCE_DOWNLOAD is False - confirm intent.
    bagDataset.download(force=True)
    bagDataset.load('download')
    bagDataset.bag_fix()

    # Hotline: forced download only, no further steps.
    # NOTE(review): also forced independently of FORCE_DOWNLOAD - confirm intent.
    hotlineDataset.download(force=True)

In [None]:
###################
## Load datasets ##
###################
# Download (or load cached versions of) the datasets.
#
# Each dataset object is re-created, loaded from its 'download' version and
# passed through its dataset-specific steps. The commented-out load(...)
# lines name the already-processed version that can be loaded instead.

# Adres
adresDataset = AdresDataset()
adresDataset.load('download')
adresDataset.extract_leegstand()
adresDataset.enrich_with_woning_id()
# adresDataset.load('download_leegstand_woningId')

# Zaken
zakenDataset = ZakenDataset()
zakenDataset.load('download')
zakenDataset.add_categories()
zakenDataset.filter_categories()
# zakenDataset.load('download_categories_filterCategories')

# Stadia
stadiaDataset = StadiaDataset()
stadiaDataset.load('download')
stadiaDataset.add_zaak_stadium_ids()
stadiaDataset.add_labels()
# stadiaDataset.load('download_ids_labels')

# Personen
personenDataset = PersonenDataset()
personenDataset.load('download')

# BAG
bagDataset = BagDataset()
# bagDataset.load('download')
# bagDataset.bag_fix()
# Call load() on the bagDataset instance (as every other dataset in this
# notebook does), not on the BagDataset class.
bagDataset.load('download_columnFix')

# Hotline
hotlineDataset = HotlineDataset()
hotlineDataset.load('download')

In [6]:
###################
## Load datasets ##
###################
# Load every dataset from its cached version. This requires that the download
# and pre-processing steps in the previous block have been done.

for _dataset, _version in (
    (adresDataset, 'download_leegstand_woningId'),
    (zakenDataset, 'download_categories_filterCategories'),
    (stadiaDataset, 'download_ids_labels'),
    (personenDataset, 'download'),
    (bagDataset, 'download_columnFix'),
    (hotlineDataset, 'download'),
):
    _dataset.load(_version)

Version 'download_leegstand_woningId' of dataset 'adres' loaded!
Version 'download_categories_filterCategories' of dataset 'zaken' loaded!
Version 'download_ids_labels' of dataset 'stadia' loaded!
Version 'download' of dataset 'personen' loaded!
Version 'download_columnFix' of dataset 'bag' loaded!
Version 'download' of dataset 'hotline' loaded!
