# Master Train Notebook

Deze notebook wordt gebruikt om de data, die met de master_prepare notebook geprepareerd is, in te laden en er een model op te trainen.

In [6]:
# Load public modules.
import os, sys
import pickle
import time
from pathlib import Path

import numpy as np
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Resolve the current user's home directory; its last path component is used as the username.
HOME = Path.home()
USERNAME = HOME.name

# The codebase can live in two places: under the home directory (old VAO)
# or under /data/<username> (new VAO).
CODEBASE_PATH_OLD = os.path.join(HOME, 'Documents/woonfraude/codebase/')
CODEBASE_PATH_NEW = os.path.join('/data', USERNAME, 'Documents/woonfraude/codebase/')

# Register both locations on the import path right after the first entry.
# The new-VAO path is placed ahead of the old one, so it is searched first.
sys.path[1:1] = [CODEBASE_PATH_NEW, CODEBASE_PATH_OLD]

# Import own modules.
from datasets import *
from build_model import *

# Load finalized dataset (from master_prepare)

In [2]:
# Load dataset.
# Creates the project's ZakenDataset object and loads its stored 'final' version.
# NOTE(review): ZakenDataset presumably comes from the `datasets` star import, and
# 'final' is presumably the version written by the master_prepare notebook - confirm.
zakenDataset = ZakenDataset()
zakenDataset.load('final')

Version 'final' of dataset 'zaken' loaded!


# Show Dataset Statistics

In [3]:
# Print the dataset size and the share of rows whose 'woonfraude' label is set.
zaken_frame = zakenDataset.data
print(f"Number of entries: {len(zaken_frame)}")

# The percentage is the label sum over the label count, rounded to one decimal.
fraud_labels = zaken_frame.woonfraude
positive_pct = round((fraud_labels.sum() * 100) / len(fraud_labels), 1)
print(f"Percentage positives: {positive_pct}%")

Number of entries: 16079
Percentage positives: 53.3%


# Create Train/Test Split

In [4]:
# Reduce the dataset to the columns that are used for training.
#
# The drops below happen in place, so after one execution the columns are gone.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# would raise a KeyError for the columns that were already removed.

# Remove the adres_id column (presumably an identifier rather than a feature - confirm).
zakenDataset.data.drop(columns=['adres_id'], inplace=True, errors='ignore')

# Remove text columns, which can't be used for training.
zakenDataset.data.drop(columns=['afg_code_afs', 'afs_code', 'afs_oms', 'beh_oms', 'mededelingen'],
                       inplace=True, errors='ignore')

# Only keep numeric data columns.
# NOTE(review): _get_numeric_data() is a private pandas method; it is kept as-is here
# because a public replacement may select a different set of dtypes.
zakenDataset.data = zakenDataset.data._get_numeric_data()

# Remove columns containing only NaN values.
zakenDataset.data.drop(columns=['hoofdadres', 'begin_geldigheid'], inplace=True, errors='ignore')

In [7]:
# Split up the dataset (only use numeric data!).
# split_data_train_test is a project helper (presumably from the `build_model`
# star import - confirm); its result is unpacked into train/test features and labels.
# NOTE(review): no seed is passed here, so the split may differ between runs unless
# the helper fixes one internally - verify.
X_train, X_test, y_train, y_test = split_data_train_test(zakenDataset.data)

Original dataset shape Counter({True: 8572, False: 7507})
Training set shape Counter({True: 7286, False: 6381})
Testing set shape Counter({True: 1286, False: 1126})




# Train Model

In [8]:
def report(results, n_top=10):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Search results laid out like ``cv_results_`` of a scikit-learn search
        object. The keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' are read; each holds one entry per
        candidate.
    n_top : int, default 10
        Highest rank to report. Ranks 1..n_top are printed in ascending
        order; candidates that share a rank are all printed.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # Coerce to an array so that the comparison below is element-wise even when
    # the ranks are given as a plain list (a list compared to an int is just False,
    # which would silently print nothing).
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(ranks == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [9]:
# Search space handed to the randomized search: one entry per hyper-parameter,
# either a scipy distribution to draw from or a list of candidate values.
# Note: scipy's randint(low, high) excludes the upper bound.
param_dist = dict(
    n_estimators=sp_randint(100, 1000),
    # Commented-out alternative: max_features=['auto']
    max_features=sp_randint(1, 100),
    max_depth=sp_randint(1, 100),
    min_samples_leaf=[1],
    min_samples_split=sp_randint(2, 5),
    bootstrap=[True, False],
    # Commented-out option: criterion=["gini", "entropy"]
)

In [None]:
# Run randomized search with random forest *regressor*.

# We currently use a scoring parameter that was chosen without too much thought.
# We intend to run the search using different scoring parameters in the future,
# and testing the performance in a binary fashion afterwards, by mapping percentages
# below 50% to False and percentages above 50% to True.
#
# See this link for more information about the possible scoring parameters for regression:
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

# Fixed seed, passed to both the forest and the search, so that the sampled
# candidates and the fitted trees are the same on every run of this cell.
RANDOM_STATE = 42

clf = RandomForestRegressor(random_state=RANDOM_STATE)
n_iter = 10  # Number of parameter settings sampled from param_dist.
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter,
                                   cv=5,
                                   n_jobs=-1,
                                   scoring='r2',
                                   random_state=RANDOM_STATE)

# Time the full search (all candidates times all folds).
start = time.time()
random_search.fit(X_train, y_train)

# Print results.
print(f"RandomizedSearchCV took {time.time() - start} seconds for {len(random_search.cv_results_['params'])} candidate parameter settings.")
report(random_search.cv_results_)

In [None]:
# Save best model for later reuse in dashboard. Manually put this model in the "data" folder (temporary solution).

# Select best model (the estimator refitted with the best parameters found by the search).
best_random_forest_regressor_temp = random_search.best_estimator_

# Create feature list and add to model, so the training column names are stored
# inside the pickled object together with the model itself.
feature_names = list(X_train.columns)
best_random_forest_regressor_temp.feature_names = feature_names

# Save model. The context manager closes the file handle once the dump has finished
# (or failed), instead of leaving an open handle behind.
with open("best_random_forest_regressor_temp.pickle", "wb") as model_file:
    pickle.dump(best_random_forest_regressor_temp, model_file)