# Using a Neural Network for Tanzania Tourism Prediction

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import time

from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.layers.experimental import preprocessing

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer

    
print('Using TensorFlow version: %s' % tf.__version__)

In [None]:
#!pip install -q git+https://github.com/tensorflow/docs
    
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

## Data loading

In [None]:
# Read the training table from disk
raw_data_df = pd.read_csv('data/original_zindi_data/Train.csv')

# Separate the regression target ("total_cost") from the feature columns
y = raw_data_df["total_cost"]
X = raw_data_df.drop(columns=["total_cost"])

# Hold out 20 % of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Preprocessing of data

In [None]:
# create pipeline
def create_preprocessor(X_train:pd.DataFrame):
    """Build an unfitted ColumnTransformer for the tourism feature table.

    The column lists are derived from ``X_train`` itself (names and dtypes),
    so the function does not depend on any notebook-level variable.

    Column routing:
      * 'total_female', 'total_male'  -> median imputation + standard scaling
      * 'travel_with'                 -> fill with 'missing' + one-hot
      * 'most_impressing'             -> fill with 'No comments' + one-hot
      * remaining non-object columns  -> standard scaling
      * remaining object columns      -> one-hot
    'ID' and every column whose name contains 'package' are not routed to any
    pipeline; with ColumnTransformer's default ``remainder='drop'`` they are
    left out of the transformed output.

    Args:
        X_train: feature DataFrame used to discover column names and dtypes.

    Returns:
        A ColumnTransformer that has not been fitted yet.
    """
    print("Create preprocessor")

    # Feature lists that need a dedicated imputation strategy
    impute_median_features = ['total_female', 'total_male']      # num_features
    impute_missing_features = ['travel_with']                    # cat_feature
    impute_no_comments_features = ['most_impressing']            # cat_feature

    # ID is a unique identifier for each tourist and therefore not relevant for the model
    drop_features = ['ID']                                       # cat_feature

    # Columns that only concern package tours; looked up on the argument,
    # not on a global DataFrame.
    package_columns = [col for col in X_train.columns if 'package' in col]

    # Numeric columns, minus the ones that go through median imputation
    num_features = [
        col for col in X_train.columns[X_train.dtypes != object]
        if col not in impute_median_features
    ]

    # Categorical columns, minus everything that is imputed separately,
    # dropped, or package-related
    excluded_cat = set(
        impute_missing_features
        + impute_no_comments_features
        + drop_features
        + package_columns
    )
    cat_features = [
        col for col in X_train.columns[X_train.dtypes == object]
        if col not in excluded_cat
    ]

    def _one_hot():
        # A fresh encoder per pipeline: estimators must not be shared.
        # NOTE(review): `sparse=False` is the spelling for scikit-learn < 1.2;
        # newer releases rename it to `sparse_output` - confirm the pinned version.
        return OneHotEncoder(handle_unknown='ignore', drop='first', sparse=False)

    # Preprocessing pipelines
    impute_median_pipeline = Pipeline([
        ('imputer_num', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ])

    impute_missing_pipeline = Pipeline([
        ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
        ('1hot', _one_hot()),
    ])

    impute_no_comments_pipeline = Pipeline([
        ('imputer_cat', SimpleImputer(strategy='constant', fill_value='No comments')),
        ('1hot', _one_hot()),
    ])

    num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

    cat_pipeline = Pipeline([
        ('1hot', _one_hot()),
    ])

    preprocessor = ColumnTransformer([
        ('median', impute_median_pipeline, impute_median_features),
        ('missing', impute_missing_pipeline, impute_missing_features),
        ('nocomment', impute_no_comments_pipeline, impute_no_comments_features),
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ])

    return preprocessor

In [None]:
# Create a preprocessor (unfitted ColumnTransformer built from X_train's columns)
preprocessor = create_preprocessor(X_train)

# Use the preprocessor to preprocess the data.
# The preprocessor is fitted on the training split only; the test split is
# transformed with the already-fitted preprocessor so no test information
# leaks into imputation, scaling or encoding.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [None]:
# Functions for log-transforming of target (y)
def create_log_transformer():
    """Return a FunctionTransformer that applies log1p forward and expm1 as its inverse."""
    transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)
    return transformer

def log_transform_target(y:pd.Series) -> pd.Series:
    """Apply the log1p transformer from create_log_transformer() to the target.

    Args:
        y: target values on the original scale.

    Returns:
        The result of the transformer's ``transform`` call on ``y``.
    """
    print("Log-transform y")
    return create_log_transformer().transform(y)

def inverse_log_transform_target(y_log:pd.Series) -> pd.Series:
    """Map log-scale target values back to the original scale.

    Calls the ``inverse_func`` (expm1) of the transformer returned by
    create_log_transformer() directly on ``y_log``.

    Args:
        y_log: target values on the log1p scale.

    Returns:
        The values after applying the inverse function.
    """
    print("Inverse-log-transform y")
    undo_log = create_log_transformer().inverse_func
    return undo_log(y_log)


In [None]:
# log-transformation of data
# Both target splits are mapped with log1p; the model is trained on the
# log-scale target, so its predictions are on that scale as well.
y_train_log = log_transform_target(y_train)
y_test_log = log_transform_target(y_test)

## Preparation for logging with TensorBoard

In [None]:
# define parameters
# N_TRAIN: number of rows in the training split (before any validation split)
N_TRAIN = len(X_train)
# BATCH_SIZE: samples per gradient update
BATCH_SIZE = 32
# STEPS_PER_EPOCH: full batches in the whole training split.
# NOTE(review): model.fit below uses validation_split=0.2, which this
# number does not account for.
STEPS_PER_EPOCH = N_TRAIN // BATCH_SIZE
# EPOCHS: number of passes over the training data
EPOCHS = 100

In [None]:
# configure checkpoints for saving model in between calculation
checkpoint_path = "training__tanzania/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Checkpoint callback that stores weights only (not the full model).
# NOTE(review): cp_callback is not part of the list returned by
# get_callbacks(), so as written no checkpoints are produced during
# training - confirm whether it should be added there.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True,verbose=0)

# Parent directory for all TensorBoard run logs (relative to the working directory)
root_logdir = os.path.join(os.curdir, "my_logs_tanzania")


In [None]:
# define directory for logging
def get_run_logdir():
    """Return a log directory under root_logdir named after the current local time.

    The directory name has the form ``run_DD_MM_YYYY-HH_MM_SS``.
    """
    timestamp = time.strftime('%d_%m_%Y-%H_%M_%S')
    return os.path.join(root_logdir, 'run_' + timestamp)


# Evaluated once per execution of this cell: every model trained afterwards
# logs under the same timestamped run directory.
run_logdir = get_run_logdir()


In [None]:
# define callbacks for logging to use in Tensorboard
def get_callbacks(name):
    """Return the list of Keras callbacks used for one training run.

    Args:
        name: model name; used as the sub-directory of ``run_logdir`` that
            receives this model's TensorBoard logs.

    Returns:
        A list with an EpochDots progress printer, an EarlyStopping callback
        monitoring ``val_loss`` and a TensorBoard logging callback.
    """
    # Join with a path separator so each model gets its own sub-directory
    # (plain string concatenation would fuse the name onto the timestamp).
    log_dir = os.path.join(run_logdir, name)
    return [
        tfdocs.modeling.EpochDots(),    # to reduce logging noise
        # NOTE(review): patience=200 exceeds the EPOCHS = 100 used in this
        # file, so early stopping cannot trigger with that setting.
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=200),
        tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1),   # to produce logs for using Tensorboard
    ]


# Train TensorFlow Model

In [None]:
# set model name for logging
# your_history collects the History object of every trained model, keyed by model name
your_history = {}
# NOTE(review): model_name is not referenced by the training calls visible
# in this notebook, which pass literal names instead - confirm it is still needed.
model_name = 'first'

In [None]:
def model_compile_and_fit(model, steps_per_epoch, epochs, batch_size, model_name):
    """Compile ``model`` (Adam, MAE loss, MSE metric) and fit it on the training data.

    Reads the notebook-level ``X_train_preprocessed`` and ``y_train_log`` and
    the callbacks from ``get_callbacks(model_name)``.

    Args:
        model: an uncompiled Keras model.
        steps_per_epoch: batches per epoch; used only to scale the learning
            rate schedule (see below).
        epochs: number of training epochs.
        batch_size: samples per gradient update.
        model_name: name passed to ``get_callbacks`` for logging.

    Returns:
        The History object returned by ``model.fit``.
    """
    # Learning rate schedule: starts at 0.01 and decays hyperbolically;
    # decay_steps is expressed as 1000 epochs' worth of steps.
    lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        0.01,
        decay_steps=steps_per_epoch*1000,
        decay_rate=1,
        staircase=False)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, name='Adam')

    with tf.device('/cpu:0'):		# optional, only for mac!!
        model.compile(
            optimizer=optimizer,
            loss='mae',
            metrics=['mse'])    # Keras documents `metrics` as a list

        # Fit with preprocessed features and log-transformed target variable.
        # steps_per_epoch is deliberately NOT forwarded: the inputs are
        # in-memory arrays and validation_split=0.2 removes a fifth of them,
        # so a step count derived from the full training set asks for more
        # batches per epoch than exist and Keras stops training early
        # ("input ran out of data"). With batch_size alone Keras infers the
        # correct number of steps.
        results = model.fit(X_train_preprocessed,
                            y_train_log,
                            validation_split=0.2,
                            verbose=0,
                            epochs=epochs,
                            batch_size=batch_size,
                            callbacks=get_callbacks(model_name)
                            )

    return results

## Build the model

In [None]:
# define normalizer
# The layer also declares the model input shape: one entry per preprocessed feature column.
# NOTE(review): axis=None presumably yields a single scalar mean/variance over
# all features rather than per-feature statistics - confirm this is intended,
# given the sklearn pipeline has already standard-scaled the numeric columns.
normalizer = preprocessing.Normalization(name='norm', input_shape=[(X_train_preprocessed).shape[1]],axis = None)
# Learn the normalization statistics from the preprocessed training data only
normalizer.adapt(np.array(X_train_preprocessed))
# Last expression of the cell: shows the adapted mean as cell output
normalizer.mean.numpy()

In [None]:
# instantiate model: normalizer -> 64-unit ReLU hidden layer -> single linear output
# The input shape is declared by the normalizer (first layer). The Dense layers
# take no input_shape: Sequential only reads it from the first layer, and a
# [None, n_features] value would describe a 3-D input that never occurs.
model_tanzania = tf.keras.Sequential([
    normalizer,
    layers.Dense(name='layer1', units=64, activation='relu'),
    layers.Dense(name='layer2', units=1)
    ])

## Train the model

In [None]:
# Train the baseline model and keep its History under the key 'first_model'
your_history['first_model'] = model_compile_and_fit(model_tanzania, STEPS_PER_EPOCH, EPOCHS, BATCH_SIZE, 'first_model')

## Evaluate the model training

In [None]:
# Plot the 'mse' metric of every run stored in your_history
history_plotter = tfdocs.plots.HistoryPlotter(metric = 'mse', smoothing_std=10)
history_plotter.plot(your_history)

In [None]:
%load_ext tensorboard

## Load TensorBoard
(If TensorBoard does not appear after running the next cell, run that cell a second time.)

In [None]:
%tensorboard --logdir=./my_logs_tanzania

## Further model tuning

In [None]:
# Add L2 weight regularization and a dropout layer
# instantiate model: normalizer -> regularized 64-unit ReLU layer -> dropout -> linear output
# The input shape is declared by the normalizer (first layer); Sequential does
# not read input_shape from later layers, so none is passed to Dense.
model_tanzania = tf.keras.Sequential([
    normalizer,
    layers.Dense(name='layer1', units=64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.25),
    layers.Dense(name='layer2', units=1)
    ])

# Train and keep the History under 'dropout_reg_model' for comparison plots
your_history['dropout_reg_model'] = model_compile_and_fit(model_tanzania, STEPS_PER_EPOCH, EPOCHS, BATCH_SIZE, 'dropout_reg_model')


In [None]:
# Re-plot 'mse' for all runs collected so far in your_history
history_plotter = tfdocs.plots.HistoryPlotter(metric = 'mse', smoothing_std=10)
history_plotter.plot(your_history)

In [None]:
# Add more dense layers
# instantiate model: normalizer -> two regularized 64-unit ReLU layers,
# each followed by dropout -> linear output
# The input shape is declared by the normalizer (first layer); Sequential does
# not read input_shape from later layers, so none is passed to the Dense layers.
model_tanzania = tf.keras.Sequential([
    normalizer,
    layers.Dense(name='layer1', units=64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.25),
    layers.Dense(name='layer2', units=64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    layers.Dropout(0.25),
    layers.Dense(name='layer3', units=1)
    ])

# Train and keep the History under 'more_layers_model' for comparison plots
your_history['more_layers_model'] = model_compile_and_fit(model_tanzania, STEPS_PER_EPOCH, EPOCHS, BATCH_SIZE, 'more_layers_model')

In [None]:
# Re-plot 'mse' for all runs collected so far in your_history
history_plotter = tfdocs.plots.HistoryPlotter(metric = 'mse', smoothing_std=10)
history_plotter.plot(your_history)