# Bank Marketing

### Install Requirements

In [0]:
! pip install memory_profiler

Collecting memory_profiler
[?25l  Downloading https://files.pythonhosted.org/packages/9f/fe/1fca7273dd111108f204a686b12a12b6422d405fe4614087aa7d5a66ea87/memory_profiler-0.55.0.tar.gz (40kB)
[K    100% |████████████████████████████████| 40kB 3.0MB/s 
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/f0/ff/63/fdbff3f1e1b76ad4eae491dd5b190902906b093e93eb86dd5a
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.55.0


In [0]:
# Load the memory_profiler IPython extension; the %memit magic used further down relies on it.
%load_ext memory_profiler

### Import Libraries

In [0]:
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import os
from tensorflow.python.data import Dataset
from tensorflow.estimator.inputs import numpy_input_fn, pandas_input_fn
from tensorflow.feature_column import numeric_column, categorical_column_with_vocabulary_list, categorical_column_with_identity
from sklearn.preprocessing import MinMaxScaler
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources

# Only surface TensorFlow log records at WARN level or above.
tf.logging.set_verbosity(tf.logging.WARN)

# Compact pandas rendering: row limit of 10, floats formatted with three decimals.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.3f}'.format)

# Pretty display of variables: show every top-level expression of a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Necessary Functions

In [None]:
def pandas_fn(X, Y, features=None, batch_size=1, shuffle=True, num_epochs=None):
    """Build a model input function from pandas objects.

    Thin wrapper around ``pandas_input_fn`` that optionally restricts ``X``
    to a subset of its columns before handing it over. Intended to be called
    after the data has been split into training and test frames.

    Args:
        X: Pandas DataFrame containing the feature data.
        Y: Pandas Series containing the labels.
        features: A list with the names of the columns to keep. With `None`
            (the default) ``X`` is passed through unchanged.
        batch_size: Forwarded to ``pandas_input_fn``. Default value `1`.
        shuffle: Forwarded to ``pandas_input_fn``. Default value `True`.
        num_epochs: Forwarded to ``pandas_input_fn``. Default value `None`.

    Returns:
        Whatever ``pandas_input_fn`` returns for the selected data.
    """
    frame = X if features is None else X[features]
    return pandas_input_fn(frame, Y, batch_size=batch_size, shuffle=shuffle, num_epochs=num_epochs)

In [None]:
def numpy_fn(X, Y, features=None, batch_size=1, shuffle=True, num_epochs=None):
    """Build a model input function from NumPy arrays derived from pandas data.

    Every (selected) column of ``X`` is converted to a NumPy array keyed by
    its column name; the labels are passed as ``Y.values``.

    Args:
        X: Pandas DataFrame containing the feature data.
        Y: Pandas Series or DataFrame containing the labels.
        features: A list with the names of the columns to keep. With `None`
            (the default) all columns of ``X`` are used.
        batch_size: Forwarded to ``numpy_input_fn``. Default value `1`.
        shuffle: Forwarded to ``numpy_input_fn``. Default value `True`.
        num_epochs: Forwarded to ``numpy_input_fn``. Default value `None`.

    Returns:
        Whatever ``numpy_input_fn`` returns for the converted data.
    """
    frame = X if features is None else X[features]

    # One array per column, keyed by column name.
    arrays = {}
    for name, column in dict(frame).items():
        arrays[name] = np.array(column)

    return numpy_input_fn(x=arrays, y=Y.values, batch_size=batch_size,
                          shuffle=shuffle, num_epochs=num_epochs)

In [None]:
def compute_predictions(model, predict_fn, key=None):
    """Collect one scalar prediction per example from a model.

    Iterates over ``model.predict(input_fn=predict_fn)``, which is expected to
    yield one dict per example, and keeps the first element of the selected
    entry of each dict.

    Args:
        model: The model the data was trained on (tf.estimator object, or any
            object exposing ``predict(input_fn=...)``).
        predict_fn: Input function forwarded to ``model.predict``.
        key: Name of the dict entry to read. With `None` (the default) the
            entry is chosen from the first yielded dict, trying
            'predictions', then 'logistic', then 'class_ids'. This keeps the
            previous behaviour for models that emit 'predictions' and also
            works for classifier outputs, which carry 'logistic' /
            'class_ids' instead (``metric_auc`` reads 'logistic').

    Returns:
        np.array containing the prediction results (empty if the model yields
        nothing).

    Raises:
        KeyError: If ``key`` is `None` and none of the fallback entries is
            present, or if the requested ``key`` is missing.
    """
    fallback_keys = ('predictions', 'logistic', 'class_ids')
    values = []
    for item in model.predict(input_fn=predict_fn):
        if key is None:
            # Resolve the entry once, from the first example.
            key = next((name for name in fallback_keys if name in item), None)
            if key is None:
                raise KeyError(
                    "none of %r found in prediction output; available keys: %r"
                    % (fallback_keys, sorted(item)))
        values.append(item[key][0])
    return np.array(values)

In [None]:
def loss2(predictions, y):
    """Mean squared error between predictions and targets.

    Args:
        predictions: Predicted values.
        y: Actual values, passed to TensorFlow as given.

    Returns:
        A Tensor object holding the mean of the element-wise squared
        differences.
    """
    squared_errors = tf.squared_difference(predictions, y)
    return tf.reduce_mean(squared_errors)

In [None]:
def loss(predictions, y):
    """Mean squared error between predictions and a pandas-style target.

    Unlike ``loss2``, the targets are unwrapped through ``y.values`` before
    being handed to TensorFlow.

    Args:
        predictions: Predicted values.
        y: Actual values; must expose a ``.values`` attribute.

    Returns:
        A Tensor object holding the mean of the element-wise squared
        differences.
    """
    targets = y.values
    squared_errors = tf.squared_difference(predictions, targets)
    return tf.reduce_mean(squared_errors)

In [None]:
def numeric_to_categorical_features(feature_columns):
    """Construct TensorFlow identity categorical columns.

    Args:
        feature_columns: A dictionary mapping each category (column) name to
            its bucket size.

    Returns:
        A set with one ``categorical_column_with_identity`` per dictionary
        entry.
    """
    return {
        categorical_column_with_identity(key=name, num_buckets=buckets)
        for name, buckets in feature_columns.items()
    }

In [None]:
def categorical_features(feature_columns):
    """Construct TensorFlow vocabulary-list categorical columns.

    Args:
        feature_columns: A dictionary mapping each category (column) name to
            the list of values it can take.

    Returns:
        A set with one ``categorical_column_with_vocabulary_list`` per
        dictionary entry.
    """
    return {
        categorical_column_with_vocabulary_list(key=name, vocabulary_list=vocabulary)
        for name, vocabulary in feature_columns.items()
    }

In [None]:
def numeric_features(feature_columns):
    """Construct TensorFlow numeric feature columns.

    Args:
        feature_columns: Iterable with the names of the numerical input
            features to use.

    Returns:
        A set with one ``numeric_column`` per name.
    """
    return {numeric_column(name) for name in feature_columns}

In [None]:
def metric_auc(labels, predictions):
    """Area under the precision-recall curve as a named metric.

    Args:
        labels: Ground-truth labels, forwarded to ``tf.metrics.auc``.
        predictions: Mapping of model outputs; only the 'logistic' entry is
            read.

    Returns:
        A dict with the single key 'auc_precision_recall' holding the result
        of ``tf.metrics.auc`` (PR curve, 200 thresholds, careful
        interpolation).
    """
    pr_auc = tf.metrics.auc(
        labels=labels,
        predictions=predictions['logistic'],
        num_thresholds=200,
        curve='PR',
        summation_method='careful_interpolation'
    )
    return {'auc_precision_recall': pr_auc}

In [None]:
def process_bank_data(ratio=0.8):
    """ Downloads the banking dataset, preprocess the data and splits it.
    Args:
        ratio: Split ratio. Default is 0.8
    
    Returns:
        train, test and sample dataset
    
    """
    
    user = "fazilbtopal"
    key = "a01ead977f55d872c4deeadb0f173aa1"

    if '.kaggle' not in os.listdir('/root'):
        !mkdir ~/.kaggle
    !touch /root/.kaggle/kaggle.json
    !chmod 666 /root/.kaggle/kaggle.json
    with open('/root/.kaggle/kaggle.json', 'w') as f:
        f.write('{"username":"%s","key":"%s"}' % (user, key))
    !chmod 600 /root/.kaggle/kaggle.json

    print('Downloading data from web..')
    ! kaggle datasets download -d sonujha090/bank-marketing
    ! unzip -qq /content/bank-marketing.zip
    
    print('Reading the dataset..')
    # Load the data set.
    bank_df = pd.read_csv("/content/bank-full.csv")
    bank_sample_df = pd.read_csv("/content/bank.csv")
    
    print('Converting categories to binary values..')
    bank_df[bank_df.y, "yes"] = 1
    bank_df[bank_df.y, "no"] = 0
    bank_sample_df[bank_df.y, "yes"] = 1
    bank_sample_df[bank_df.y, "no"] = 0

    # Drop NA values
    bank_df.dropna(inplace=True)
    bank_sample_df.dropna(inplace=True)
    
    print('Shuffling the data..')
    # Shuffle
    bank_df = bank_df.reindex(np.random.permutation(bank_df.index))
    bank_sample_df = bank_sample_df.reindex(np.random.permutation(bank_sample_df.index))

    print('Splitting train & test frames..')
    # Split
    split_size_head = round(ratio*bank_df.shape[0])
    split_size_tail = bank_df.shape[0] - split_size_head
    train = bank_df.head(split_size_head)
    test = bank_df.tail(split_size_tail)

    return train, test, sample

In [None]:
# Unpack the three frames returned by process_bank_data (train split, test split, sample dataset).
train, test, sample = process_bank_data()

### Model Functions

#### Estimator Functions

In [0]:
def linear_classifier(labels, funcs, feature_columns, nclasses, learning_rate, steps, quiet=False):
    """ Linear classification (logistic regression) using the high level Estimator API

    Trains a ``tf.estimator.LinearClassifier`` with an Adam optimizer,
    evaluates it, and computes a squared-error loss between the model's
    predictions and the test labels.

    Args:
        labels: A dictionary containing labels as pandas Series. Only the
                'test' entry is read.
                {train: y_train, test: y_test}
        funcs: A dictionary containing the training/test/predict functions to hook up with the model.
                {train: train_fn, test: test_fn, predict: predict_fn}
        feature_columns: Feature columns handed to the estimator
        nclasses: Number of classes in the output column
        learning_rate: Learning rate of the model as float
        steps: Passed to ``model.train`` as ``max_steps``, as integer
        quiet: When truthy, nothing is printed.

    Returns:
        None. Results are only printed (unless ``quiet`` is set).
    """
    Y_test = labels['test']
    train_fn, test_fn, predict_fn = funcs['train'], funcs['test'], funcs['predict']

    # Model parameters
    model = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                          n_classes=nclasses,
                                          optimizer=tf.train.AdamOptimizer(learning_rate))

    # Train the model, starting from the prior state.
    model.train(input_fn=train_fn, max_steps=steps)

    # Evaluate the model on whatever data the 'test' input function feeds.
    eval_result = model.evaluate(input_fn=test_fn)

    if not quiet:
        print("Printing Eval Results....\n", eval_result)

    # Average loss reported by the estimator's evaluation.
    eval_loss = eval_result["average_loss"]

    # TODO FROM HERE
    # Compute predictions.
    # NOTE(review): verify that compute_predictions selects an entry that
    # exists in this classifier's prediction dicts (metric_auc reads 'logistic').
    predictions = compute_predictions(model, predict_fn)

    # Compute loss between the predictions and the test labels.
    test_loss = loss(predictions, Y_test)

    # Tensor.eval() needs a default session; the context manager installs one.
    with tf.Session():
        test_mse = test_loss.eval()

    if not quiet:
        # Print the current loss (square roots of the two values above).
        # NOTE(review): the first line reports the evaluation on funcs['test'];
        # it is labelled "Train" - confirm this matches how callers wire `funcs`.
        print("  Loss for Train: %0.6f" % (eval_loss**0.5))
        print("  Loss for Test: %0.6f" % (test_mse**0.5))

### Logistic Regression High Level API

**Model Parameters for Logistic Model**

In [0]:
# Model Parameters
# learning_rate and steps are passed to linear_classifier below;
# batch_size is used when building the training input function.
learning_rate = 0.02
steps = 200
batch_size = 100

**Logistic Regression**

In [0]:
feature_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 
                   'month', 'poutcome', 'age', 'balance', 'duration']
label = "y"

# Input functions
train_input = pandas_fn(train, train[label], features=feature_columns, batch_size=batch_size)
# NOTE(review): `test_input` is built from the training frame, so the evaluation
# inside linear_classifier (printed there as "Loss for Train") scores training
# data; the held-out frame is only used through `predict_input`. Confirm intended.
test_input = pandas_fn(train, train[label], features=feature_columns, batch_size=1, shuffle=False, num_epochs=1)
predict_input = pandas_fn(test, test[label], features=feature_columns, batch_size=1, shuffle=False, num_epochs=1)

fns = {'train': train_input, 'test': test_input, 'predict': predict_input}
fn_labels = {'train': train[label], 'test': test[label]}

categories = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
numerics = ['age', 'balance', 'duration']
# Vocabulary per categorical column, taken from the sample dataset. The frame is
# available at notebook scope as `sample`; `bank_sample_df` only exists inside
# process_bank_data and raised a NameError here.
cats = {c:list(sample[c].unique()) for c in categories}
# TensorFlow feature columns: this set (not the list of names above) is what
# the estimator needs as its feature columns.
features = categorical_features(cats).union(numeric_features(numerics))

In [0]:
# Pass the TensorFlow feature columns (`features`), not the list of column names.
linear_classifier(fn_labels, fns, features, nclasses=2, learning_rate=learning_rate, steps=steps)

Printing Eval Results....
 {'accuracy': 0.7853351, 'accuracy_baseline': 0.88343287, 'auc': 0.88751864, 'auc_precision_recall': 0.52794605, 'average_loss': 0.7527573, 'label/mean': 0.116567135, 'loss': 0.7527573, 'precision': 0.3337083, 'prediction/mean': 0.3260792, 'recall': 0.84440225, 'global_step': 200}
  Loss for Train: 0.867616


In [0]:
%memit linear_classifier(fn_labels, fns, feature_columns, nclasses=2, learning_rate=learning_rate, steps=steps, quiet=True)

Printing Eval Results....
 {'accuracy': 0.8514709, 'accuracy_baseline': 0.88343287, 'auc': 0.8205705, 'auc_precision_recall': 0.3418096, 'average_loss': 0.5692881, 'label/mean': 0.116567135, 'loss': 0.5692881, 'precision': 0.3790795, 'prediction/mean': 0.15986457, 'recall': 0.42979127, 'global_step': 200}
  Loss for Train: 0.754512
peak memory: 1070.65 MiB, increment: 0.09 MiB


In [0]:
%time linear_classifier(fn_labels, fns, feature_columns, nclasses=2, learning_rate=learning_rate, steps=steps, quiet=True)

Printing Eval Results....
 {'accuracy': 0.87900907, 'accuracy_baseline': 0.88343287, 'auc': 0.5886312, 'auc_precision_recall': 0.21392412, 'average_loss': 2.270195, 'label/mean': 0.116567135, 'loss': 2.270195, 'precision': 0.33870968, 'prediction/mean': 0.019721337, 'recall': 0.039848197, 'global_step': 200}
  Loss for Train: 1.506717
CPU times: user 4min 23s, sys: 50.2 s, total: 5min 13s
Wall time: 2min 54s


In [0]:
%prun linear_classifier(fn_labels, fns, feature_columns, nclasses=2, learning_rate=learning_rate, steps=steps, quiet=True)

Printing Eval Results....
 {'accuracy': 0.8709356, 'accuracy_baseline': 0.88343287, 'auc': 0.88876885, 'auc_precision_recall': 0.5291479, 'average_loss': 0.40094, 'label/mean': 0.116567135, 'loss': 0.40094, 'precision': 0.46169493, 'prediction/mean': 0.19255924, 'recall': 0.64611006, 'global_step': 200}
  Loss for Train: 0.633198
 

**Logistic Regression Results**

**Accuracy**: 0.8709356  
**AUC**: 0.88876885  
**AUC Precision Recall**: 0.5291479  
**Average Loss**: 0.40094  
**Loss**: 0.40094  
**Precision**: 0.46169493  
**Recall**: 0.64611006  


**%time**    
CPU times: user 4min 23s, sys: 50.2 s, total: 5min 13s  
Wall time: 2min 54s

**%prun**   
9798300 function calls (9429810 primitive calls) in 174.565 seconds

**%memit**   
peak memory: 1070.65 MiB, increment: 0.09 MiB


**Model Parameters**  
learning_rate = 0.02  
steps = 200  
batch_size = 100