# Model Predictions from ML Engine

In this notebook we use our deployed TensorFlow model for prediction and compare the results with a classical Logistic Regression model.

### Read data from Cloud Storage

In [1]:
# Get the project id
import os
project_id = os.environ['VM_PROJECT']

In [2]:
# Read the train data from Cloud Storage
%gcs read -o gs://$project_id-mlengine/data/bank_data_train.csv -v data_train_file

In [3]:
# Read the evaluation data from Cloud Storage
%gcs read -o gs://$project_id-mlengine/data/bank_data_eval.csv -v data_eval_file

In [4]:
# Read the test data from Cloud Storage
%gcs read -o gs://$project_id-mlengine/data/bank_data_test.csv -v data_test_file

In [5]:
# Import necessary libraries for handling the data
import pandas as pd
import numpy as np
from io import BytesIO

In [6]:
# Read the data into pandas DataFrames
columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
           'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
           'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx',
           'euribor3m', 'nr_employed', 'y']
data_train = pd.read_csv(BytesIO(data_train_file), names=columns)
data_eval = pd.read_csv(BytesIO(data_eval_file), names=columns)
data_test = pd.read_csv(BytesIO(data_test_file), names=columns)

### Fit a Logistic Regression model

In [7]:
# View the data to select columns for logistic regression
data_train.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,professional.course,no,yes,yes,cellular,may,tue,-0.204059,0.197445,1.672169,failure,-1.199962,-1.176024,-1.230753,-1.361165,-0.948933,no
1,35,admin.,single,high.school,no,yes,no,cellular,nov,tue,-0.571394,0.197445,-0.349378,nonexistent,-0.116607,-0.645203,-0.321833,0.305574,0.395964,no
2,54,services,married,high.school,no,no,no,cellular,aug,tue,0.53061,0.197445,-0.349378,nonexistent,0.839294,-0.223312,0.954984,0.774939,0.84519,no
3,60,technician,divorced,professional.course,unknown,no,no,telephone,jun,wed,-0.571394,0.197445,-0.349378,nonexistent,0.839294,1.54206,-0.278551,0.770898,0.84519,no
4,42,blue-collar,married,basic.6y,unknown,no,no,telephone,may,thu,-0.571394,0.197445,-0.349378,nonexistent,0.648114,0.727672,0.890061,0.713743,0.329206,no


In [8]:
# For convenience, select the normalized numeric columns
selected_columns = ['campaign', 'pdays', 'previous', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed']

In [9]:
# Combine training and evaluation data for cross-validation
data_train_eval = pd.concat([data_train, data_eval])

X_train = data_train_eval[selected_columns]
X_test = data_test[selected_columns]

y_train = data_train_eval['y'].map({'no': 0, 'yes': 1})
y_test = data_test['y'].map({'no': 0, 'yes': 1})

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# List parameter ranges to be explored
parameters = {'C': [0.1, 0.5, 1.0, 5.0, 10.0], 'class_weight': [None, 'balanced']}
lr = LogisticRegression()

# Perform grid search over parameter ranges using stratified K-fold
clf = GridSearchCV(lr, parameters, scoring='f1', cv=StratifiedKFold(5, shuffle=True))
clf.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'class_weight': [None, 'balanced'], 'C': [0.1, 0.5, 1.0, 5.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [11]:
print('Highest f1-score:', np.round(clf.best_score_,2))

Highest f1-score: 0.38


In [12]:
# Choose the model with best parameters
lr = clf.best_estimator_

In [13]:
from sklearn.metrics import confusion_matrix, auc, precision_recall_curve

In [14]:
y_pred = lr.predict(X_test)
y_pred_proba = lr.predict_proba(X_test)

In [15]:
# Calculate the area under precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba[:,1])
auc(recall, precision)

0.41233296836283534

In [16]:
# Form the confusion matrix
confusion_matrix(y_test, y_pred)

array([[280,  86],
       [ 14,  32]])

### ML Engine

In [17]:
# Import the API client
from googleapiclient import discovery

ml = discovery.build('ml','v1')

In [18]:
data_test.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,35,technician,single,professional.course,no,yes,no,cellular,may,thu,-0.571394,0.197445,-0.349378,nonexistent,-1.199962,-1.176024,-1.230753,-1.325948,-0.948933,no
1,36,admin.,married,university.degree,no,yes,yes,telephone,may,thu,0.163276,0.197445,-0.349378,nonexistent,0.648114,0.727672,0.890061,0.713743,0.329206,no
2,42,blue-collar,married,basic.9y,unknown,no,no,telephone,may,mon,0.163276,0.197445,-0.349378,nonexistent,0.648114,0.727672,0.890061,0.712011,0.329206,no
3,26,self-employed,single,university.degree,no,no,no,cellular,may,fri,0.163276,0.197445,-0.349378,nonexistent,-1.199962,-1.176024,-1.230753,-1.370402,-0.948933,no
4,38,admin.,single,basic.9y,unknown,yes,no,telephone,jun,wed,0.897945,0.197445,-0.349378,nonexistent,0.839294,1.54206,-0.278551,0.77263,0.84519,no


In [19]:
# Data is submitted for prediction as a dictionary
data_test.drop(columns='y').loc[0:1].to_dict(orient='records')

[{'age': 35,
  'campaign': -0.571393844537291,
  'cons_conf_idx': -1.2307531731913108,
  'cons_price_idx': -1.176024467957052,
  'contact': 'cellular',
  'day_of_week': 'thu',
  'default': 'no',
  'education': 'professional.course',
  'emp_var_rate': -1.1999622087139987,
  'euribor3m': -1.3259478549686754,
  'housing': 'yes',
  'job': 'technician',
  'loan': 'no',
  'marital': 'single',
  'month': 'may',
  'nr_employed': -0.9489332739923321,
  'pdays': 0.19744502733362396,
  'poutcome': 'nonexistent',
  'previous': -0.34937785043730324},
 {'age': 36,
  'campaign': 0.16327560768586266,
  'cons_conf_idx': 0.8900613289070047,
  'cons_price_idx': 0.72767221802066,
  'contact': 'telephone',
  'day_of_week': 'thu',
  'default': 'no',
  'education': 'university.degree',
  'emp_var_rate': 0.6481138647658136,
  'euribor3m': 0.7137429064053495,
  'housing': 'yes',
  'job': 'admin.',
  'loan': 'yes',
  'marital': 'married',
  'month': 'may',
  'nr_employed': 0.3292059916394463,
  'pdays': 0.19744

In [20]:
# Form the API request
instances = data_test.drop(columns='y').to_dict(orient='records')
request = {'instances': instances}

In [21]:
# Call the API for predictions
response = ml.projects().predict(name='projects/mlengine-example/models/bank_marketing', body=request).execute()

In [22]:
# Inspect the response
len(response['predictions'])

412

In [23]:
# Inspect the response further
response['predictions'][:2]

[{'class_ids': [0],
  'classes': ['0'],
  'logistic': [0.05924775451421738],
  'logits': [-2.7649519443511963],
  'probabilities': [0.9407522678375244, 0.05924775451421738]},
 {'class_ids': [0],
  'classes': ['0'],
  'logistic': [0.030696040019392967],
  'logits': [-3.452444553375244],
  'probabilities': [0.9693039059638977, 0.030696038156747818]}]

In [24]:
# Extract the prediction probabilities
pred_proba = np.array([response['predictions'][i]['probabilities'] for i in range(len(response['predictions']))])

In [25]:
# Calculate the area under precision-recall curve
precision_test, recall_test, _ = precision_recall_curve(y_test, pred_proba[:,1])
auc(recall_test, precision_test)

0.5042307942702203

In [26]:
# Form the confusion matrix
pred = np.array([response['predictions'][i]['class_ids'] for i in range(len(response['predictions']))])
confusion_matrix(y_test, pred)

array([[354,  12],
       [ 29,  17]])