In [1]:
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
import json

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [2]:
clf = joblib.load("./model_data/boston_housing_prediction.joblib")

In [3]:
def scale(payload):
    """Scales Payload"""
    scaler = StandardScaler().fit(payload.astype(float))
    scaled_adhoc_predict = scaler.transform(payload.astype(float))
    return scaled_adhoc_predict

In [4]:
def test_prediction(chas,rm,tax,ptratio,b,lsat):
    x = {"CHAS": {"0": chas}, "RM": {"0": rm}, "TAX": {"0": tax},
    "PTRATIO": {"0": ptratio}, "B": {"0": b}, "LSTAT": {"0": lsat}}
    json_str = json.dumps(x)
    json_payload = json.loads(json_str)
    print(f'json_payload:\n{json_payload}\n')
    inference_payload = pd.DataFrame(json_payload)
    print(f'inference_payload:\n{inference_payload}\n')
    prediction_inference = list(clf.predict(inference_payload))
    print(f'inference_payload prediction:\n{prediction_inference}\n')
    # scale the input
    scaled_payload = scale(inference_payload)
    print(f'scaled_payload:\n{scaled_payload}\n')
    # get an output prediction from the pretrained model, clf
    prediction_scaled = list(clf.predict(scaled_payload))
    print(f'scaled_payload prediction:\n{prediction_scaled}\n')

In [5]:
chas = 0
rm = 6.575
tax = 296.0
ptratio = 15.3
b =396.9
lsat =4.98

test_prediction(chas,rm,tax,ptratio,b,lsat)

json_payload:
{'CHAS': {'0': 0}, 'RM': {'0': 6.575}, 'TAX': {'0': 296.0}, 'PTRATIO': {'0': 15.3}, 'B': {'0': 396.9}, 'LSTAT': {'0': 4.98}}

inference_payload:
   CHAS     RM    TAX  PTRATIO      B  LSTAT
0     0  6.575  296.0     15.3  396.9   4.98

inference_payload prediction:
[13.075330728476153]

scaled_payload:
[[0. 0. 0. 0. 0. 0.]]

scaled_payload prediction:
[20.35373177134412]



In [6]:
chas = 0
rm = 3
tax = 100
ptratio = 1.3
b =36.9
lsat =6.98

test_prediction(chas,rm,tax,ptratio,b,lsat)

json_payload:
{'CHAS': {'0': 0}, 'RM': {'0': 3}, 'TAX': {'0': 100}, 'PTRATIO': {'0': 1.3}, 'B': {'0': 36.9}, 'LSTAT': {'0': 6.98}}

inference_payload:
   CHAS  RM  TAX  PTRATIO     B  LSTAT
0     0   3  100      1.3  36.9   6.98

inference_payload prediction:
[12.49283674687483]

scaled_payload:
[[0. 0. 0. 0. 0. 0.]]

scaled_payload prediction:
[20.35373177134412]



In [7]:
chas = 1
rm = 2
tax = 500
ptratio = 10.3
b =20.9
lsat =16.98

test_prediction(chas,rm,tax,ptratio,b,lsat)

json_payload:
{'CHAS': {'0': 1}, 'RM': {'0': 2}, 'TAX': {'0': 500}, 'PTRATIO': {'0': 10.3}, 'B': {'0': 20.9}, 'LSTAT': {'0': 16.98}}

inference_payload:
   CHAS  RM  TAX  PTRATIO     B  LSTAT
0     1   2  500     10.3  20.9  16.98

inference_payload prediction:
[10.42161872892515]

scaled_payload:
[[0. 0. 0. 0. 0. 0.]]

scaled_payload prediction:
[20.35373177134412]



# Questions/Observations

**Given different inputs why does prediction using scaled_playload give same output whereas that with inference_payload gives different outputs?**

Given different inputs:
- inference_payload is different therefore a different prediction.
- scaled_playload seems to be the same therefore the same prediction.

To further investigate this start by looking and going deeper into:
- i) the classifier (clf) from "./model_data/boston_housing_prediction.joblib" and 
- ii) scale() function

you could consider:
- a) training your own model then presisting it as well as 
- b) other methods of preprocessing input data.
- c) different features (not only for predictive power but also considering AI ethics)

While this is an interesting point for further investigation, it's  beyond the scope of this assignment.

References:
- [9. Model persistence](https://scikit-learn.org/stable/model_persistence.html)
- [Compare the effect of different scalers on data with outliers](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py)
- [Developing scikit-learn estimators](https://scikit-learn.org/stable/developers/develop.html)
- [Ethics of artificial intelligence](https://en.wikipedia.org/wiki/Ethics_of_artificial_intelligence)
- [github issue: B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town #16155](https://github.com/scikit-learn/scikit-learn/issues/16155)
- [github issue: model.predict() gives same output for all inputs #6447](https://github.com/keras-team/keras/issues/6447)
- [joblib.load](https://joblib.readthedocs.io/en/latest/generated/joblib.load.html#joblib.load)
- [sklearn.ensemble.GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
- [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

In [8]:
dir(clf)

['_SUPPORTED_LOSS',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_initialized',
 '_check_params',
 '_clear_state',
 '_decision_function',
 '_estimator_type',
 '_fit_stage',
 '_fit_stages',
 '_get_param_names',
 '_init_decision_function',
 '_init_state',
 '_is_initialized',
 '_make_estimator',
 '_resize_state',
 '_rng',
 '_staged_decision_function',
 '_validate_estimator',
 '_validate_y',
 'alpha',
 'apply',
 'criterion',
 'estimators_',
 'feature_importances_',
 'fit',
 'get_params',
 'init',
 'init_',
 'learning_rate',
 'loss',
 'loss_',
 'max_dep

In [9]:
clf.feature_importances_

array([0.00475996, 0.46051833, 0.06129686, 0.02882264, 0.02417202,
       0.42043019])

In [10]:
# https://scikit-learn.org/stable/developers/develop.html
# All scikit-learn estimators have get_params and set_params functions. 
# The get_params function takes no arguments and returns 
# a dict of the __init__ parameters of the estimator, together with their values.
clf.get_params
# clf.score

<bound method BaseEstimator.get_params of GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=400, n_iter_no_change=None, presort='auto',
             random_state=7, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)>

In [11]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
clf.__dict__

{'n_estimators': 400,
 'learning_rate': 0.1,
 'loss': 'ls',
 'criterion': 'friedman_mse',
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'min_weight_fraction_leaf': 0.0,
 'subsample': 1.0,
 'max_features': None,
 'max_depth': 3,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'init': None,
 'random_state': 7,
 'alpha': 0.9,
 'verbose': 0,
 'max_leaf_nodes': None,
 'warm_start': False,
 'presort': 'auto',
 'validation_fraction': 0.1,
 'n_iter_no_change': None,
 'tol': 0.0001,
 'n_features_': 6,
 'n_classes_': 1,
 'loss_': <sklearn.ensemble.gradient_boosting.LeastSquaresError at 0x161c20e0d68>,
 'max_features_': 6,
 'init_': <sklearn.ensemble.gradient_boosting.MeanEstimator at 0x161c20e0da0>,
 'estimators_': array([[DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fractio