In [None]:
import pandas as pd
from sklearn.datasets import load_wine

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

from sklearn.ensemble import RandomForestClassifier

import pickle

In [None]:
# Load the wine dataset and wrap its feature matrix in a DataFrame whose
# columns carry the dataset's feature names.
data = load_wine()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
# Target labels, kept separately from the features.
y = data['target']
df.head()

In [None]:
"""
First, we will create our own class that keeps only the features we want in our pipeline. We don't want to run PCA on all features, only on a subset of them, so we create our own class 
that filters the features of the original dataframe. We can put our own classes into pipelines, as long as they have the following methods:

.fit()
.transform()
.fit_transform()
"""
# own class that can be inserted to pipeline as any other sklearn object.
class RawFeats:
    """Pipeline-compatible transformer that keeps only a chosen set of columns.

    Parameters
    ----------
    feats : list
        Column labels selected from the input passed to ``transform``.
    """

    def __init__(self, feats):
        self.feats = feats

    def fit(self, X, y=None):
        """Learn nothing from ``X``; return the transformer itself.

        Returning ``self`` (instead of ``None``) follows the scikit-learn
        estimator convention and allows ``RawFeats(...).fit(X).transform(X)``.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X[self.feats]``, i.e. only the configured columns of ``X``."""
        return X[self.feats]

    def fit_transform(self, X, y=None):
        """Run the (no-op) fit, then return the configured columns of ``X``."""
        self.fit(X)
        return self.transform(X)

# Columns that are fed into the PCA branch of the pipeline.
feats = [
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
]
# Column selector restricted to the names listed above.
raw_feats = RawFeats(feats)

In [None]:
# Building blocks of the PCA branch: a two-component PCA and the scaler
# that is placed in front of it in the pipeline.
pca = PCA(n_components=2)
sc = StandardScaler()

In [None]:
# SelectKBest configured with k=4; k is also one of the values searched over
# in the parameter grid defined later in this notebook.
selection = SelectKBest(k=4)

In [None]:
# Fix the seed so that the forest, and therefore the grid-search result and
# the pickled model, are reproducible across Restart & Run All.
rf = RandomForestClassifier(random_state=42)

In [None]:
"""
As in yesterday's tutorial, we will apply two different feature extraction techniques:

PCA
SelectKBest
and combine them with FeatureUnion. The small difference is that we will use only a subset of the features for PCA.
"""

# Branch 1: select the chosen columns, scale them, then project with PCA.
PCA_pipeline = Pipeline(steps=[
    ("rawFeats", raw_feats),
    ("scaler", sc),
    ("pca", pca),
])

# Branch 2: SelectKBest applied to the full input.
kbest_pipeline = Pipeline(steps=[("kBest", selection)])

# Both branches side by side; their outputs are combined by FeatureUnion.
all_features = FeatureUnion(transformer_list=[
    ("pcaPipeline", PCA_pipeline),
    ("kBestPipeline", kbest_pipeline),
])

In [None]:
# Full model: combined feature extraction followed by the random forest.
main_pipeline = Pipeline(steps=[("features", all_features), ("rf", rf)])

In [None]:
# Candidate hyper-parameters; each key is the double-underscore path from the
# outer pipeline step down to the parameter being tuned.
param_grid = {
    "features__pcaPipeline__pca__n_components": [1, 2, 3],
    "features__kBestPipeline__kBest__k": [1, 2, 3],
    "rf__n_estimators": [2, 5, 10],
    "rf__max_depth": [2, 4, 6],
}

# Grid search over the whole pipeline; refit=True keeps a model refitted
# with the best parameter combination.
grid_search = GridSearchCV(
    estimator=main_pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=10,
    refit=True,
)

# Fit the pipeline for every parameter combination.
grid_search.fit(df, y)

In [None]:
# Show the parameter combination selected by the grid search.
print(grid_search.best_params_)

In [None]:
# Persist the fitted grid search. The context manager makes sure the file is
# flushed and closed, even if pickling raises.
with open("model.p", "wb") as model_file:
    pickle.dump(grid_search, model_file)

Create a new `.py` file with the following contents:

In [None]:
# import Flask and jsonify
from flask import Flask, jsonify, request
# import Resource, Api and reqparser
from flask_restful import Resource, Api, reqparse
import pandas as pd
import numpy
import pickle

# Create an API similar to previous tutorial.
# Flask application object for this module.
app = Flask(__name__)
# Flask-RESTful Api bound to the app; resources are registered on it below.
api = Api(app)


# At the beginning of the file, we need to create the same custom class we used in the model creation part. The functions from that 
# class are used in the model and stored in the pickle file we created earlier. Therefore, the model needs to have access to the class 
# during the scoring as well. The accesses to other sklearn modules are provided automatically and we don't have to do anything about 
# them in the scoring file.

class RawFeats:
    """Column-selecting transformer; must match the class used when the model was pickled.

    Parameters
    ----------
    feats : list
        Column labels selected from the input passed to ``transform``.
    """

    def __init__(self, feats):
        self.feats = feats

    def fit(self, X, y=None):
        """Learn nothing from ``X``; return the transformer itself.

        Returning ``self`` (instead of ``None``) follows the scikit-learn
        estimator convention, so ``fit(...).transform(...)`` chaining works.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X[self.feats]``, i.e. only the configured columns of ``X``."""
        return X[self.feats]

    def fit_transform(self, X, y=None):
        """Run the (no-op) fit, then return the configured columns of ``X``."""
        self.fit(X)
        return self.transform(X)

    
# Load the fitted model once at start-up.
# NOTE: unpickling can execute arbitrary code, so "model.p" must come from a
# trusted source. The context manager closes the file handle after loading.
with open("model.p", "rb") as model_file:
    model = pickle.load(model_file)


# Now, we need to create an endpoint where we can communicate with our ML model. This time, we are going to use POST request.
class Scoring(Resource):
    """REST resource that scores JSON-encoded observations with the loaded model."""

    def post(self):
        """Return the model's class probabilities for the posted observation(s).

        The request body must be a non-empty JSON object mapping feature names
        to values. Returns the ``predict_proba`` output as a nested list, or a
        400 response with a message when the body is missing, empty or not a
        JSON object.
        """
        json_data = request.get_json()
        # Without this guard a missing or non-object body fails on `.values()`
        # below and surfaces as an opaque 500 error.
        if not isinstance(json_data, dict) or not json_data:
            return {"message": "Request body must be a non-empty JSON object of feature values."}, 400
        # Keys become the index first; transposing turns them into columns.
        df = pd.DataFrame(json_data.values(), index=json_data.keys()).transpose()
        # Getting predictions is a single call because the whole preprocessing
        # chain lives inside the pickled pipeline.
        res = model.predict_proba(df)
        # A numpy array cannot be sent as JSON, so convert it to nested lists.
        return res.tolist()
    

# Assign an endpoint to our API: the Scoring resource is served at /scoring.
api.add_resource(Scoring, '/scoring')


# The last thing to do is to start the application when the file api.py is run directly (not imported as a module from another script).
if __name__ == '__main__':
    # Start the Flask server with debug mode switched on.
    # NOTE(review): presumably meant for local development only — confirm before deploying.
    app.run(debug=True)
    
    
# Run the API by opening the command line and typing `python api.py`.

Back in the notebook, test the running API:

In [None]:
import requests

URL = "http://127.0.0.1:5000/scoring"
# Example payload: the first row of the wine data as a {feature name: value}
# mapping. Without this definition the cell fails with a NameError on a fresh kernel.
json_data = df.iloc[0].to_dict()
# Send a POST request with the payload as JSON and keep the response object.
r = requests.post(url=URL, json=json_data)

In [None]:
# Decode the JSON body of the response and print it.
print(r.json())