### Prediction log file - creating and updating 

Logging is tied to a machine learning model to enable performance monitoring.

In [None]:
import csv
import os
import re
import sys
import time
import uuid
from collections import Counter,defaultdict
from datetime import date

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.pipeline import make_pipeline
from sklearn import datasets
from sklearn import svm
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from termcolor import cprint

In [None]:
## read the AAVAIL customer table from the local data folder
df = pd.read_csv(
    os.path.join(".", r"data/aavail-target.csv")
)

In [None]:
## separate the subscription flag from the feature columns
_y = df.pop('is_subscriber')       # pop removes the column and hands back its values
y = np.where(_y == 0, 1.0, 0.0)    # churn target: 1 where is_subscriber is 0, else 0

## the identifier columns are not needed any further, so discard them in place
df.drop(columns=['customer_id', 'customer_name'], inplace=True)

In [None]:
def train_model(X,y,saved_model):
    """
    Tune, evaluate and persist an SVM churn classifier.

    A preprocessing + RBF-kernel SVC pipeline is grid-searched with 5-fold
    cross-validation on a stratified 75/25 train/test split. The binary F1
    score on the held-out part is printed, then the search is refit on all
    of the data and written to ``saved_model`` with joblib.

    Parameters
    ----------
    X : pandas.DataFrame or None
        Feature table. It must provide the columns 'age', 'num_streams',
        'country' and 'subscriber_type'; other columns are ignored by the
        preprocessor.
    y : array-like or None
        Binary target aligned with the rows of ``X``.
    saved_model : str
        Path the fitted grid-search object is dumped to.

    Returns
    -------
    None

    Notes
    -----
    When ``X`` or ``y`` is None, both are rebuilt from
    ./data/aavail-target.csv, with ``y`` set to 1 where 'is_subscriber' is 0.
    Requires ``SimpleImputer`` (sklearn.impute) plus ``StandardScaler`` and
    ``OneHotEncoder`` (sklearn.preprocessing) to be imported at file level.
    """
    ## use the caller's data; only fall back to the bundled csv when none is given
    if X is None or y is None:
        data = pd.read_csv(os.path.join(".",r"data/aavail-target.csv"))
        subscribed = data.pop('is_subscriber')
        y = np.zeros(subscribed.size)
        y[subscribed==0] = 1
        data.drop(columns=['customer_id','customer_name'],inplace=True)
        X = data

    ## Perform a train-test split (fixed seed keeps the split reproducible)
    random_state = 42
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        stratify=y,
                                                        random_state=random_state)

    ## Specify parameters and model
    ## for numeric: fill gaps with the column mean, then standardize
    numeric_features = ['age', 'num_streams']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    ## for categorical: fill gaps with the label 'missing', then one-hot encode
    categorical_features = ['country', 'subscriber_type']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    ## route each group of columns through its own transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    ## fit model on training data
    ## (i) svm
    param_grid_svm = {
        'svm__C': [0.01,0.1,0.5,1.0,1.5,5.0,10.0],
        'svm__gamma': [0.001,0.01,0.1]
    }
    pipe_svm = Pipeline(steps=[('pre', preprocessor),
                               ('svm',SVC(kernel='rbf',
                                          class_weight='balanced'))])

    ## the former iid=False argument is omitted: scikit-learn 0.24 removed it
    ## and passing it raises a TypeError there
    grid = GridSearchCV(pipe_svm, param_grid=param_grid_svm, cv=5,
                        n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)

    print("-->".join(pipe_svm.named_steps.keys()))
    print("f1_score",round(f1_score(y_test, y_pred,average='binary'),3))

    ## retrain using all data
    grid.fit(X, y)
    print("... saving model: {}".format(saved_model))
    joblib.dump(grid,saved_model)

In [None]:
def _update_predict_log(y_pred,y_proba,query,runtime):
    """
    Append one prediction record to the current month's CSV log file.

    The file is created in the current working directory on first use and
    gets a header row at that point; later calls only append.

    Parameters
    ----------
    y_pred : object
        Prediction to record (stored as ``str(y_pred)``).
    y_proba : object
        Class probabilities to record, or None (stored as its ``str``).
    query : object with a ``shape`` attribute
        Input the prediction was made for; only ``query.shape`` is logged.
    runtime : object
        Elapsed-time value to record (stored as its ``str``).

    Returns
    -------
    None
    """

    ## the file name is built from year and month, so a new log starts each month
    today = date.today()
    logfile = "Prediction-{}-{}.log".format(today.year, today.month)

    ## write the data to a csv file
    header = ['unique_id','timestamp','y_pred','y_proba','x_shape','model_version','runtime']
    write_header = not os.path.exists(logfile)

    ## newline='' hands line-ending control to the csv module, as its docs
    ## require; without it some platforms emit an extra blank line per row
    with open(logfile,'a',newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|')
        if write_header:
            writer.writerow(header)

        ## NOTE(review): MODEL_VERSION is read from module scope and is not
        ## defined in the visible code - confirm it is set before logging
        record = [uuid.uuid4(),time.time(),y_pred,y_proba,query.shape,MODEL_VERSION,runtime]
        writer.writerow([str(field) for field in record])

In [None]:
def model_load(saved_model=None):
    """
    Load a persisted model from disk.

    Parameters
    ----------
    saved_model : str, optional
        Path of the joblib file to load. When omitted, the module-level
        variable ``saved_model`` is used if it exists, which is what the
        parameterless form of this function relied on.

    Returns
    -------
    object
        Whatever object was stored in the file.

    Raises
    ------
    ValueError
        If no path is given and no module-level ``saved_model`` is set.
    FileNotFoundError
        If the path does not exist.
    """

    if saved_model is None:
        ## keep the zero-argument call working by reading the global path
        saved_model = globals().get("saved_model")
    if saved_model is None:
        raise ValueError("No model path given and no global 'saved_model' is defined.")

    if not os.path.exists(saved_model):
        raise FileNotFoundError("Model '{}' cannot be found. Is the model trained?".format(saved_model))

    ## NOTE: joblib.load unpickles the file - only load files from a trusted source
    return joblib.load(saved_model)

In [None]:
def predict(query,saved_model,verbose=True):
    """
    Predict with a saved model and record the call in the prediction log.

    Parameters
    ----------
    query : list or object with ``shape``/``reshape``
        Input to predict on. A list is converted to a numpy array, and a
        one-dimensional input is treated as a single row.
        NOTE(review): whether the saved model accepts a plain array or needs
        named columns depends on how it was trained - confirm with callers.
    saved_model : str
        Path of the joblib file holding the fitted model.
    verbose : bool, optional
        Currently unused; kept so existing calls stay valid.

    Returns
    -------
    object
        The result of ``model.predict(query)``.
    """

    ## start timer for runtime (model loading is part of the measured time)
    time_start = time.time()

    ## ensure the model is loaded
    ## NOTE: joblib.load unpickles the file - only load files from a trusted source
    model = joblib.load(saved_model)

    ## input checks
    if isinstance(query,list):
        ## np.array(query) keeps a list of rows two-dimensional; wrapping the
        ## list in another list would turn it into a 3-D array
        query = np.array(query)
    if len(query.shape) == 1:
        query = query.reshape(1, -1)

    ## make prediction and gather data for log entry
    y_pred = model.predict(query)

    ## look the method up instead of reading model.probability: wrappers such
    ## as a grid search or pipeline have no such attribute, and getattr falls
    ## back to None when the lookup raises AttributeError
    y_proba = None
    proba_method = getattr(model, 'predict_proba', None)
    if callable(proba_method):
        y_proba = proba_method(query)

    ## format the elapsed time as hours:minutes:seconds
    m, s = divmod(time.time()-time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d"%(h, m, s)

    ## update the log file
    _update_predict_log(y_pred,y_proba,query,runtime)

    return y_pred