# Imports

In [None]:
import math
import inflection
import pandas as pd 
import numpy as np
import seaborn as sns
import datetime as dt
import xgboost as xgb
import random
import pickle

from matplotlib            import pyplot as plt
from IPython.core.display  import HTML
from IPython.display       import Image
from tabulate              import tabulate
from scipy                 import stats
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from datetime              import timedelta
from boruta                import BorutaPy
from sklearn.ensemble      import RandomForestRegressor
from sklearn.metrics       import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.linear_model  import LinearRegression, Lasso

# Passo 10 - Deploy Model to Production

1 Criar uma Classe com as limpezas, transformações e encoding (Rossmann.py)
2 Criar a API (Handler.py)
3 Criar um Script para testar a API.

Conferir se exportei o modelo treinado (model_rossmann_trained) (ok)
Exportar parâmetros pra aplicar nos novos dados que chegam via api (no Data Preparation -> Rescaling)

### Testando o carregamento do modelo via pickle

In [4]:
# Smoke test: load the trained model from its pickle file (ok, without customer).
# A context manager closes the file handle as soon as the model is deserialized.
with open("../model/model_rossmann.pkl", "rb") as model_file:
    model_rossmann_final = pickle.load(model_file)

In [5]:
model_rossmann_final

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, enable_categorical=False,
             eta=0.01, gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.00999999978,
             max_delta_step=0, max_depth=9, min_child_weight=15, missing=nan,
             monotone_constraints='()', n_estimators=3500, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
# Feature names stored in the fitted booster; the API pipeline must output these columns.
cols_when_model_builds_final = model_rossmann_final.get_booster().feature_names

In [8]:
cols_when_model_builds_final

['store',
 'promo',
 'store_type',
 'assortment',
 'competition_distance',
 'competition_open_since_month',
 'competition_open_since_year',
 'promo2',
 'promo2_since_week',
 'promo2_since_year',
 'competition_time_month',
 'promo2_time_week',
 'month_cos',
 'month_sin',
 'day_cos',
 'day_sin',
 'week_of_year_cos',
 'week_of_year_sin',
 'day_of_week_sin',
 'day_of_week_cos']

## Rossmann Class


1 - Criar uma Classe com as limpezas, transformações e encoding (Rossmann.py)
Sobe e copia do passo 1 do Rename Columns até o Change Data Types e cola aqui.
SHIFT + M pra fazer um merge de tudo.
Pega também da Feature Engineering até a Filtragem de Variáveis - Seleção das Colunas
Pega também do Rescaling até Nature Transformation 
Pega também as cols do Boruta no Manual Feature Selection

Script será colado no projeto em api/rossmann/Rossmann.py

In [5]:
import pickle
import inflection
import pandas as pd
import numpy as np
import math
import datetime

class Rossmann(object):
    """Cleaning, feature-engineering and encoding pipeline used by the prediction API.

    The fitted scalers/encoder are loaded from ``<home_path>parameter/*.pkl`` when
    the class is instantiated and re-applied to incoming data so that it goes
    through the same transformations used at training time.
    """

    def __init__(self, home_path='/Users/home/pharmacy_sales_forecast/'):
        """Load every persisted transformation into memory.

        Args:
            home_path: Project root, including the trailing slash. Defaults to
                the path previously hard-coded in this class.
        """
        self.home_path = home_path
        self.competition_distance_scaler = self._load_parameter('competition_distance_scaler.pkl')
        self.competition_time_month_scaler = self._load_parameter('competition_time_month_scaler.pkl')
        self.promo2_time_week_scaler = self._load_parameter('promo2_time_week_scaler.pkl')
        self.year_scaler = self._load_parameter('year_scaler.pkl')
        self.store_type_scaler = self._load_parameter('store_type_scaler.pkl')

    def _load_parameter(self, file_name):
        """Unpickle one file from the ``parameter/`` folder, closing the handle afterwards."""
        # NOTE(review): pickle can execute arbitrary code on load; only ship trusted files.
        with open(self.home_path + 'parameter/' + file_name, 'rb') as parameter_file:
            return pickle.load(parameter_file)

    def data_cleaning(self, df1):
        """Rename columns to snake_case, parse dates, fill missing values and fix dtypes.

        ``df1`` is modified in place and also returned. It must contain exactly
        the 16 columns listed in ``cols_old``, in that order.
        """
        ## 1.1 Rename Columns
        cols_old = ['Store', 'DayOfWeek', 'Date', 'Open', 'Promo',
                    'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
                    'CompetitionDistance', 'CompetitionOpenSinceMonth',
                    'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
                    'Promo2SinceYear', 'PromoInterval']
        df1.columns = [inflection.underscore(name) for name in cols_old]

        ## 1.3 Data Types
        df1['date'] = pd.to_datetime(df1['date'])

        ## 1.5 Fillout NA
        # competition_distance: a missing distance is replaced by a very large one
        df1['competition_distance'] = df1['competition_distance'].apply(
            lambda x: 200000.0 if math.isnan(x) else x)

        def fill_from_date(column, date_attribute):
            # Missing values fall back to the matching component of the row's date.
            return df1.apply(
                lambda row: getattr(row['date'], date_attribute)
                if math.isnan(row[column]) else row[column],
                axis=1)

        df1['competition_open_since_month'] = fill_from_date('competition_open_since_month', 'month')
        df1['competition_open_since_year'] = fill_from_date('competition_open_since_year', 'year')
        df1['promo2_since_week'] = fill_from_date('promo2_since_week', 'week')
        df1['promo2_since_year'] = fill_from_date('promo2_since_year', 'year')

        # promo_interval
        month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                     7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
        # Assign the result back: a chained ``fillna(..., inplace=True)`` on a
        # column selection does not update the frame under pandas Copy-on-Write.
        df1['promo_interval'] = df1['promo_interval'].fillna(0)

        # month_map holds the abbreviated month name of each date
        df1['month_map'] = df1['date'].dt.month.map(month_map)

        # is_promo is 1 when the row's month is listed in promo_interval, else 0
        df1['is_promo'] = df1[['promo_interval', 'month_map']].apply(
            lambda x: 0 if x['promo_interval'] == 0
            else 1 if x['month_map'] in x['promo_interval'].split(',')
            else 0,
            axis=1)

        ## 1.6 Change Types
        for column in ('competition_open_since_month', 'competition_open_since_year',
                       'promo2_since_week', 'promo2_since_year'):
            df1[column] = df1[column].astype(int)

        return df1

    def feature_engineering(self, df2):
        """Derive date/competition/promo features, then drop closed rows and helper columns.

        The derived columns are added to ``df2`` in place; the returned frame is
        a new object containing only rows with ``open != 0`` and without the
        ``open``, ``promo_interval`` and ``month_map`` columns.
        """
        # Date parts
        df2['year'] = df2['date'].dt.year
        df2['month'] = df2['date'].dt.month
        df2['day'] = df2['date'].dt.day
        df2['week_of_year'] = df2['date'].dt.isocalendar().week
        df2['year_week'] = df2['date'].dt.strftime('%Y-%W')

        # competition_since: first day of the month the competitor opened
        df2['competition_since'] = df2.apply(
            lambda x: datetime.datetime(year=x['competition_open_since_year'],
                                        month=x['competition_open_since_month'],
                                        day=1),
            axis=1)
        df2['competition_time_month'] = (
            (df2['date'] - df2['competition_since']) / 30).dt.days.astype(int)

        # promo_since: Monday of the promo2 start week, shifted back by one week
        df2['promo_since'] = df2['promo2_since_year'].astype(str) + '-' + df2['promo2_since_week'].astype(str)
        df2['promo_since'] = df2['promo_since'].apply(
            lambda x: datetime.datetime.strptime(x + '-1', '%Y-%W-%w') - datetime.timedelta(days=7))
        df2['promo2_time_week'] = (
            (df2['date'] - df2['promo_since']) / 7).dt.days.astype(int)

        # assortment
        df2['assortment'] = df2['assortment'].apply(
            lambda x: 'basic' if x == 'a' else 'extra' if x == 'b' else 'extended')
        # state_holiday
        df2['state_holiday'] = df2['state_holiday'].apply(
            lambda x: 'public_holiday' if x == 'a'
            else 'easter_holiday' if x == 'b'
            else 'christmas' if x == 'c'
            else 'regular_day')

        # 3.1 - Row filtering: keep only days on which the store was open
        df2 = df2[df2['open'] != 0]

        # 3.2 - Column selection. ``drop`` returns a new frame, so the result
        # has to be kept; previously it was discarded and nothing was dropped.
        cols_drop = ['open', 'promo_interval', 'month_map']
        df2 = df2.drop(cols_drop, axis=1)

        return df2

    def data_preparation(self, df5):
        """Rescale, encode and cyclically transform the features; return the model columns.

        Returns:
            A frame with the 20 columns listed in ``cols_selected``.
        """
        ## 5.2 Rescaling
        df5['competition_distance'] = self.competition_distance_scaler.transform(
            df5[['competition_distance']].values)
        df5['competition_time_month'] = self.competition_time_month_scaler.transform(
            df5[['competition_time_month']].values)
        df5['promo2_time_week'] = self.promo2_time_week_scaler.transform(
            df5[['promo2_time_week']].values)
        df5['year'] = self.year_scaler.transform(df5[['year']].values)

        ## 5.3 Transformation
        ### 5.3.1 Encoding
        #### state_holiday (One-Hot Encoding)
        df5 = pd.get_dummies(df5, prefix=['state_holiday'], columns=['state_holiday'])

        #### store_type (Label Encoding)
        # Prefer the mapping learned at training time. Refitting on every request
        # makes the integer codes depend on which labels are in the batch.
        # The refit is kept only as a fallback because the persisted encoder was
        # reported to fail with "y contains previously unseen labels: 'a'".
        # NOTE(review): re-export store_type_scaler.pkl so the fallback is never needed.
        encoder = self.store_type_scaler
        known_labels = getattr(encoder, 'classes_', None)
        if known_labels is not None and set(df5['store_type']).issubset(set(known_labels)):
            df5['store_type'] = encoder.transform(df5['store_type'])
        else:
            df5['store_type'] = encoder.fit_transform(df5['store_type'])

        #### assortment (Ordinal Encoding)
        assortment_dict = {'basic': 1, 'extra': 2, 'extended': 3}
        df5['assortment'] = df5['assortment'].map(assortment_dict)

        ### 5.3.3 Nature Transformation: sin/cos projection of each cyclic feature
        for column, period in (('month', 12), ('day', 30), ('week_of_year', 52), ('day_of_week', 7)):
            step = 2. * np.pi / period
            df5[column + '_sin'] = df5[column].apply(lambda x, step=step: np.sin(x * step))
            df5[column + '_cos'] = df5[column].apply(lambda x, step=step: np.cos(x * step))

        cols_selected = ['store', 'promo', 'store_type', 'assortment', 'competition_distance',
                         'competition_open_since_month', 'competition_open_since_year', 'promo2',
                         'promo2_since_week', 'promo2_since_year', 'competition_time_month',
                         'promo2_time_week', 'month_cos', 'month_sin', 'day_cos', 'day_sin',
                         'week_of_year_cos', 'week_of_year_sin', 'day_of_week_sin',
                         'day_of_week_cos']  # 20 cols

        return df5[cols_selected]

    def get_prediction(self, model, original_data, test_data):
        """Predict with ``model`` and return ``original_data`` plus a ``prediction`` column as JSON.

        Args:
            model: Object exposing ``predict(test_data)``.
            original_data: Frame returned to the caller; gains a ``prediction`` column.
            test_data: Prepared features passed to the model.

        Returns:
            A JSON string (``orient='records'``, ISO dates). ``np.expm1`` is
            applied to the raw model output before it is stored.
        """
        pred = model.predict(test_data)

        # Rows may have been filtered out upstream (closed stores), leaving fewer
        # predictions than original rows. Keep only the predicted rows instead
        # of failing with a length mismatch.
        predicted_index = getattr(test_data, 'index', None)
        if predicted_index is not None and len(predicted_index) != len(original_data):
            original_data = original_data.loc[predicted_index].copy()

        original_data['prediction'] = np.expm1(pred)

        return original_data.to_json(orient='records', date_format='iso')
    

## API Handler

2 - Criar a API (Handler.py)
Vamos usar a biblioteca Flask para receber requisições via API, responder, manter o endpoint funcionando, etc.

Script será colado no projeto em api/handler.py

In [6]:
import pickle
import pandas as pd
#from pacote [pasta.nome_arquivo] import nome classe
from rossmann.Rossmann import Rossmann
from flask             import Flask, request, Response

# Load the trained model into memory once, when the module is imported.
# The context manager closes the file handle right after deserialization.
with open('/Users/home/repos/pharmacy_sales_forecast/model/model_rossmann.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

app = Flask(__name__)

#criar o endpoint
@app.route('/rossmann/predict', methods=['POST'])
def rossmann_predict():
    """Handle POST /rossmann/predict: run the Rossmann pipeline on the JSON payload.

    The body may be a single JSON object (one record) or a list of objects.

    Returns:
        An ``application/json`` response with the processed records and their
        ``prediction`` field, or ``{}`` when the request carried no data.
    """
    test_json = request.get_json()

    # The request itself succeeded; answer with an empty body because nothing was sent.
    if not test_json:
        return Response('{}', status=200, mimetype='application/json')

    # Convert the JSON payload into a DataFrame
    if isinstance(test_json, dict):
        # a single record
        test_raw = pd.DataFrame(test_json, index=[0])
    else:
        # several records: the keys of the first one are used as the columns
        test_raw = pd.DataFrame(test_json, columns=test_json[0].keys())

    # Instantiate the Rossmann class to get access to its methods
    pipeline = Rossmann()

    df1 = pipeline.data_cleaning(test_raw)    # data cleaning
    df2 = pipeline.feature_engineering(df1)   # feature engineering
    df3 = pipeline.data_preparation(df2)      # data preparation

    # prediction: model, original data, prepared data
    df_response = pipeline.get_prediction(model, test_raw, df3)

    # get_prediction returns a JSON string. Returning it bare makes Flask label
    # it text/html, so wrap it to declare the correct content type.
    return Response(df_response, status=200, mimetype='application/json')
if __name__ == '__main__':
    # Start the Flask server bound to host 0.0.0.0 (Flask's default port is used)
    app.run(host='0.0.0.0')


ModuleNotFoundError: No module named 'rossmann'

## API Tester

3 - Criar um Script para testar a API.

In [81]:
# Revisado e Norton API Tester = Meigarom API Tester 100%
import requests
import json

In [32]:
# Load the test dataset and the store attributes table; the two are merged on 'Store' further below.
df10 = pd.read_csv( '/Users/home/repos/pharmacy_sales_forecast/data/test.csv' )
df_store_raw = pd.read_csv("/Users/home/repos/pharmacy_sales_forecast/data/store.csv", low_memory=False)

In [63]:
df_store_raw.head(5)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [73]:
# Build the request payload frame in one pass:
#   1. merge the test rows with the store attributes, so every field needed for prediction is present;
#   2. keep only the stores to forecast (shrink the list, e.g. to [22], to test a single store);
#   3. discard closed days (Open == 0) and rows whose Open flag is missing;
#   4. drop the Id column.
df_test = (
    pd.merge( df10, df_store_raw, how='left', on='Store' )
    .loc[lambda frame: frame['Store'].isin( [1, 3, 16, 24, 12, 100] )
                       & ( frame['Open'] != 0 )
                       & frame['Open'].notnull()]
    .drop( columns='Id' )
)

In [74]:
# Convert the DataFrame to a JSON string so it can be sent through the API.
# Each row becomes one object, e.g. {"Store": 22, "DayOfWeek": 4, (remaining key/value pairs)}.
# NOTE: missing values are written as bare NaN tokens (see the output of the next cell).
data = json.dumps( df_test.to_dict( orient='records' ) )

In [66]:
data

'[{"Store": 1, "DayOfWeek": 4, "Date": "2015-09-17", "Open": 1.0, "Promo": 1, "StateHoliday": "0", "SchoolHoliday": 0, "StoreType": "c", "Assortment": "a", "CompetitionDistance": 1270.0, "CompetitionOpenSinceMonth": 9.0, "CompetitionOpenSinceYear": 2008.0, "Promo2": 0, "Promo2SinceWeek": NaN, "Promo2SinceYear": NaN, "PromoInterval": NaN}, {"Store": 12, "DayOfWeek": 4, "Date": "2015-09-17", "Open": 1.0, "Promo": 1, "StateHoliday": "0", "SchoolHoliday": 0, "StoreType": "a", "Assortment": "c", "CompetitionDistance": 1070.0, "CompetitionOpenSinceMonth": NaN, "CompetitionOpenSinceYear": NaN, "Promo2": 1, "Promo2SinceWeek": 13.0, "Promo2SinceYear": 2010.0, "PromoInterval": "Jan,Apr,Jul,Oct"}, {"Store": 16, "DayOfWeek": 4, "Date": "2015-09-17", "Open": 1.0, "Promo": 1, "StateHoliday": "0", "SchoolHoliday": 0, "StoreType": "a", "Assortment": "c", "CompetitionDistance": 3270.0, "CompetitionOpenSinceMonth": NaN, "CompetitionOpenSinceYear": NaN, "Promo2": 0, "Promo2SinceWeek": NaN, "Promo2SinceYe

### Teste da API na máquina local

In [None]:
# Funcionou com os arquivos locais!
# Status 200: só funciona na primeira vez, logo depois de ligar a máquina, abrir o Jupyter e testar.
# Nas execuções seguintes, rodar `python handler.py` e testar o código abaixo gera este erro no terminal:
#   zsh: segmentation fault  python handler.py
# Mesmo após pesquisas, não consegui resolver. Como funcionou na 1ª vez localmente, e também no Heroku, fica só o registro.

In [11]:
# API call against the local server (5000 is Flask's default port)
url = 'http://0.0.0.0:5000/rossmann/predict'  # local URL
header = {'Content-type': 'application/json'}

# Send the payload built earlier (`data`) and report the HTTP status
r = requests.post(url, headers=header, data=data)
print('Status Code {}'.format(r.status_code))

Status Code 200


### Teste da API na Cloud Heroku

In [75]:
# API call against the Heroku deployment (Heroku URL + rossmann/predict endpoint)
url = 'https://rossmann-store-prediction.herokuapp.com/rossmann/predict'
header = {'Content-type': 'application/json'}

# Send the payload built earlier (`data`) and report the HTTP status
r = requests.post(url, headers=header, data=data)
print('Status Code {}'.format(r.status_code))

Status Code 200


In [76]:
# Each JSON object in the response becomes one row of the DataFrame built below; show the first one.
r.json()[0]

{'store': 1,
 'day_of_week': 4,
 'date': '2015-09-17T00:00:00.000Z',
 'open': 1.0,
 'promo': 1,
 'state_holiday': 'regular_day',
 'school_holiday': 0,
 'store_type': 'c',
 'assortment': 'basic',
 'competition_distance': 1270.0,
 'competition_open_since_month': 9,
 'competition_open_since_year': 2008,
 'promo2': 0,
 'promo2_since_week': 38,
 'promo2_since_year': 2015,
 'promo_interval': 0,
 'month_map': 'Sep',
 'is_promo': 0,
 'year': 2015,
 'month': 9,
 'day': 17,
 'week_of_year': 38,
 'year_week': '2015-37',
 'competition_since': '2008-09-01T00:00:00.000Z',
 'competition_time_month': 85,
 'promo_since': '2015-09-14T00:00:00.000Z',
 'promo2_time_week': 0,
 'prediction': 4730.4165039062}

In [77]:
# Rebuild a DataFrame from the returned JSON.
# The body is decoded once and reused instead of calling r.json() twice.
records = r.json()
d1 = pd.DataFrame( records, columns=records[0].keys() )

In [78]:
# Daily sales predictions for the selected stores (1, 3, 12, 16, 24 and 100), first rows:
d1.head()

Unnamed: 0,store,day_of_week,date,open,promo,state_holiday,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval,month_map,is_promo,year,month,day,week_of_year,year_week,competition_since,competition_time_month,promo_since,promo2_time_week,prediction
0,1,4,2015-09-17T00:00:00.000Z,1.0,1,regular_day,0,c,basic,1270.0,9,2008,0,38,2015,0,Sep,0,2015,9,17,38,2015-37,2008-09-01T00:00:00.000Z,85,2015-09-14T00:00:00.000Z,0,4730.416504
1,3,4,2015-09-17T00:00:00.000Z,1.0,1,regular_day,0,a,basic,14130.0,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",Sep,0,2015,9,17,38,2015-37,2006-12-01T00:00:00.000Z,107,2011-03-28T00:00:00.000Z,233,7321.921387
2,12,4,2015-09-17T00:00:00.000Z,1.0,1,regular_day,0,a,extended,1070.0,9,2015,1,13,2010,"Jan,Apr,Jul,Oct",Sep,0,2015,9,17,38,2015-37,2015-09-01T00:00:00.000Z,0,2010-03-22T00:00:00.000Z,286,7771.212402
3,16,4,2015-09-17T00:00:00.000Z,1.0,1,regular_day,0,a,extended,3270.0,9,2015,0,38,2015,0,Sep,0,2015,9,17,38,2015-37,2015-09-01T00:00:00.000Z,0,2015-09-14T00:00:00.000Z,0,7173.164062
4,24,4,2015-09-17T00:00:00.000Z,1.0,1,regular_day,0,a,extended,4590.0,3,2000,1,40,2011,"Jan,Apr,Jul,Oct",Sep,0,2015,9,17,38,2015-37,2000-03-01T00:00:00.000Z,189,2011-09-26T00:00:00.000Z,207,9430.619141


In [79]:
# Total predicted sales per store
d2 = d1[['store','prediction']].groupby( 'store' ).sum().reset_index()

# Iterate over the rows directly instead of indexing by position with range(len(...))
for row in d2.itertuples( index=False ):
    print('Store number {} will sell R${:,.2f} in the next 6 weeks'.format(
        row.store,
        row.prediction ))

Store number 1 will sell R$190,577.83 in the next 6 weeks
Store number 3 will sell R$276,943.92 in the next 6 weeks
Store number 12 will sell R$302,039.34 in the next 6 weeks
Store number 16 will sell R$278,288.45 in the next 6 weeks
Store number 24 will sell R$378,206.26 in the next 6 weeks
Store number 100 will sell R$282,196.91 in the next 6 weeks


In [80]:
# Six-week sales forecast, summed per selected store:
d2

Unnamed: 0,store,prediction
0,1,190577.826172
1,3,276943.918945
2,12,302039.338867
3,16,278288.445312
4,24,378206.259277
5,100,282196.908691
