### Created by: Rodrigo Didier, 01/31/21.

# Import Libs

In [71]:

import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

# DATA SCIENCE PIPLINE --------------------------------------------------------------------------

# 0. Feature Engineering

# 1. Data Extraction

In [72]:
class dataExtractor:

    def __init__(self):
        self.data_path = os.getenv('DATASET_PATH')
        self.data = self.extractData()

    def extractData(self):
        '''
        Loads a dataset with product data from a specified path.
        '''

        return pd.read_csv(self.data_path)

#  2. Data Formatting

In [73]:
dataFormatter().categoryToDummy().head()

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,Bebê,Bijuterias e Jóias,Decoração,Lembrancinhas,Outros,Papel e Cia
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,0,0,1,0,0,0
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,0,0,0,0,0,1
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,0,0,0,0,1,0
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,1,0,0,0,0,0
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,0,0,1,0,0,0


In [92]:
dataFormatter().split_train_test_validate()

['weight',
 'express_delivery',
 'minimum_quantity',
 'view_counts',
 'order_counts',
 'category']

In [95]:
X_data = dataExtractor().data
X_data
y_vals = list(dataExtractor().data.columns[-6:])

In [96]:
y_vals

['weight',
 'express_delivery',
 'minimum_quantity',
 'view_counts',
 'order_counts',
 'category']

In [91]:
class dataFormatter(dataExtractor):
    def __init__(self):
        self.extracted_data = dataExtractor().data
        #self.train_data = self.formatData()[0]
        #self.validation_data = self.formatData()[1]
        
    def formatData(self):
        '''
        Processes the dataset to use it for training and validation.
        '''
        
        # add methods to formating the data to the  modeler.
    
    def categoryToDummy(self):
        # get dummy
        dummy = pd.get_dummies(self.extracted_data['category'])

        # merge df
        df_dummy = pd.merge(self.extracted_data, dummy, left_index=True, right_index=True)

        # delet no dummy cols
        del df_dummy['category']

        return df_dummy
    
    def split_train_test_validate(self):
        '''
        Split the dataset into train, validation and test
        '''
        
        # categorical variables list to split
        y_cols = list(dataExtractor().data.columns[-6:])
        
        # full data categorized
        X_data = 
        X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.2, train_size=0.8)
        #X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25,train_size =0.75)
        
        #return (X, y), (X_train, y_train), (x_val, y_val)

# 3. Data Modeling

In [76]:
class dataModeler(dataFormatter):
    def __init__(self):
        self.model = dataModeler.getModel()
        self.train_data = dataFormatter.train_data
        
    def getModel():
        '''
        Specifies a model to handle the categorization problem.
        # criar etapa de modelagem -métodos se for o caso
        # dar split no train e treinar
        
        
        '''

        model = None
        return model
        
    def writeModel(self):
        '''
        Exports a candidate model to a specified path available
        in the environment variable MODEL_PATH.
        '''

        # criar try: tenta salvar, se nao dá erro.
    
        return pickle.dump(self.model, open(os.getenv('MODEL_PATH'),'wb'))


# 4. Model Validation

In [77]:
class modelValidator(dataModeler, dataFormatter):
    def __init__(self):
        self.model = self.readModel()
        self.validation_data = dataFormatter.validation_data
    
    def readModel(self):
        '''
        get the model selected to handle the categorization problem.
        '''
        
        # criar um try, se nao der entao dar um saveModel
        
        return pickle.load(open(os.getenv('MODEL_PATH'),'rb'))
        
    def validateModel(self):
        '''
        Generates metrics about the model accuracy (precision, recall, F1, etc.)
        for each category and exports them to a specified path available in the 
        environment variable METRICS_PATH.
        '''
        # try k-fold and variants.
        
        # should save metrics in METRICS_PATH
        f = open(os.getenv('METRICS_PATH'), "w")
        f.write("F1:95.0, Precision:87.5")
        f.close()
        pass

# 5. Model Prediction

In [78]:
dataExtractor().extractData().head()

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [79]:
dataExtractor().extractData().category.value_counts()

Lembrancinhas         17524
Decoração              8723
Bebê                   6930
Papel e Cia            2750
Outros                 1133
Bijuterias e Jóias      940
Name: category, dtype: int64

In [80]:
dataFormatter().categoryToDummy()

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,Bebê,Bijuterias e Jóias,Decoração,Lembrancinhas,Outros,Papel e Cia
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.890000,1200.0,1,4,244,,0,0,1,0,0,0
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.670000,8.0,1,5,124,,0,0,0,0,0,1
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,0,0,0,0,1,0
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,1,0,0,0,0,0
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.810000,507.0,1,6,34,,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37995,13230578,1756482,mochila personalizada galinha pintadinha,1,2,Mochila Galinha Pintadinha M,primaria 2019 1 aninho abdulzinho mochilas ani...,2016-09-17 10:49:39,18.790000,149.0,1,27,321,,0,0,0,1,0,0
37996,6736914,9301388,tag dia dos pais,1,32,30 TAGS DIA DOS PAIS 005,dia pais,2019-06-12 17:03:52,31.680000,7.0,1,1,43,,0,0,0,0,0,1
37997,11017911,8732362,kit bolsa maternidade,5,31,Kit bolsa bebê maternidade personalizada,paula carvalho bebe,2018-08-24 11:43:00,543.170000,3006.0,1,4,515,18.0,1,0,0,0,0,0
37998,6807331,1869417,festa 15 anos,1,8,Chaveiro Almofada 15 anos,yasmin centro mesa compras 15 anos 2020 lembra...,2017-10-21 18:49:56,10.720000,18.0,1,53,2456,138.0,0,0,0,1,0,0
