### Created by: Rodrigo Didier, 01/31/21.

# Import Libs

In [20]:

import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.corpus import stopwords
from string import punctuation

from nltk.probability import FreqDist

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# DATA SCIENCE PIPLINE --------------------------------------------------------------------------

# 0. Feature Engineering

# 1. Data Extraction

In [3]:
class dataExtractor:

    def __init__(self):
        self.data_path = os.getenv('DATASET_PATH')
        self.data = self.extractData()

    def extractData(self):
        '''
        Loads a dataset with product data from a specified path.
        '''

        return pd.read_csv(self.data_path)

#  2. Data Formatting

In [31]:
(X, y), (X_test, y_test), (X_val, y_val)= dataFormatter().split_train_test_validate()

In [46]:
    X.assign(full_text= lambda row: ' '.join([row['query'], row['title'], row['concatenated_tags']]))

TypeError: sequence item 0: expected str instance, Series found

In [40]:
 ' '.join([X.query, X.title, X.concatenated_tags])

TypeError: sequence item 0: expected str instance, method found

In [49]:
' '.join([X['query'].str, X['title'].str, X['concatenated_tags'].str])

TypeError: sequence item 0: expected str instance, StringMethods found

In [45]:
X['query']

7917                        leque de madeira
33016                   adesivo de parede 3d
31923              caixa papel personalizada
11067                       tapete bem vindo
28087       capa para caderneta de vacinacao
                        ...                 
6622     lembrancinhas de maternidade menino
35187                               trocador
31132    lembrancinhas de maternidade em eva
37527                              sacolinha
23841             roupas de cachorro atacado
Name: query, Length: 30400, dtype: object

In [55]:
X.query + X.title

TypeError: Cannot broadcast np.ndarray with operand of type <class 'method'>

In [54]:
X.head()

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts
7917,4142514,6080672,leque de madeira,1,3,Leques de madeira Envio Expresso 5 dias úteis,casamento leques,2016-08-21 14:17:14,14.22,9.0,1,30,220,1.0
33016,6363203,2417318,adesivo de parede 3d,1,14,Papel de Parede 3D Paisagem Cachoeira 0019,papel parede 3d cachoeira papel parede parede,2018-05-03 14:44:22,62.699997,9.0,0,2,9960,39.0
31923,6432298,971052,caixa papel personalizada,2,3,Caixa personalizada de Nossa Senhora com vidro...,religiosos,2018-09-09 16:05:42,39.94,0.0,1,4,52,20.0
11067,2661983,6024256,tapete bem vindo,1,19,Tapete croche bem vindo mesclado pink,decoracao primavera tapetes porta decorar tape...,2017-02-12 20:18:51,110.27,1009.0,0,7,146,
28087,291711,2335174,capa para caderneta de vacinacao,1,20,Capa para Caderneta de Vacinação,capas cadernetas vacinacao cartao vacina leona...,2017-05-22 16:26:07,59.91,258.0,0,9,205,2.0


In [30]:
class dataFormatter(dataExtractor):
    def __init__(self):
        self.extracted_data = dataExtractor().data
        #self.train_data = self.formatData()[0]
        #self.validation_data = self.formatData()[1]
        
    def formatData(self):
        '''
        Processes the dataset to use it for training and validation.
        '''
        
        # add methods to formating the data to the  modeler.
    
    def categoryToDummy(self):
        # get dummy
        dummy = pd.get_dummies(self.extracted_data['category'])

        # merge df
        df_dummy = pd.merge(self.extracted_data, dummy, left_index=True, right_index=True)

        # delet no dummy cols
        del df_dummy['category']

        return df_dummy
    
    def split_train_test_validate(self):
        '''
        Split the dataset into train, validation and test
        '''
        
        # categorical variables list to split
        y_cols = list(self.categoryToDummy().columns[-6:])
        X_cols = list(self.categoryToDummy().columns[:-6])
        
        # full data categorized
        y_data =  self.categoryToDummy()[y_cols] #.values
        X_data =  self.categoryToDummy()[X_cols] #.values
                
        X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.2, train_size=0.8)
        X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25,train_size =0.75)
        
        return (X, y), (X_test, y_test), (X_val, y_val)

In [17]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

texto = 'João pé de feijão, foi correr e sentiu. Porém, seguiu! esse cara é fod* texto'
sentencas = sent_tokenize(texto)
palavras = word_tokenize(texto.lower())

from nltk.corpus import stopwords
from string import punctuation

stopwords = set(stopwords.words('portuguese') + list(punctuation))
palavras_sem_stopwords = [palavra for palavra in palavras if palavra not in stopwords]

from nltk.probability import FreqDist
frequencia = FreqDist(palavras_sem_stopwords)

#from collections import defaultdict
#sentencas_importantes = defaultdict(int)

#for i, sentenca in enumerate(sentencas):
#    for palavra in word_tokenize(sentenca.lower()):
#        if palavra in frequencia:
#            sentencas_importantes[i] += frequencia[palavra]

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
frequencia

FreqDist({'joão': 1, 'pé': 1, 'feijão': 1, 'correr': 1, 'sentiu': 1, 'porém': 1, 'seguiu': 1, 'cara': 1, 'fod': 1, 'texto': 1})

In [16]:
from heapq import nlargest

idx_sentencas_importantes = nlargest(1, sentencas_importantes, sentencas_importantes.get)

for i in sorted(idx_sentencas_importantes):
    print(sentencas[i])

João pé de feijão, foi correr e sentiu.


In [None]:
import re
import numpy as n
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

# 3. Data Modeling

In [86]:
class dataModeler(dataFormatter):
    def __init__(self):
        self.model = dataModeler.getModel()
        self.train_data = dataFormatter.train_data
        
    def getModel():
        '''
        Specifies a model to handle the categorization problem.
        # criar etapa de modelagem -métodos se for o caso
        # dar split no train e treinar
        
        
        '''

        model = None
        return model
        
    def writeModel(self):
        '''
        Exports a candidate model to a specified path available
        in the environment variable MODEL_PATH.
        '''

        # criar try: tenta salvar, se nao dá erro.
    
        return pickle.dump(self.model, open(os.getenv('MODEL_PATH'),'wb'))


# 4. Model Validation

In [87]:
class modelValidator(dataModeler, dataFormatter):
    def __init__(self):
        self.model = self.readModel()
        self.validation_data = dataFormatter.validation_data
    
    def readModel(self):
        '''
        get the model selected to handle the categorization problem.
        '''
        
        # criar um try, se nao der entao dar um saveModel
        
        return pickle.load(open(os.getenv('MODEL_PATH'),'rb'))
        
    def validateModel(self):
        '''
        Generates metrics about the model accuracy (precision, recall, F1, etc.)
        for each category and exports them to a specified path available in the 
        environment variable METRICS_PATH.
        '''
        # try k-fold and variants.
        
        # should save metrics in METRICS_PATH
        f = open(os.getenv('METRICS_PATH'), "w")
        f.write("F1:95.0, Precision:87.5")
        f.close()
        pass

# 5. Model Prediction