### Created by: Rodrigo Didier, 01/31/21.

# Import Libs

In [54]:
import sys
import os
import re
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.corpus import stopwords
from string import punctuation

from nltk.probability import FreqDist

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# DATA SCIENCE PIPLINE --------------------------------------------------------------------------

# 0. Feature Engineering

# 1. Data Extraction

In [55]:
class dataExtractor:

    def __init__(self):
        self.data_path = os.getenv('DATASET_PATH')
        self.data = self.extractData()

    def extractData(self):
        '''
        Loads a dataset with product data from a specified path.
        '''

        return pd.read_csv(self.data_path)

#  2. Data Formatting

In [56]:
class dataFormatter(dataExtractor):
    def __init__(self):
        self.extracted_data = dataExtractor().data
        #self.train_data = self.formatData()[0]
        #self.validation_data = self.formatData()[1]
        
    def formatData(self):
        '''
        Processes the dataset to use it for training and validation.
        '''
        
        # add methods to formating the data to the  modeler.
    
    def categoryToDummy(self):
        # get dummy
        dummy = pd.get_dummies(self.extracted_data['category'])

        # merge df
        df_dummy = pd.merge(self.extracted_data, dummy, left_index=True, right_index=True)

        # delet no dummy cols
        del df_dummy['category']

        return df_dummy
    
    def splitTrainTestValidate(self):
        '''
        Split the dataset into train, validation and test
        '''
        
        # categorical variables list to split
        y_cols = list(self.categoryToDummy().columns[-6:])
        X_cols = list(self.categoryToDummy().columns[:-6])
        
        # full data categorized
        y_data =  self.categoryToDummy()[y_cols] #.values
        X_data =  self.categoryToDummy()[X_cols] #.values
                
        X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.2, train_size=0.8)
        X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25,train_size =0.75)
        
        return (X, y), (X_test, y_test), (X_val, y_val)
    
    def joinStringColumns(self):
        
        (X, y), (X_test, y_test), (X_val, y_val) = self.splitTrainTestValidate()
        f_join_strings = lambda row: row['query'] + ' ' + row['title'] + ' ' + row['concatenated_tags']
        
        X = X.assign(full_text= f_join_strings)
        X_test = X_test.assign(full_text= f_join_strings)
        X_val = X_val.assign(full_text= f_join_strings)
        
        return (X, y), (X_test, y_test), (X_val, y_val)
    
 
    def normalizeFullText(self):
        """Lower text and remove punctuation, articles and extra whitespace."""
    
        
        def normalize_text(s):
            """Lower text and remove punctuation, articles and extra whitespace."""        

            def compost_words(text):
                text = re.sub('[/]', ' ', str(text))
                text = re.sub('[-]', ' ', str(text))
                return text

            def white_space_fix(text):
                return " ".join(text.split())

            def remove_punc(text):
                exclude = set(punctuation)
                return "".join(ch for ch in text if ch not in exclude)

            def lower(text):
                return text.lower()
    
            def remove_stop_words(text):
                from nltk.corpus import stopwords
                stopwords = set(stopwords.words('portuguese') + list(punctuation) + list('/'))
                palavras = word_tokenize(text)
                palavras_sem_stopwords = [palavra for palavra in palavras if palavra not in stopwords]
                return " ".join(palavras_sem_stopwords)
        
            return remove_stop_words(white_space_fix(remove_punc(lower(compost_words(s)))))
    
        (X, y), (X_test, y_test), (X_val, y_val) = self.joinStringColumns()

        X['full_text'] = list(map(normalize_text,list(X['full_text'])))
        X_test['full_text'] = list(map(normalize_text,list(X_test['full_text'])))
        X_val['full_text'] = list(map(normalize_text,list(X_val['full_text'])))
        
        
        return (X, y), (X_test, y_test), (X_val, y_val)


In [60]:
(X, y), (X_test, y_test), (X_val, y_val) = dataFormatter().normalizeFullText()

In [66]:
from tqdm import tqdm

In [67]:
words_counts = {}
for text in tqdm(X.full_text):
    for word in text.split():
        words_counts[word] = 1
    words_counts[word] += 1
    
DICT_SIZE = 10000
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()



100%|██████████| 30400/30400 [00:00<00:00, 132527.10it/s]


In [85]:
from scipy import sparse as sp_sparse


def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector



In [88]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X.full_text])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test.full_text])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val.full_text])
#print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape)

In [87]:
X_train_mybag

<30400x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 249308 stored elements in Compressed Sparse Row format>

In [89]:
X_test_mybag

<7600x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 60253 stored elements in Compressed Sparse Row format>

In [101]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [102]:
def train_classifier(X_train, y_train, C, regularisation):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.

    model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)
    return model



In [103]:
classifier_mybag = train_classifier(X_train_mybag, y, C = 4, regularisation = 'l2')

In [114]:
y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)

In [105]:
y_val_predicted_labels_mybag

array([[0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0]])

In [115]:
y_test_predicted_scores_mybag = classifier_mybag.decision_function(X_test_mybag)

In [109]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [112]:
def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', average_precision_score(y_test, predicted, average='weighted'))

In [118]:
print('Model: Bag-of-words\nTrain')
print_evaluation_scores(y_test, y_test_predicted_labels_mybag)

Bag-of-words
 Train
Accuracy:  6536
F1-score macro:  0.8533481618176899
F1-score micro:  0.8913723142420009
F1-score weighted:  0.8897436801501449
Precision macro:  0.7571362480651391
Precision micro:  0.8143795095838614
Precision weighted:  0.8220702155936682


In [None]:
# Validation

In [117]:
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val.full_text])
y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)

print('Bag-of-words\n')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)


Bag-of-words

Accuracy:  7081
F1-score macro:  0.9491299934244464
F1-score micro:  0.9492086616780345
F1-score weighted:  0.948969197384583
Precision macro:  0.9116329721073463
Precision micro:  0.9105314232956073
Precision weighted:  0.915296235807644


In [122]:
dataExtractor().extractData().head()

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [121]:
dataExtractor().extractData().category.value_counts()

Lembrancinhas         17524
Decoração              8723
Bebê                   6930
Papel e Cia            2750
Outros                 1133
Bijuterias e Jóias      940
Name: category, dtype: int64

# 3. Data Modeling

In [58]:
class dataModeler(dataFormatter):
    def __init__(self):
        self.model = dataModeler.getModel()
        self.train_data = dataFormatter.train_data
        
    def getModel():
        '''
        Specifies a model to handle the categorization problem.
        # criar etapa de modelagem -métodos se for o caso
        # dar split no train e treinar
        
        
        '''

        model = None
        return model
        
    def writeModel(self):
        '''
        Exports a candidate model to a specified path available
        in the environment variable MODEL_PATH.
        '''

        # criar try: tenta salvar, se nao dá erro.
    
        return pickle.dump(self.model, open(os.getenv('MODEL_PATH'),'wb'))


# 4. Model Validation

In [59]:
class modelValidator(dataModeler, dataFormatter):
    def __init__(self):
        self.model = self.readModel()
        self.validation_data = dataFormatter.validation_data
    
    def readModel(self):
        '''
        get the model selected to handle the categorization problem.
        '''
        
        # criar um try, se nao der entao dar um saveModel
        
        return pickle.load(open(os.getenv('MODEL_PATH'),'rb'))
        
    def validateModel(self):
        '''
        Generates metrics about the model accuracy (precision, recall, F1, etc.)
        for each category and exports them to a specified path available in the 
        environment variable METRICS_PATH.
        '''
        # try k-fold and variants.
        
        # should save metrics in METRICS_PATH
        f = open(os.getenv('METRICS_PATH'), "w")
        f.write("F1:95.0, Precision:87.5")
        f.close()
        pass

# 5. Model Prediction