### Created by: Rodrigo Didier, 01/31/21.

# Import Libs

In [1]:
import sys
import os
import re
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.corpus import stopwords
from string import punctuation

from nltk.probability import FreqDist

from tqdm import tqdm
from scipy import sparse as sp_sparse

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# DATA SCIENCE PIPLINE --------------------------------------------------------------------------

# 0. Feature Engineering

# 1. Data Extraction

In [2]:
class dataExtractor:

    def __init__(self):
        self.data_path = os.getenv('DATASET_PATH')
        self.data = self.extractData()

    def extractData(self):
        '''
        Loads a dataset with product data from a specified path.
        '''

        return pd.read_csv(self.data_path)

#  2. Data Formatting

In [3]:
class dataFormatter:
    def __init__(self):
        self.extracted_data = dataExtractor().data
        #self.train_data = self.formatData()[0]
        #self.validation_data = self.formatData()[1]
        
    def formatData(self):
        '''
        Processes the dataset to use it for training and validation.
        '''
        
        # add methods to formating the data to the  modeler.
    
    def categoryToDummy(self):
        # get dummy
        dummy = pd.get_dummies(self.extracted_data['category'])

        # merge df
        df_dummy = pd.merge(self.extracted_data, dummy, left_index=True, right_index=True)

        # delet no dummy cols
        del df_dummy['category']

        return df_dummy
    
    def splitTrainTestValidate(self):
        '''
        Split the dataset into train, validation and test
        '''
        
        # categorical variables list to split
        y_cols = list(self.categoryToDummy().columns[-6:])
        X_cols = list(self.categoryToDummy().columns[:-6])
        
        # full data categorized
        y_data =  self.categoryToDummy()[y_cols] #.values
        X_data =  self.categoryToDummy()[X_cols] #.values
                
        X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.2, train_size=0.8)
        X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25,train_size =0.75)
        
        return (X, y), (X_test, y_test), (X_val, y_val)
    
    def joinStringColumns(self):
        
        (X, y), (X_test, y_test), (X_val, y_val) = self.splitTrainTestValidate()
        f_join_strings = lambda row: row['query'] + ' ' + row['title'] + ' ' + row['concatenated_tags']
        
        X = X.assign(full_text= f_join_strings)
        X_test = X_test.assign(full_text= f_join_strings)
        X_val = X_val.assign(full_text= f_join_strings)
        
        return (X, y), (X_test, y_test), (X_val, y_val)
    
 
    def normalizeFullText(self):
        """Lower text and remove punctuation, articles and extra whitespace."""
    
        
        def normalize_text(s):
            """Lower text and remove punctuation, articles and extra whitespace."""        

            def compost_words(text):
                text = re.sub('[/]', ' ', str(text))
                text = re.sub('[-]', ' ', str(text))
                return text

            def white_space_fix(text):
                return " ".join(text.split())

            def remove_punc(text):
                exclude = set(punctuation)
                return "".join(ch for ch in text if ch not in exclude)

            def lower(text):
                return text.lower()
    
            def remove_stop_words(text):
                from nltk.corpus import stopwords
                stopwords = set(stopwords.words('portuguese') + list(punctuation) + list('/'))
                palavras = word_tokenize(text)
                palavras_sem_stopwords = [palavra for palavra in palavras if palavra not in stopwords]
                return " ".join(palavras_sem_stopwords)
        
            return remove_stop_words(white_space_fix(remove_punc(lower(compost_words(s)))))
    
        (X, y), (X_test, y_test), (X_val, y_val) = self.joinStringColumns()

        X['full_text'] = list(map(normalize_text,list(X['full_text'])))
        X_test['full_text'] = list(map(normalize_text,list(X_test['full_text'])))
        X_val['full_text'] = list(map(normalize_text,list(X_val['full_text'])))
        
        
        return (X, y), (X_test, y_test), (X_val, y_val)
    
    def getBagOfWordsSparse(self):
        (X, y), (X_test, y_test), (X_val, y_val) = self.normalizeFullText()
        
        # first lets trainig the data.
        
        # start counting words
        words_counts = {}
        for text in tqdm(X.full_text):
            for word in text.split():
                words_counts[word] = 1
            words_counts[word] += 1
    
        # get 10k most popular words
        DICT_SIZE = int(os.getenv('DICT_OF_WORDS_SIZE'))
        POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
        WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
        INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
        ALL_WORDS = WORDS_TO_INDEX.keys()
        
        def my_bag_of_words(text, words_to_index, dict_size):
            """
            text: a string
            dict_size: size of the dictionary
        
            return a vector which is a bag-of-words representation of 'text'
            """
            result_vector = np.zeros(dict_size)
            for word in text.split(' '):
                if word in words_to_index:
                    result_vector[words_to_index[word]] +=1
            return result_vector

        X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X.full_text])
        X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test.full_text])
        X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val.full_text])
        
        return (X_train_mybag, y), (X_test_mybag, y_test), (X_val_mybag, y_val)

# 3. Data Modeling

In [4]:
class dataModeler:
    #def __init__(self):
        #self.model = self.createModelBoWClassifier()
        #self.write_model =  self.writeModel()
        #self.train_data = dataFormatter.train_data
        
    def createModelBoWClassifier(self):
        '''
        Specifies a model to handle the categorization problem.
        # criar etapa de modelagem -métodos se for o caso
        # dar split no train e treinar     
        '''
        
        def train_classifier(X_train, y_train, C, regularisation):
            """
            X_train, y_train — training data
      
            return: trained classifier
            """
    
            # Create and fit LogisticRegression wraped into OneVsRestClassifier.

            model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)
            return model
        
        
        (X_train_mybag, y), (X_test_mybag, y_test), (X_val_mybag, y_val) = dataFormatter().getBagOfWordsSparse()
        
        classifier_bow = train_classifier(X_train_mybag, y, C = 4, regularisation = 'l2')
        
        # in this first version, model 'll be saved here in the code.
        pickle.dump(classifier_bow, open(os.getenv('MODEL_PATH'),'wb'))
        
        return classifier_bow
    


# 4. Model Validation

In [5]:
class modelValidator:
    def __init__(self):
        self.model = dataModeler().createModelBoWClassifier()
        #self.validation_data = dataFormatter.validation_data
    
    #def readModel(self):
    #    '''
    #    get the model selected to handle the categorization problem.
    #    '''
    #    
    #    # criar um try, se nao der entao dar um saveModel
    #    
    #    return pickle.load(open(os.getenv('MODEL_PATH'),'rb'))
        
    def validateModel(self):
        '''
        Generates metrics about the model accuracy (precision, recall, F1, etc.)
        for each category and exports them to a specified path available in the 
        environment variable METRICS_PATH.
        '''
        # try k-fold and variants.
        
        # should save metrics in METRICS_PATH
        def write_evaluation_scores(y_test, y_test_predicted, y_val, y_val_predicted):
            
            f = open(os.getenv('METRICS_PATH'), "w")
            f.write(f"model BoW TRAIN metrics:\n")
            f.write(f"F1-score macro: {f1_score(y_test, y_test_predicted, average='macro')}\n")
            f.write(f"F1-score micro: {f1_score(y_test, y_test_predicted, average='micro')}\n")
            f.write(f"F1-score weighted: {f1_score(y_test, y_test_predicted, average='weighted')}\n")
            f.write(f"Precision macro: {average_precision_score(y_test, y_test_predicted, average='macro')}\n")
            f.write(f"Precision micro: {average_precision_score(y_test, y_test_predicted, average='micro')}\n")
            f.write(f"Precision weighted: {average_precision_score(y_test, y_test_predicted, average='weighted')}\n")
            f.write(f"model BoW VALIDATION metrics:\n")
            f.write(f"F1-score macro: {f1_score(y_val, y_val_predicted, average='macro')}\n")
            f.write(f"F1-score micro: {f1_score(y_val, y_val_predicted, average='micro')}\n")
            f.write(f"F1-score weighted: {f1_score(y_val, y_val_predicted, average='weighted')}\n")
            f.write(f"Precision macro: {average_precision_score(y_val, y_val_predicted, average='macro')}\n")
            f.write(f"Precision micro: {average_precision_score(y_val, y_val_predicted, average='micro')}\n")
            f.write(f"Precision weighted: {average_precision_score(y_val, y_val_predicted, average='weighted')}\n")
            f.close()
        
        (X_train_mybag, y), (X_test_mybag, y_test), (X_val_mybag, y_val) = dataFormatter().getBagOfWordsSparse()
        
        y_test_predicted = self.model.predict(X_test_mybag)
        y_val_predicted = self.model.predict(X_val_mybag)
        
        write_evaluation_scores(y_test, y_test_predicted, y_val, y_val_predicted)

        #dataModeler
        #pass

# 5. DS Pipeline

In [6]:
class dataPipeline:
    def __init__(self):
        self.run = modelValidator().validateModel()

In [7]:
dataPipeline().run

100%|██████████| 30400/30400 [00:00<00:00, 313700.06it/s]
100%|██████████| 30400/30400 [00:00<00:00, 353613.01it/s]


In [None]:
print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', average_precision_score(y_test, predicted, average='weighted'))

In [None]:
classifier_mybag = train_classifier(X_train_mybag, y, C = 4, regularisation = 'l2')

In [None]:
y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)

In [None]:
y_test_predicted_scores_mybag = classifier_mybag.decision_function(X_test_mybag)

In [None]:
#os.getenv('DATASET_PATH')
int(os.getenv('DICT_OF_WORDS_SIZE'))

In [None]:
(X, y), (X_test, y_test), (X_val, y_val) = dataFormatter().normalizeFullText()

In [None]:
from tqdm import tqdm
from scipy import sparse as sp_sparse

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [None]:
words_counts = {}
for text in tqdm(X.full_text):
    for word in text.split():
        words_counts[word] = 1
    words_counts[word] += 1
    
DICT_SIZE = 10000
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()



In [None]:
from scipy import sparse as sp_sparse


def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector



In [None]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X.full_text])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test.full_text])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val.full_text])
#print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape)

In [None]:
X_train_mybag

In [None]:
X_test_mybag

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
def train_classifier(X_train, y_train, C, regularisation):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.

    model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)
    return model



In [None]:
classifier_mybag = train_classifier(X_train_mybag, y, C = 4, regularisation = 'l2')

In [None]:
y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)

In [None]:
y_test_predicted_scores_mybag = classifier_mybag.decision_function(X_test_mybag)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [None]:
def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    print('Precision macro: ', average_precision_score(y_test, predicted, average='macro'))
    print('Precision micro: ', average_precision_score(y_test, predicted, average='micro'))
    print('Precision weighted: ', average_precision_score(y_test, predicted, average='weighted'))

In [None]:
print('Model: Bag-of-words\nTrain')
print_evaluation_scores(y_test, y_test_predicted_labels_mybag)

In [None]:
# Validation

In [None]:
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val.full_text])
y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)

print('Bag-of-words\n')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)


In [None]:
dataExtractor().extractData().head()

In [None]:
dataExtractor().extractData().category.value_counts()

# 3. Data Modeling

In [None]:
class dataModeler(dataFormatter):
    def __init__(self):
        self.model = dataModeler.getModel()
        self.train_data = dataFormatter.train_data
        
    def getModel():
        '''
        Specifies a model to handle the categorization problem.
        # criar etapa de modelagem -métodos se for o caso
        # dar split no train e treinar
        
        
        '''

        model = None
        return model
        
    def writeModel(self):
        '''
        Exports a candidate model to a specified path available
        in the environment variable MODEL_PATH.
        '''

        # criar try: tenta salvar, se nao dá erro.
    
        return pickle.dump(self.model, open(os.getenv('MODEL_PATH'),'wb'))


# 4. Model Validation

In [None]:
class modelValidator(dataModeler, dataFormatter):
    def __init__(self):
        self.model = self.readModel()
        self.validation_data = dataFormatter.validation_data
    
    def readModel(self):
        '''
        get the model selected to handle the categorization problem.
        '''
        
        # criar um try, se nao der entao dar um saveModel
        
        return pickle.load(open(os.getenv('MODEL_PATH'),'rb'))
        
    def validateModel(self):
        '''
        Generates metrics about the model accuracy (precision, recall, F1, etc.)
        for each category and exports them to a specified path available in the 
        environment variable METRICS_PATH.
        '''
        # try k-fold and variants.
        
        # should save metrics in METRICS_PATH
        f = open(os.getenv('METRICS_PATH'), "w")
        f.write("F1:95.0, Precision:87.5")
        f.close()
        pass

# 5. Model Prediction