# Development of Intelligent Computing Systems _ 2021  IME-USP
- course [page][3]
- taught by: MSc [Renato Cordeiro Ferreira][1]
- student: [Rodrigo Didier Anderson][2]

[1]: https://www.linkedin.com/in/renatocf/
[2]: https://www.linkedin.com/in/didier11/
[3]: https://www.ime.usp.br/verao/index.php

This is the first part of the course project, in which we create the training pipeline for a categorization model.

More specifically, the goal is to train a model that should receive data related to products and return the best categories for them.


- More details about this stage of the project can be found [here][4].
- More information about the data can be found [here][5].

[4]: https://github.com/didier-rda/intelligent-systems-project/blob/main/training/README.md
[5]: https://github.com/didier-rda/intelligent-systems-project/blob/main/data/README.md

## Training Pipeline  
(less than 5 minutes)

The training pipeline consists of the following steps.

For each step, a class was created with the methods needed to carry out the respective stage of the pipeline.

1. **Data extraction** <br>
   Loads a dataset with product data from a specified path available in the
   environment variable `DATASET_PATH`.
   
   class: `dataExtractor`



2. **Data formatting** <br>
   Processes the dataset to use it for training and validation.
   
   class: `dataFormatter`



3. **Data Modeling & Model Exportation** <br>
   - Specifies a model to handle the categorization problem;
   - Exports a candidate model to a specified path available in the environment
     variable `MODEL_PATH`;
   
   class: `dataModeler`



4. **Model validation** <br>
   Generates metrics about the model accuracy (precision, recall, F1, etc.)
   for each category and exports them to a specified path available in the
   environment variable `METRICS_PATH`.

   class: `modelValidator`
   



To schedule the pipeline, a final class, `dataPipeline`, was created.

Its purpose is to run the whole pipeline: it calls the validation class, which in turn triggers the preceding stages (modeling, formatting and extraction) as they are needed.

# Import libs

In [1]:
# sys and data processing libs
import sys
import os
import re
import time
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

# nlp data corpus and libs
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from string import punctuation

from nltk.probability import FreqDist

from scipy import sparse as sp_sparse

# ML modules from sklearn
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# 1. Data Extraction

In [2]:
class dataExtractor:
    """Stage 1 of the pipeline: read the raw product dataset.

    The CSV location is taken from the ``DATASET_PATH`` environment variable
    and the file is loaded as soon as the object is constructed.
    """

    def __init__(self):
        # Where the dataset lives; None when DATASET_PATH is not set.
        self.data_path = os.environ.get('DATASET_PATH')
        # Loaded eagerly so callers can read ``.data`` straight away.
        self.data = self.extractData()

    def extractData(self):
        """Read the CSV at ``self.data_path`` and return it as a DataFrame."""
        dataset = pd.read_csv(self.data_path)
        return dataset

#  2. Data Formatting

In [3]:
class dataFormatter:
    """Stage 2 of the pipeline: turn the extracted dataset into model inputs.

    The public methods form a chain in which each one calls the previous one:
    ``categoryToDummy`` -> ``splitTrainTestValidate`` -> ``joinStringColumns``
    -> ``normalizeFullText`` -> ``getBagOfWordsSparse``.
    Every method from ``splitTrainTestValidate`` onwards returns three
    ``(features, labels)`` pairs in the order train, test, validation.
    """

    # Text columns that are concatenated into the single ``full_text`` feature.
    _TEXT_COLUMNS = ('query', 'title', 'concatenated_tags')

    def __init__(self):
        # Constructing the extractor loads the dataset (see dataExtractor).
        self.extracted_data = dataExtractor().data

    def categoryToDummy(self):
        """Replace the ``category`` column by one indicator column per category.

        Returns:
            A DataFrame with the original columns (minus ``category``)
            followed by the indicator columns produced by ``pd.get_dummies``.
        """
        dummy = pd.get_dummies(self.extracted_data['category'])

        # Join on the row index so each row keeps its own indicators.
        df_dummy = pd.merge(self.extracted_data, dummy, left_index=True, right_index=True)

        # The raw label is now redundant with the indicator columns.
        del df_dummy['category']

        return df_dummy

    def splitTrainTestValidate(self):
        """Split the dataset into train (60%), test (20%) and validation (20%).

        Returns:
            ``(X_train, y_train), (X_test, y_test), (X_val, y_val)`` where the
            ``y`` frames hold the category indicator columns and the ``X``
            frames hold every other column.
        """
        # Build the dummy-encoded frame once and reuse it for both halves.
        df_dummy = self.categoryToDummy()

        # The indicator columns are the trailing ones; their number equals the
        # number of distinct categories (6 for the course dataset).
        n_labels = self.extracted_data['category'].nunique()
        first_label = len(df_dummy.columns) - n_labels

        X_data = df_dummy.iloc[:, :first_label]
        y_data = df_dummy.iloc[:, first_label:]

        # First hold out 20% for testing, then 25% of the remaining 80%
        # (i.e. 20% of the total) for validation.
        X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.2, train_size=0.8)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, train_size=0.75)

        # Return the 60% training part, not the 80% pre-split set: the latter
        # still contains the validation rows and would leak them into training.
        return (X_train, y_train), (X_test, y_test), (X_val, y_val)

    def joinStringColumns(self):
        """Add a ``full_text`` column joining query, title and tags with spaces.

        Returns:
            The three ``(X, y)`` pairs from ``splitTrainTestValidate`` with
            ``full_text`` added to each ``X``.
        """
        (X, y), (X_test, y_test), (X_val, y_val) = self.splitTrainTestValidate()

        def join_text(frame):
            # ``assign`` passes the whole DataFrame, so this works column-wise.
            # Missing values become empty strings; otherwise a single NaN
            # would turn the whole concatenation into NaN.
            parts = [frame[col].fillna('').astype(str) for col in self._TEXT_COLUMNS]
            joined = parts[0]
            for part in parts[1:]:
                joined = joined + ' ' + part
            return joined

        X = X.assign(full_text=join_text)
        X_test = X_test.assign(full_text=join_text)
        X_val = X_val.assign(full_text=join_text)

        return (X, y), (X_test, y_test), (X_val, y_val)

    def normalizeFullText(self):
        """Lower-case ``full_text`` and strip punctuation, stop words and extra spaces.

        Returns:
            The three ``(X, y)`` pairs from ``joinStringColumns`` with a
            normalized ``full_text`` column.
        """
        # Imported locally, as in the original notebook, and under an alias so
        # the module name is not shadowed by the stop-word set built from it.
        from nltk.corpus import stopwords as nltk_stopwords

        # Built once here instead of once per row.
        punctuation_chars = set(punctuation)
        stop_words = set(nltk_stopwords.words('portuguese')) | punctuation_chars | {'/'}

        def normalize_text(raw):
            """Normalize one text value (anything accepted by ``str``)."""
            # Split compound words written with '/' or '-' before punctuation
            # is removed, so "a/b" becomes "a b" rather than "ab".
            text = re.sub(r'[/-]', ' ', str(raw))
            text = text.lower()
            text = ''.join(ch for ch in text if ch not in punctuation_chars)
            text = ' '.join(text.split())
            tokens = word_tokenize(text)
            return ' '.join(token for token in tokens if token not in stop_words)

        (X, y), (X_test, y_test), (X_val, y_val) = self.joinStringColumns()

        for frame in (X, X_test, X_val):
            frame['full_text'] = [normalize_text(text) for text in frame['full_text']]

        return (X, y), (X_test, y_test), (X_val, y_val)

    @staticmethod
    def _countWords(texts):
        """Return a dict mapping each whitespace-separated word to its frequency."""
        words_counts = {}
        for text in texts:
            for word in text.split():
                words_counts[word] = words_counts.get(word, 0) + 1
        return words_counts

    @staticmethod
    def _bagOfWords(text, words_to_index, dict_size):
        """Return the bag-of-words count vector of ``text``.

        Args:
            text: a string of whitespace-separated words.
            words_to_index: dict mapping a known word to its vector position.
            dict_size: length of the returned vector.

        Returns:
            A 1-D numpy array of length ``dict_size``; words that are not in
            ``words_to_index`` are ignored.
        """
        result_vector = np.zeros(dict_size)
        for word in text.split():
            if word in words_to_index:
                result_vector[words_to_index[word]] += 1
        return result_vector

    def getBagOfWordsSparse(self):
        """Encode ``full_text`` as sparse bag-of-words matrices.

        The vocabulary is built from the training texts only and limited to
        the ``DICT_OF_WORDS_SIZE`` most frequent words. The word-to-index
        mapping is pickled to the path in the ``WORDS_TO_INDEX`` environment
        variable so new data can be encoded the same way at prediction time.

        Returns:
            ``(X_train_bow, y_train), (X_test_bow, y_test), (X_val_bow, y_val)``
            where each ``X`` is a scipy sparse matrix with one row per sample.
        """
        (X, y), (X_test, y_test), (X_val, y_val) = self.normalizeFullText()

        # Word frequencies over the training data.
        words_counts = self._countWords(X.full_text)

        # Keep only the most popular words to bound the feature dimension.
        DICT_SIZE = int(os.getenv('DICT_OF_WORDS_SIZE'))
        POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
        WORDS_TO_INDEX = {word: rank for rank, word in enumerate(POPULAR_WORDS)}

        # Persist the vocabulary; the context manager closes the file.
        with open(os.getenv('WORDS_TO_INDEX'), 'wb') as vocab_file:
            pickle.dump(WORDS_TO_INDEX, vocab_file)

        def to_sparse(texts):
            # One sparse row per text, stacked into a single matrix.
            return sp_sparse.vstack([
                sp_sparse.csr_matrix(self._bagOfWords(text, WORDS_TO_INDEX, DICT_SIZE))
                for text in texts
            ])

        X_train_mybag = to_sparse(X.full_text)
        X_test_mybag = to_sparse(X_test.full_text)
        X_val_mybag = to_sparse(X_val.full_text)

        return (X_train_mybag, y), (X_test_mybag, y_test), (X_val_mybag, y_val)

# 3. Data Modeling & Model Exportation

In [4]:
class dataModeler:
    """Stage 3 of the pipeline: train the classifier and export the artefacts."""

    @staticmethod
    def _dumpToEnvPath(obj, env_var):
        """Pickle ``obj`` to the file whose path is stored in ``env_var``.

        The file is opened in a context manager so the handle is flushed and
        closed even if pickling fails.
        """
        with open(os.getenv(env_var), 'wb') as out_file:
            pickle.dump(obj, out_file)

    def createModelBoWClassifier(self):
        """Train a bag-of-words classifier and persist it with its data splits.

        The features come from ``dataFormatter().getBagOfWordsSparse()``. The
        fitted model is pickled to the path in ``MODEL_PATH`` and the three
        data splits to the paths in ``X_TRAIN``/``Y_TRAIN``, ``X_TEST``/
        ``Y_TEST`` and ``X_VAL``/``Y_VAL``.

        Returns:
            The fitted ``OneVsRestClassifier``.
        """

        def train_classifier(X_train, y_train, C, regularisation):
            """Fit a LogisticRegression wrapped in OneVsRestClassifier.

            Args:
                X_train, y_train: training features and labels.
                C: value passed as ``C`` to LogisticRegression.
                regularisation: value passed as ``penalty`` to LogisticRegression.

            Returns:
                The trained classifier.
            """
            estimator = LogisticRegression(penalty=regularisation, C=C, max_iter=10000)
            return OneVsRestClassifier(estimator).fit(X_train, y_train)

        (X_train_mybag, y), (X_test_mybag, y_test), (X_val_mybag, y_val) = dataFormatter().getBagOfWordsSparse()

        classifier_bow = train_classifier(X_train_mybag, y, C=4, regularisation='l2')

        # In this first version the model is saved directly from here.
        self._dumpToEnvPath(classifier_bow, 'MODEL_PATH')

        # The splits are saved too, so the validation stage can reload exactly
        # the same test and validation rows.
        self._dumpToEnvPath(X_train_mybag, 'X_TRAIN')
        self._dumpToEnvPath(y, 'Y_TRAIN')
        self._dumpToEnvPath(X_test_mybag, 'X_TEST')
        self._dumpToEnvPath(y_test, 'Y_TEST')
        self._dumpToEnvPath(X_val_mybag, 'X_VAL')
        self._dumpToEnvPath(y_val, 'Y_VAL')
        return classifier_bow
    


# 4. Model Validation

In [5]:
class modelValidator:
    """Stage 4 of the pipeline: evaluate the trained model and export metrics."""

    def __init__(self):
        # Constructing the validator trains (and exports) the model first.
        self.model = dataModeler().createModelBoWClassifier()
        # Display names of the categories, passed to classification_report.
        self.target_names = ['Bebê','Bijuterias e Jóias','Decoração','Lembrancinhas','Outros','Papel e Cia']

    @staticmethod
    def _loadFromEnvPath(env_var):
        """Unpickle and return the object stored at the path held in ``env_var``.

        NOTE(review): pickle must only be used on trusted files; these paths
        are expected to point at files written by the modeling stage.
        """
        with open(os.getenv(env_var), 'rb') as in_file:
            return pickle.load(in_file)

    def validateModel(self):
        """Write per-category metrics for the test and validation sets.

        Reloads the test and validation splits from the paths in ``X_TEST``,
        ``Y_TEST``, ``X_VAL`` and ``Y_VAL``, predicts with ``self.model`` and
        writes one ``classification_report`` per split (precision, recall,
        F1, ...) to the path in the ``METRICS_PATH`` environment variable.

        Returns:
            None.
        """

        def write_evaluation_scores(y_test, y_test_predicted, y_val, y_val_predicted):
            """Write both classification reports to METRICS_PATH."""
            test_report = classification_report(
                y_test, y_test_predicted, target_names=self.target_names, zero_division=0)
            val_report = classification_report(
                y_val, y_val_predicted, target_names=self.target_names, zero_division=0)

            # UTF-8 is set explicitly because the category names contain
            # accented characters; the context manager closes the file even
            # if a write fails.
            with open(os.getenv('METRICS_PATH'), "w", encoding="utf-8") as f:
                f.write("model BoW TEST metrics:\n")
                f.write(f"\n{test_report}\n")
                f.write("\nmodel BoW VALIDATION metrics:\n")
                f.write(f"\n{val_report}\n")

        X_test_mybag = self._loadFromEnvPath('X_TEST')
        y_test = self._loadFromEnvPath('Y_TEST')
        X_val_mybag = self._loadFromEnvPath('X_VAL')
        y_val = self._loadFromEnvPath('Y_VAL')

        y_test_predicted = self.model.predict(X_test_mybag)
        y_val_predicted = self.model.predict(X_val_mybag)

        write_evaluation_scores(y_test, y_test_predicted, y_val, y_val_predicted)

# 5. DS Pipeline

In [6]:
class dataPipeline:
    """Stage 5: run the whole training pipeline.

    All the work happens in the constructor. ``run`` is an attribute, not a
    method: it stores whatever ``validateModel()`` returned.
    """

    def __init__(self):
        # Building the validator triggers the earlier stages of the pipeline.
        validator = modelValidator()
        self.run = validator.validateModel()

In [7]:
# Constructing dataPipeline is what executes the pipeline; ``run`` is only an
# attribute holding validateModel()'s return value (None), so reading it here
# has no further effect.
dataPipeline().run