# English GloVe Embeddings Bag - Implantação

Este componente classifica sentenças com base nos pacotes de embeddings GloVe disponibilizados pela [Stanford](https://nlp.stanford.edu/projects/glove/).
### **Em caso de dúvidas, consulte os [tutoriais da PlatIAgro](https://platiagro.github.io/tutorials/).**

## Declaração de Classe para Predições em Tempo Real

A tarefa de implantação cria um serviço REST para predições em tempo real.<br>
Para isso, você deve criar uma classe `Model` que implementa o método `predict`.

In [None]:
%%writefile Model.py
import logging
import os
import pickle
from typing import Dict, Iterable, List, Union

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from Dataset import ImdbDataset
from Model_Lightning import GloveFinetuner
from platiagro import load_model
from pytorch_lightning.callbacks import ModelCheckpoint

logger = logging.getLogger(__name__)


class Model(object):
    """Serve predictions from a fine-tuned GloVe bag-of-embeddings classifier.

    On construction the class reads the pickled training artifacts and the
    saved model weights from ``/tmp/data``, rebuilds the ``GloveFinetuner``
    model and creates a PyTorch Lightning trainer. ``predict`` is the entry
    point used for inference.
    """

    def __init__(self, dataset: str = None, target: str = None):
        """Load artifacts and weights, then build the model and trainer.

        Args:
            dataset: Not used by this implementation.
            target: Not used by this implementation.
        """
        # Load artifacts: hyperparameters, label encoder, vocabulary, etc.
        # NOTE(review): pickle can execute arbitrary code while loading, so
        # the artifacts file must come from a trusted training run.
        artifacts_file_name = "artifacts.p"
        with open(f"/tmp/data/{artifacts_file_name}", "rb") as artifacts_file:
            artifacts = pickle.load(artifacts_file)
        self.max_epochs = artifacts["hyperparams"]["max_epochs"]
        self.columns = artifacts["deployment_infos"]["columns"]
        self.label_encoder = artifacts["model_parameters"]["label_encoder"]
        self.glove_vocab = artifacts["deployment_infos"]["glove_vocab"]

        # Rebuild the network and load its trained weights.
        self.model = self._build_model(artifacts)

        model_file_name = "pytorch_model.pt"
        # The trainer below runs with gpus=0, so map the stored tensors to the
        # CPU; otherwise weights saved from a GPU fail to load on a CPU host.
        state_dict = torch.load(
            f"/tmp/data/{model_file_name}", map_location="cpu"
        )
        self.model.load_state_dict(state_dict)
        self.model.eval()
        self.trainer = self.fit_model()

    @staticmethod
    def _build_model(artifacts):
        """Instantiate a ``GloveFinetuner`` from the entries of *artifacts*."""
        dataset_infos = {
            "all_data": artifacts["dataset_infos"]["all_data"],
            "CustomDataset": ImdbDataset,
        }
        return GloveFinetuner(
            hyperparams=artifacts["hyperparams"],
            model_parameters=artifacts["model_parameters"],
            dataset_infos=dataset_infos,
            extra_infos=artifacts["extra_infos"],
        )

    def load_checkpoint(self, filepath, artifacts):
        """Return a new ``GloveFinetuner`` built from *artifacts*.

        Args:
            filepath: Accepted but not read; no weights are loaded here.
            artifacts: Dictionary with the ``hyperparams``,
                ``model_parameters``, ``dataset_infos`` and ``extra_infos``
                entries used to construct the model.

        Returns:
            The newly constructed model.
        """
        return self._build_model(artifacts)

    def fit_model(self):
        """Create a CPU trainer, resume from the saved checkpoint and fit.

        If ``/tmp/data/epoch=199.ckpt`` exists, the trainer is asked to resume
        from it. ``trainer.fit`` is then called on ``self.model``.

        Returns:
            The ``pl.Trainer`` instance after ``fit`` has returned.
        """
        cwd = "/tmp/data"
        checkpoint_path = os.path.join(cwd, "epoch=199.ckpt")
        checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
        print(f"Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}")
        print(f"Saving checkpoints to {checkpoint_dir}")
        checkpoint_callback = ModelCheckpoint(
            filepath=checkpoint_dir, save_top_k=-1, monitor="val_acc"
        )  # Keeps all checkpoints.

        resume_from_checkpoint = None
        if os.path.exists(checkpoint_path):
            print(f"Restoring checkpoint: {checkpoint_path}")
            resume_from_checkpoint = checkpoint_path

        trainer = pl.Trainer(
            gpus=0,
            max_epochs=self.max_epochs,
            check_val_every_n_epoch=100,
            profiler=True,
            checkpoint_callback=checkpoint_callback,
            progress_bar_refresh_rate=100,
            resume_from_checkpoint=resume_from_checkpoint,
        )

        trainer.fit(self.model)
        return trainer

    def tokenize_text(self, text_list: list = None):
        """Split the first element of every row on single spaces.

        Args:
            text_list: Rows whose element ``0`` is the text to tokenize.
                ``None`` is treated as an empty input.

        Returns:
            A list with one list of tokens per input row.
        """
        if text_list is None:
            return []
        return [row[0].split(" ") for row in text_list]

    def build_glove_matrix(self, X):
        """Map each sentence to the vocabulary entries of its known words.

        Words that are not keys of ``self.glove_vocab`` are dropped.

        Args:
            X: Rows whose element ``0`` is the sentence text.

        Returns:
            A tuple ``(glove_matrix, word_filtered_matrix)``: for every row,
            the ``self.glove_vocab`` values of the kept words and the kept
            words themselves, in their original order.
        """
        glove_matrix = []
        word_filtered_matrix = []

        for token_line in self.tokenize_text(X):
            # Filter once and reuse the result for both outputs.
            known_words = [
                word for word in token_line if word in self.glove_vocab
            ]
            word_filtered_matrix.append(known_words)
            glove_matrix.append([self.glove_vocab[word] for word in known_words])

        return glove_matrix, word_filtered_matrix

    def predict(
        self, X: np.ndarray, feature_names: Iterable[str], meta: Dict = None
    ) -> Union[np.ndarray, List, str, bytes]:
        """Run the model on *X* and return its prediction.

        Args:
            X: Input rows; element ``0`` of each row (after any column
                reordering) is the sentence text.
            feature_names: Column names of *X*. When non-empty, the columns
                are reordered to ``self.columns`` before inference.
            meta: Not used by this implementation.

        Returns:
            Whatever ``self.model.predict`` returns.
        """
        # Materialize the names so arrays and one-shot iterables are handled
        # the same way as lists (truth-testing a NumPy array raises).
        feature_names = [] if feature_names is None else list(feature_names)
        if feature_names:
            # Reorder the features to the column order used during training.
            df = pd.DataFrame(X, columns=feature_names)
            X = df[self.columns].to_numpy()

        X_inference_glove_ids, X_inference_glove_words = self.build_glove_matrix(X)
        result = self.model.predict(X_inference_glove_ids, X_inference_glove_words)

        return result

In [2]:
%run Model.py

In [3]:
import pickle

from Model import Model

# Read the pickled training artifacts; the context manager closes the file
# handle as soon as loading finishes.
artifacts_file_name = "artifacts.p"
with open(f"/tmp/data/{artifacts_file_name}", "rb") as artifacts_file:
    artifacts = pickle.load(artifacts_file)
X_test = artifacts["deployment_infos"]["X_test"]

# Instantiating Model loads the weights and builds the trainer.
inferenceModel = Model()

INFO:lightning:GPU available: False, used: False


Files in /home/jovyan/tasks/GloVe_Bag_Of_Embeddings_Sentence_Classification: ['pyotrch_model.pt', '.ipynb_checkpoints', 'Experiment.ipynb', 'glove_dir', 'Model.py', 'epoch=199.ckpt', 'lightning_logs', 'glove.6B.zip', 'Deployment.ipynb', 'artifacts.p', '__pycache__', 'epoch=99.ckpt', 'Model_Lightning.py', 'Dataset.py']
Saving checkpoints to /home/jovyan/tasks/GloVe_Bag_Of_Embeddings_Sentence_Classification
Restoring checkpoint: /home/jovyan/tasks/GloVe_Bag_Of_Embeddings_Sentence_Classification/epoch=199.ckpt


INFO:lightning:
  | Name          | Type             | Params
-----------------------------------------------
0 | loss_funtion  | CrossEntropyLoss | 0     
1 | predict_proba | Softmax          | 0     
2 | embedding_bag | EmbeddingBag     | 120 M 
3 | layer1        | Linear           | 90 K  
4 | layer2        | Linear           | 602   
5 | net           | Sequential       | 90 K  


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

INFO:lightning:

Profiler Report

Action              	|  Mean duration (s)	|  Total time (s) 
-----------------------------------------------------------------
on_train_start      	|  0.060309       	|  0.060309       
on_train_end        	|  0.003972       	|  0.003972       






In [5]:
# One sample with a single text column, stored as a 2-D array.
X_test = np.array(
    ["There are many levels of perversity at play. And yet it is transfixing."]
).reshape(1, 1)

In [6]:
# No feature names are supplied, so the input columns are used as given.
resultado = inferenceModel.predict(X_test, feature_names=None)
resultado

Unnamed: 0,ORIGINAL_TARGET,ORIGINAL_CODE,PREDICTED_TARGET,PREDICTED_CODE,NEG_PROBA,POS_PROBA
0,,,positive,1,0.000702,0.999298
