# TensorFlow Serving and Preprocessing layers

https://www.tensorflow.org/guide/keras/preprocessing_layers

I livelli di pre-elaborazione permettono di creare pipeline di elaborazione dell'input, così da ottenere modelli realmente end-to-end: modelli che accettano dati grezzi come input e gestiscono autonomamente la normalizzazione delle feature.

### (Alcuni) Livelli disponibili

**Pre-elaborazione del testo**
 - tf.keras.layers.TextVectorization

**Pre-elaborazione delle caratteristiche numeriche**

 - tf.keras.layers.Normalization
 - tf.keras.layers.Discretization

**Data augmentation delle immagini**

Questi livelli applicano trasformazioni casuali a un batch di immagini. Sono attivi solo durante l'addestramento.

 - tf.keras.layers.RandomCrop
 - tf.keras.layers.RandomFlip
 - tf.keras.layers.RandomTranslation
 - tf.keras.layers.RandomRotation
 - tf.keras.layers.RandomZoom
 - tf.keras.layers.RandomHeight
 - tf.keras.layers.RandomWidth
 - tf.keras.layers.RandomContrast

## Pre-elaborazione dei dati prima del modello o all'interno del modello?

### Prepare Dataset

In [1]:
import os

In [3]:
# Dataset root directory; the glob patterns in this file look for
# <path>/<sub-directory>/<file>.txt beneath it.
path = '../../Research/Wiki_Dataset/input_dir'

In [4]:
os.listdir(path)

['sport', 'salute', 'tecnologia']

In [5]:
import glob
import numpy as np
import pandas as pd
import re
import time

In [6]:
import logging
import re
import string
from typing import Tuple, Union, List, Dict

In [7]:
import numpy as np
import tensorflow as tf
from keras.layers import TextVectorization

In [8]:
# Configure logging at INFO level and create the logger used by the cells below.
level = logging.INFO
logging.basicConfig(level=level)
logger = logging.getLogger(__name__)

In [9]:
# Collect every .txt file located exactly one directory level below `path`.
# glob.glob already returns a fresh list, and the pattern contains no "**",
# so neither an extra copy nor recursive=True is needed.
files = glob.glob(f"{path}/*/*.txt")

In [10]:
files[0]

'../../Research/Wiki_Dataset/input_dir/sport/articolo_sport_336.txt'

In [11]:
# Maps each class name (as it appears in the file names) to its integer label.
class2id = {'sport':0, 'salute':1, 'tecnologia':2}

In [12]:
# Reverse lookup of class2id: integer label -> class name.
id2class = {v:k for k,v in class2id.items()}

In [13]:
def get_class(file):
    """Return the integer class id encoded in an article file name.

    The file name (last path component) is split on underscores and its
    second token is looked up in ``class2id``; for example a file named
    ``articolo_sport_336.txt`` yields ``class2id['sport']``.

    Args:
        file: Path to an article file.

    Returns:
        The integer id that ``class2id`` associates with the class token.

    Raises:
        IndexError: If the file name contains no underscore.
        KeyError: If the class token is not a key of ``class2id``.
    """
    # os.path.basename honours the platform's path separator, so paths that
    # use backslashes (e.g. glob results on Windows) are handled as well as
    # forward-slash paths.
    filename = os.path.basename(file)
    label = filename.split("_")[1]

    return class2id[label]

In [14]:
get_class(files[1000])

2

In [15]:
def read_text_data(txt_file:str) -> str:
    """Load a UTF-8 text file and return the content between its <body> tags.

    The file is flattened to a single line before matching: lines are joined
    with a space and every newline character is then replaced by a space.

    Args:
        txt_file: Path of the file to read.

    Returns:
        The text captured between ``<body>`` and ``</body>`` in the first match.

    Raises:
        IndexError: If the flattened text contains no ``<body>...</body>`` span.
    """
    with open(txt_file, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()

    # Each line keeps its trailing "\n"; after the join and the replacement a
    # line break therefore ends up as two consecutive spaces.
    flattened = " ".join(lines).replace("\n", " ")
    bodies = re.findall(r"<body>(.*)</body>", flattened)

    return bodies[0]

In [16]:
# read_text_data(files[1000])

In [17]:
# '[%s]'%re.escape(string.punctuation)

In [18]:
# Integer label of every file, in the same order as `files`.
classes = [get_class(file) for file in files]
# classes[:10]

In [19]:
# <body> text of every file, in the same order as `files`.
dataset = [read_text_data(file) for file in files]
# dataset[0]

In [20]:
class TFModel(tf.Module):
    """``tf.Module`` wrapper that exposes a Keras model through a fixed signature.

    The ``prediction`` method is traced as a ``tf.function`` with a single
    string tensor of shape ``(1,)`` as input, so it can be passed as a
    serving signature when the model is exported.
    """

    def __init__(self, model: tf.keras.Model) -> None:
        """Store the model to be served.

        Args:
            model: Keras model that is called on the raw string input.
        """
        # tf.Module's own initializer must run so the module is set up
        # (name, name scope) before attributes are attached to it.
        super().__init__()
        self.model = model

    @tf.function(input_signature=[tf.TensorSpec(shape=(1,), dtype=tf.string)])
    def prediction(self, review: tf.Tensor) -> Dict[str, tf.Tensor]:
        """Run the wrapped model on one raw text.

        Args:
            review: String tensor of shape ``(1,)`` holding the text to classify.

        Returns:
            A dictionary whose ``'prediction'`` entry is the model output.
        """
        return {
            'prediction': self.model(review)
        }

In [21]:
class ModelTrainer:
    """Build, train and export a text classifier that embeds its own preprocessing.

    The Keras model takes raw strings as input: a ``TextVectorization`` layer
    adapted on the training texts is the first layer of the network, followed
    by an embedding, an LSTM and a softmax output with one unit per class.
    """

    def __init__(self) -> None:
        """Set the hyper-parameters used to build and train the model."""
        # Populated by train(); None until a model has been trained.
        self.tf_model_wrapper: Union[TFModel, None] = None

        # Architecture and training hyper-parameters
        self.embed_size = 128
        self.max_features = 20000
        self.epochs = 10
        self.batch_size = 128
        self.max_len = 100

    def fetch_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Read every article under ``path`` together with its one-hot label.

        The texts are also stored on ``self.text_dataset``.

        Returns:
            A 4-tuple ``(texts, labels, texts, labels)``. The same two arrays
            are returned twice: no separate held-out split is produced.
        """
        files = glob.glob(f"{path}/*/*.txt")

        labels = np.array([get_class(file) for file in files])
        # Fix the one-hot width to the number of known classes so it does not
        # depend on which labels happen to be present in the data.
        classes = tf.keras.utils.to_categorical(labels, num_classes=len(class2id))

        dataset = np.array([read_text_data(file) for file in files])

        self.text_dataset = dataset

        return dataset, classes, dataset, classes

    def custom_preprocessing(self, raw_text: tf.Tensor) -> tf.Tensor:
        """Standardize raw text before tokenization.

        Lower-cases the text, removes literal ``<br/>`` tags, strips every
        character of ``string.punctuation`` and collapses runs of spaces into
        a single space.

        Args:
            raw_text: String tensor with the raw input text.

        Returns:
            String tensor with the cleaned text.
        """
        lowercase = tf.strings.lower(raw_text)
        # Must run before punctuation is stripped, otherwise the tag would be
        # reduced to the bare letters "br".
        stripped_html = tf.strings.regex_replace(lowercase, "<br/>", "")

        txt = tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
        txt = tf.strings.regex_replace(txt, "[ ]+", " ")

        return txt

    def init_vectorize_layer(self, text_dataset: np.ndarray) -> TextVectorization:
        """Create a ``TextVectorization`` layer and adapt it to the given texts.

        Args:
            text_dataset: Array of raw texts used to build the vocabulary.

        Returns:
            The adapted layer, producing integer sequences of length
            ``self.max_len`` with at most ``self.max_features`` tokens.
        """
        text_vectorizer = TextVectorization(max_tokens=self.max_features,
                                            standardize=self.custom_preprocessing,
                                            output_mode='int',
                                            output_sequence_length=self.max_len)

        with tf.device('/cpu:0'):
            text_vectorizer.adapt(text_dataset)

        return text_vectorizer

    def init_model(self, text_dataset: np.ndarray) -> tf.keras.Model:
        """Build and compile the end-to-end classification model.

        Args:
            text_dataset: Array of raw texts the vectorization layer is
                adapted on.

        Returns:
            The compiled Keras model, taking raw strings as input.
        """
        logger.info("initialize vectorize_layer")
        # Adapt on the texts passed by the caller rather than on instance
        # state, so the method does not require fetch_data() to have run.
        vectorize_layer = self.init_vectorize_layer(text_dataset)

        logger.info("initialize Model")
        raw_input = tf.keras.Input(shape=(1,), dtype=tf.string)

        x = vectorize_layer(raw_input)
        x = tf.keras.layers.Embedding(self.max_features + 1,
                                      self.embed_size,
                                      input_length=self.max_len)(x)

        x = tf.keras.layers.LSTM(128)(x)

        # One output unit per known class, matching the one-hot label width.
        predictions = tf.keras.layers.Dense(len(class2id), activation='softmax')(x)

        model = tf.keras.Model(raw_input, predictions)

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        return model

    def train(self) -> None:
        """Fetch the data, train the model and export it as a SavedModel.

        The trained model is wrapped in a ``TFModel`` (kept on
        ``self.tf_model_wrapper``) and saved under
        ``classifier/saved_models/<unix timestamp>`` with the wrapper's
        ``prediction`` function as the ``serving_default`` signature.
        """
        train_data, train_labels, _, _ = self.fetch_data()

        model = self.init_model(train_data)

        logger.info("initialize Training")

        with tf.device('/cpu:0'):
            model.fit(train_data, train_labels,
                      epochs=self.epochs,
                      batch_size=self.batch_size,
                      shuffle=True)

        self.tf_model_wrapper = TFModel(model)

        tf.saved_model.save(self.tf_model_wrapper.model,
                            export_dir=f'classifier/saved_models/{int(time.time())}',
                            signatures={'serving_default':self.tf_model_wrapper.prediction})

        logger.info("Model saved")

In [22]:
classifier = ModelTrainer()

In [24]:
classifier.train()

INFO:__main__:initialize vectorize_layer
INFO:__main__:initialize Model
INFO:__main__:initialize Training


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: classifier/saved_models/1676126291/assets


INFO:tensorflow:Assets written to: classifier/saved_models/1676126291/assets
INFO:__main__:Model saved
