# TensorFlow Serving and Preprocessing layers

__TODO: insert links__

### Prepare Dataset

In [1]:
import os

In [2]:
# Root folder of the text corpus; the glob further down expects <path>/<class>/<file>.txt.
path = '../../Research/Wiki_Dataset/input_dir'

In [3]:
os.listdir(path)

['sport', 'salute', 'tecnologia']

In [4]:
import glob
import numpy as np
import pandas as pd
import re
import time

In [5]:
import logging
import re
import string
from typing import Tuple, Union, List, Dict

In [6]:
import numpy as np
import tensorflow as tf
from keras.layers import TextVectorization

In [7]:
# Notebook-wide logging: a logger named after this module, with the root
# handler configured to emit INFO records and above.
logger = logging.getLogger(__name__)
level = logging.INFO
logging.basicConfig(level=level)

In [8]:
# Collect every article file: one sub-folder per class, one .txt per article.
# NOTE: `recursive=True` only affects `**` patterns, so it has no effect on
# this pattern; it is kept to leave the call itself untouched.
files = glob.glob(f"{path}/*/*.txt", recursive=True)

In [9]:
files[0]

'../../Research/Wiki_Dataset/input_dir/sport/articolo_sport_336.txt'

In [10]:
# Label -> integer id, numbered in the order the labels are listed here.
class2id = {label: idx for idx, label in enumerate(('sport', 'salute', 'tecnologia'))}

In [11]:
# Reverse lookup: integer id -> label.
id2class = dict(zip(class2id.values(), class2id.keys()))

In [12]:
def get_class(file):
    """Return the integer class id encoded in an article's file name.

    File names look like ``articolo_<label>_<n>.txt``; the second
    underscore-separated token of the base name is looked up in ``class2id``.

    Raises:
        IndexError: the base name contains no underscore.
        KeyError: the extracted label is not a key of ``class2id``.
    """
    # basename honours the platform's path separator, so paths produced by
    # glob on Windows (which contain backslashes) are handled as well.
    filename = os.path.basename(file)
    label = filename.split("_")[1]

    return class2id[label]

In [13]:
get_class(files[1000])

2

In [14]:
def read_text_data(txt_file:str) -> str:
    """Return the text enclosed by ``<body>`` and ``</body>`` in *txt_file*.

    The file is read as UTF-8 and flattened to a single line (every line
    break becomes a space) before the body is extracted, so the body may
    span several lines of the file.

    Raises:
        IndexError: the file contains no ``<body>...</body>`` pair.
    """
    with open(txt_file, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()

    # Lines keep their trailing newline: joining them with a space and then
    # turning each newline into a space yields two spaces per line break.
    flattened = " ".join(lines).replace("\n", " ")
    bodies = re.findall(r"<body>(.*)</body>", flattened)

    return bodies[0]

In [15]:
# read_text_data(files[1000])

In [16]:
# Integer label of every file, aligned index-by-index with `files`.
classes = list(map(get_class, files))
# classes[:10]

In [17]:
# Body text of every file, aligned index-by-index with `files`.
dataset = list(map(read_text_data, files))
# dataset[0]

In [18]:
class TFModel(tf.Module):
    """``tf.Module`` wrapper that exposes a Keras model through a traced
    ``prediction`` function suitable for use as a SavedModel signature."""

    def __init__(self, model: tf.keras.Model) -> None:
        """Store *model*; it is invoked by :meth:`prediction`."""
        # Let tf.Module run its own initialisation before attaching attributes.
        super().__init__()
        self.model = model

    # The signature fixes the input to a string tensor of shape (1,), i.e.
    # exactly one text per call.
    @tf.function(input_signature=[tf.TensorSpec(shape=(1,), dtype=tf.string)])
    def prediction(self, review: tf.Tensor) -> Dict[str, tf.Tensor]:
        """Run the wrapped model on *review*.

        Args:
            review: string tensor of shape ``(1,)`` holding the raw text.

        Returns:
            A dict with the single key ``'prediction'`` mapped to the
            model's output for *review*.
        """
        return {
            'prediction': self.model(review)
        }

In [19]:
class ModelTrainer:
    """Build, train and export an LSTM text classifier whose first layer is a
    ``TextVectorization`` layer, so the exported model accepts raw strings."""

    def __init__(self) -> None:
        """Set the hyper-parameters; no data is loaded and no model is built."""
        # Assigned by train() after fitting.
        self.tf_model_wrapper: TFModel

        # Architecture
        self.embed_size = 128       # embedding dimension
        self.max_features = 20000   # vocabulary cap (max_tokens of the vectorizer)
        self.epochs = 1
        self.batch_size = 128
        self.max_len = 50           # tokens kept per text (output_sequence_length)

    def fetch_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Load every article under ``path`` together with its one-hot label.

        Also stores the texts on ``self.text_dataset``.

        Returns:
            ``(texts, labels, texts, labels)``: the same two arrays are
            returned twice; no separate held-out split is produced.
        """
        files = glob.glob(f"{path}/*/*.txt", recursive=True)

        class_ids = [get_class(file) for file in files]
        classes = tf.keras.utils.to_categorical(np.array(class_ids))

        dataset = np.array([read_text_data(file) for file in files])

        self.text_dataset = dataset

        return dataset, classes, dataset, classes

    def custom_preprocessing(self, raw_text: tf.Tensor) -> tf.Tensor:
        """Standardise text for the vectorizer.

        Lower-cases the input, removes literal ``<br/>`` tags, strips every
        character of ``string.punctuation`` and collapses runs of spaces
        into a single space.
        """
        lowercase = tf.strings.lower(raw_text)
        stripped_html = tf.strings.regex_replace(lowercase, "<br/>", "")

        txt = tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
        txt = tf.strings.regex_replace(txt, "[ ]+", " ")

        return txt

    def init_vectorize_layer(self, text_dataset: np.ndarray) -> TextVectorization:
        """Create a ``TextVectorization`` layer and adapt it to *text_dataset*."""
        text_vectorizer = TextVectorization(max_tokens=self.max_features,
                                            standardize=self.custom_preprocessing,
                                            output_mode='int',
                                            output_sequence_length=self.max_len)

        text_vectorizer.adapt(text_dataset)

        return text_vectorizer

    def init_model(self, text_dataset: np.ndarray) -> tf.keras.Model:
        """Build and compile the classifier.

        Args:
            text_dataset: texts used to adapt the vectorizer's vocabulary.

        Returns:
            A compiled Keras model mapping a raw string input to a softmax
            over 3 classes.
        """
        logger.info("initialize vectorize_layer")
        # Adapt on the texts handed in by the caller instead of silently
        # relying on fetch_data() having populated self.text_dataset.
        vectorize_layer = self.init_vectorize_layer(text_dataset)

        logger.info("initialize Model")
        raw_input = tf.keras.Input(shape=(1,), dtype=tf.string)

        x = vectorize_layer(raw_input)
        x = tf.keras.layers.Embedding(self.max_features + 1,
                                      self.embed_size,
                                      input_length=self.max_len)(x)

        x = tf.keras.layers.LSTM(128)(x)

        predictions = tf.keras.layers.Dense(3, activation='softmax')(x)

        model = tf.keras.Model(raw_input, predictions)

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        return model

    def train(self) -> None:
        """Fetch the data, fit the model on CPU and export it.

        The model is saved under ``classifier/saved_models/<unix timestamp>``
        with ``TFModel.prediction`` registered as the ``serving_default``
        signature.
        """
        train_data, train_labels, _, _ = self.fetch_data()

        model = self.init_model(train_data)

        logger.info("initialize Training")

        with tf.device('/cpu:0'):
            model.fit(train_data, train_labels,
                      epochs=self.epochs,
                      batch_size=self.batch_size,
                      shuffle=True)

        self.tf_model_wrapper = TFModel(model)

        tf.saved_model.save(self.tf_model_wrapper.model,
                            export_dir=f'classifier/saved_models/{int(time.time())}',
                            signatures={'serving_default': self.tf_model_wrapper.prediction})

        logger.info("Model saved")

In [20]:
# tf.losses.categorical_crossentropy

In [21]:
# '[%s]'%re.escape(string.punctuation)

In [22]:
# The constructor only stores hyper-parameters; loading data and training
# happen in fetch_data() / train().
classifier = ModelTrainer()

In [23]:
# classifier.train()

In [24]:
# fetch_data() returns the same (texts, labels) pair twice; keep the first pair.
train_data, train_labels, _, _ = classifier.fetch_data()

In [25]:
# Stand-alone copies of the ModelTrainer hyper-parameters, used by the cells below.
embed_size = 128
max_features = 20  # originally written "20#000": the zeros are commented out, so this is 20, not the 20000 used by ModelTrainer; presumably shrunk on purpose for a quick test, TODO confirm
epochs = 1
batch_size = 128
max_len = 5  # originally written "5#0": this is 5, not the 50 used by ModelTrainer; same caveat, TODO confirm

In [26]:
def custom_preprocessing(self, raw_text=None) -> tf.Tensor:
    """Standardise text for a ``TextVectorization`` layer.

    Lower-cases the input, removes literal ``<br/>`` tags, strips every
    character of ``string.punctuation`` and collapses runs of spaces into a
    single space.

    This module-level function still carries a leading ``self`` parameter.
    A ``standardize`` callable is invoked with the text as its only
    argument, so a one-argument call is interpreted as
    ``custom_preprocessing(text)``; the two-argument form
    ``custom_preprocessing(anything, text)`` keeps working.

    Args:
        self: the text when called with a single argument, otherwise ignored.
        raw_text: string tensor to clean (optional, see above).

    Returns:
        The cleaned string tensor.
    """
    if raw_text is None:
        # Single-argument call: the text arrived in the first positional slot.
        raw_text = self

    lowercase = tf.strings.lower(raw_text)
    stripped_html = tf.strings.regex_replace(lowercase, "<br/>", "")

    txt = tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
    txt = tf.strings.regex_replace(txt, "[ ]+", " ")

    return txt

In [1]:
# Build the vectorizer and fit its vocabulary on the training texts, with
# both steps pinned to the CPU.
with tf.device('/cpu:0'):
    text_vectorizer = TextVectorization(
        max_tokens=max_features,
        standardize=custom_preprocessing,
        output_mode='int',
        output_sequence_length=max_len,
    )
    text_vectorizer.adapt(train_data)

NameError: name 'tf' is not defined