### Requirements

In [None]:
numpy==1.19.5
scipy==1.6.0
pandas==1.1.5
scikit-learn==0.24.1
xgboost==1.2.0
tensorflow==2.4.1
jupyter>=1.0.0

### 1. NLP

#### A. Util function

In [None]:
import re
import json
import time
from datetime import datetime
import pandas as pd
from pathlib import Path
from typing import List, Optional, Dict, Any
# Token pattern; alternatives are tried left to right:
#   1. letters, '&', one letter        -> keeps "at&t" / "h&m" as a single token
#   2. a token starting with a letter  -> "pierre", "i'm", "abc123"
#   3. a token containing a letter     -> "42x"
# `[A-Za-z]` replaces the former `[aA-zZ]`, whose `A-z` range also matched the ASCII
# punctuation located between 'Z' and 'a' ([ \ ] ^ _ `).
# Inside `[\w^\']` the caret is not the first character, so it is a literal '^', not a negation.
general_regex = re.compile(r'[A-Za-z]+&[A-Za-z]|[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')


def load_stopwords(path: str) -> Dict[str, bool]:
    """
    Load a stopword list from a text file holding one stopword per line.

    Parameters
    ----------
    path: str
        Path of the stopword file. The file is decoded as UTF-8.

    Returns
    ----------
    d_stopwords : dict of str -> bool
        Mapping of each stopword to True. Surrounding whitespace is stripped and blank lines are skipped,
        so a trailing newline does not create an empty-string entry.

    """
    lines = Path(path).read_text(encoding='utf-8').splitlines()
    return {word: True for word in (line.strip() for line in lines) if word}


def clean_text(text: str, d_stopwords: Dict[str, bool], l_token_filter: Optional[List[str]] = None) -> List[str]:
    """
    Clean a text string for NLP analysis.

    The text is lower-cased, tokenized with `tokenize_text_pattern` and, when a custom filter is given,
    stripped of the tokens listed in it.

    Parameters
    ----------
    text: str
        A text (string) to normalize.

    d_stopwords: dict of str -> bool
        Stopword lookup, forwarded unchanged to `tokenize_text_pattern`.

    l_token_filter: list of str, optional
        Custom tokens to filter out. Matching ignores case: both the text and the filter entries are
        lower-cased before being compared.

    Returns
    ----------
    tokens : list of str
        The tokens kept from the text, i.e. the machine-readable form of the text used for further
        processing.

    """
    tokens = tokenize_text_pattern(text.lower(), d_stopwords)

    if l_token_filter is None:
        return tokens

    # Lower-case the filter once and store it in a set: each membership test is then O(1)
    # instead of a scan of the whole filter list for every token.
    excluded = {token.lower() for token in l_token_filter}
    return [t for t in tokens if t not in excluded]

def tokenize_text_pattern(text: str, d_stopwords: Dict[str, bool]) -> List[str]:
    """
    Split a text into tokens and drop the uninformative ones.

    Candidate tokens are the successive matches of the module-level `general_regex` pattern. A candidate is
    kept when it is at least 2 characters long and is not flagged as a stopword.

    Parameters
    ----------
    text : str
        text that should be tokenized.

    d_stopwords : dict of str -> bool
        Stopword lookup: a token mapped to a truthy value is dropped, a token without entry is kept.

    Returns
    -------
    list
        list of token (str) built from input text, in order of appearance.
    """
    return [
        candidate
        for candidate in general_regex.findall(text)
        if len(candidate) >= 2 and not d_stopwords.get(candidate, False)
    ]

#### B. Usage

In [19]:
# Demo of the NLP helpers: load the stopwords, clean a sample sentence and show the resulting tokens.

# Stopword file, read as one word per line (relative path, resolved against the working directory)
path_stopwords = 'nltk_data/stopwords/english'

# Sample text mixing words, numbers, punctuation, an emoticon and an e-mail address
test_text = """
    Hello ! my name is Pierre I'm 30  and I work at SIA Partner for + 6 month :) my phone number 
    is 0612 and my e-mail is pg@sia.com 
"""

d_stopwords = load_stopwords(path_stopwords)
# 'pg' is passed as a custom token to filter out, on top of the stopwords
l_tokens = clean_text(test_text, d_stopwords, ['pg'])

print(l_tokens)

['hello', 'name', 'pierre', "i'm", 'work', 'sia', 'partner', 'month', 'phone', 'number', 'mail', 'sia', 'com']


### 2. AutoEncoder 

#### A. Util class

In [27]:
from typing import Tuple
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import Model
from tensorflow.keras import metrics   


class Autoencoder(Model):
    """
    Convolutional autoencoder for inputs of shape (n_steps, n_embedding).

    Only the encoder is built for now (`self.encoder`); the decoder is still a TODO, so a full forward pass
    through `call` is not available yet.
    """

    def __init__(self, latent_dim: int, n_kernels: int, kernel_size: int, input_dim: Tuple[int, int]):
        """
        Parameters
        ----------
        latent_dim : int
            Size of the encoder output (units of the final dense layer).

        n_kernels : int
            Number of filters of the 1D convolution.

        kernel_size : int
            Length of the 1D convolution window.

        input_dim : tuple of int
            Shape of one input sample, (n_steps, n_embedding), without the batch axis.
        """
        super().__init__()

        # I/O dim
        self.latent_dim, self.input_dim = latent_dim, input_dim

        # Kernel's dim
        self.n_kernels, self.kernel_size = n_kernels, kernel_size

        # Create encoder / decoder
        self.encoder = self.__create_encoder(self.latent_dim, self.n_kernels, self.kernel_size, self.input_dim)

        # The decoder does not exist yet: define the attribute anyway so that `call` can fail with an
        # explicit message instead of an AttributeError.
        self.decoder = None

        # TODO: self.decoder = ...
        # TODO: train autoencoder (handle padding, etc ...)

    @staticmethod
    def __create_encoder(
        latent_dim: int, n_kernels: int, kernel_size: int, input_dim: Tuple[int, int]
    ) -> Model:
        """
        Implement the forward propagation for the encoding layers:
        CONV1D -> RELU -> MAXPOOL -> DENSE -> OUTPUT

        inputs dim are (n_batch, n_steps, n_embedding)

        In the case of chain of character encoding n_embedding = 36 and n_steps is large enough so that most
        of the sentence won't be truncated

        Parameters
        ----------
        latent_dim : int
            Number of units of the final dense layer.

        n_kernels : int
            Number of filters of the 1D convolution.

        kernel_size : int
            Length of the 1D convolution window.

        input_dim : tuple of int
            Shape of one input sample, (n_steps, n_embedding).

        Returns
        -------
        Model
            Keras functional model named 'model_embedding' whose output has `latent_dim` features per sample.
        """
        X_input = layers.Input(input_dim)

        # 1D conv with activation; padding="same" keeps the number of steps unchanged
        X = layers.Conv1D(
            n_kernels, kernel_size, activation='relu', padding="same", input_shape=input_dim
        )(X_input)

        # Max pool layer: the pool spans all the steps, so a single maximum per kernel is kept
        X = layers.MaxPooling1D(pool_size=input_dim[0], padding='valid')(X)

        # Flatten
        X = layers.Flatten()(X)

        # End with a dense FC layer
        X = layers.Dense(latent_dim, name='output_layer')(X)

        model_encoding = Model(inputs=X_input, outputs=X, name='model_embedding')

        return model_encoding

    def call(self, x):
        """
        Encode then decode `x`.

        Raises
        ------
        NotImplementedError
            While no decoder has been assigned to `self.decoder`.
        """
        if self.decoder is None:
            raise NotImplementedError(
                "Autoencoder.decoder is not implemented yet; call `autoencoder.encoder(x)` to get the encoding."
            )

        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


#### B. Usage

In [28]:
# Parameters
input_dim, latent_dim = (1000, 36), 25
n_kernels, kernel_size = 100, 5

autoencoder = Autoencoder(latent_dim, n_kernels, kernel_size, input_dim)

# Example of dataset => 5 random matrices of shape `input_dim`, i.e. (1000, 36), drawn in a single
# vectorized call instead of stacking 5000 separately drawn rows [final shape (5, 1000, 36)] :
ax_fake_data = np.random.randn(5, *input_dim)

# Only the encoder is used here: the output has one row of `latent_dim` values per sample
autoencoder.encoder(ax_fake_data)

<tf.Tensor: shape=(5, 25), dtype=float32, numpy=
array([[-1.2880787 ,  3.08959   ,  8.532077  ,  2.198817  ,  1.4167652 ,
         0.35796553, -3.099937  , -2.7360947 , -1.5057857 ,  1.1456753 ,
         1.1126702 ,  0.57407707, -0.6077388 , -1.3103471 , -2.8989682 ,
        -0.5535148 ,  0.04779186,  1.5541102 ,  3.1268792 ,  1.9724679 ,
         3.6592827 ,  5.032964  , -3.4500494 ,  3.4744127 , -0.47673264],
       [-1.4928756 ,  3.2384596 ,  8.42879   ,  3.2178411 ,  1.4419081 ,
         1.3574014 , -2.7454884 , -2.8523047 , -3.067702  ,  1.7219293 ,
         0.7400052 ,  0.36554098, -0.82152396, -0.8702831 , -2.797946  ,
        -1.3560873 , -0.2812631 ,  1.5519583 ,  2.8478532 ,  1.8341883 ,
         4.020779  ,  4.803356  , -3.4221663 ,  3.9082623 , -0.77048296],
       [-0.9479112 ,  3.8784826 ,  8.545241  ,  3.4300733 ,  0.6023769 ,
         1.0034474 , -2.3583612 , -3.0714471 , -2.2961705 ,  1.2910479 ,
         0.59781843,  0.05937263, -0.78640825, -1.1196426 , -2.2951586 ,
