<a href="https://colab.research.google.com/github/marco-siino/text_preprocessing_impact/blob/main/TextPreProImpact_FNS_SVM_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## The impact of text preprocessing on state-of-the-art NLP models for modern challenges. An evaluative survey. - - - SVM ON FAKE NEWS SPREADERS DS EXPERIMENTS NOTEBOOK - - -
Support Vector Machine on Fake News Spreaders Dataset.
Code by M. Siino. 

From the paper: "The impact of text preprocessing on state-of-the-art NLP models for modern challenges. An evaluative survey." by M.Siino et al.



## Importing modules.

In [None]:
import matplotlib.pyplot as plt
import os
import random
import re
import shutil
import string
import tensorflow as tf
import nltk

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from google.colab import files
from io import open
from numpy.random import seed
import numpy as np
from pathlib import Path
from sklearn import naive_bayes,svm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from textblob import TextBlob
nltk.download('stopwords')
nltk.download('punkt')

os.environ['TF_CUDNN_DETERMINISTIC']='true'
os.environ['TF_DETERMINISTIC_OPS']='true'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Import the dataset and extract it into the current working directory.

In [None]:
# Url obtained starting from this: https://drive.google.com/file/d/19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP/ and forcing export=download.
urlTrainingSet = "https://drive.google.com/uc?export=download&id=19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP"

training_set = tf.keras.utils.get_file("pan20-author-profiling-training-2020-02-23.zip", urlTrainingSet,
                                    extract=True, archive_format='zip',cache_dir='.',
                                    cache_subdir='')

training_set_dir = os.path.join(os.path.dirname(training_set), 'pan20-author-profiling-training-2020-02-23')

print(training_set)
print(training_set_dir)

!ls -A

Downloading data from https://drive.google.com/uc?export=download&id=1ifAvjKCUFff-8KlB99dUBCD50xu36dOd
./pan21-author-profiling-training-2021-03-14.zip
./pan21-author-profiling-training-2021-03-14
.config
pan21-author-profiling-training-2021-03-14
pan21-author-profiling-training-2021-03-14.zip
sample_data


## Build folders hierarchy to use Keras folders preprocessing function.

In [None]:
### Training Folders. ###

# First level directory.
if not os.path.exists('train_dir_en'):
    os.makedirs('train_dir_en')
if not os.path.exists('train_dir_es'):
    os.makedirs('train_dir_es')

# Class labels directory.
if not os.path.exists('train_dir_en/0'):
    os.makedirs('train_dir_en/0')
if not os.path.exists('train_dir_es/0'):
    os.makedirs('train_dir_es/0')
if not os.path.exists('train_dir_en/1'):
    os.makedirs('train_dir_en/1')
if not os.path.exists('train_dir_es/1'):
    os.makedirs('train_dir_es/1')

# Make Py variables.
train_dir='train_dir_'

!ls -A

.config						sample_data
pan21-author-profiling-training-2021-03-14	train_dir_en
pan21-author-profiling-training-2021-03-14.zip	train_dir_es


## Set language and directory paths.


In [None]:
# Locate the ground-truth file (truth.txt) of each language inside the training set.
# No ground-truth file is available for the test set.
language='en'
truth_file_training_dir_en = '{}/{}/'.format(training_set_dir, language)
truth_file_training_path_en = '{}truth.txt'.format(truth_file_training_dir_en)

language='es'
truth_file_training_dir_es = '{}/{}/'.format(training_set_dir, language)
truth_file_training_path_es = '{}truth.txt'.format(truth_file_training_dir_es)

## Read truth.txt to organize training dataset folders.



In [None]:
# Move every author file listed in a language's truth.txt into
# ./train_dir_<language>/<label>/, renaming it from <id>.xml to <id>.txt.
truth_sources = (
    ('en', truth_file_training_dir_en, truth_file_training_path_en),
    ('es', truth_file_training_dir_es, truth_file_training_path_es),
)

for language, truth_dir, truth_path in truth_sources:
    # The context manager closes truth.txt even if one of the moves fails.
    with open(truth_path, "r") as f:
        for line in f:
            # Records are split at ':::' into <author id> and <label>;
            # lines without a label part (e.g. blank lines) are skipped.
            x = line.split(":::")
            if len(x) < 2 or not x[1]:
                continue

            fNameXml = x[0]+'.xml'
            fNameTxt = x[0]+'.txt'
            # Keep only the first character of the second field, so the
            # trailing newline is not part of the label.
            label = x[1][0]

            # Files that are not (or no longer) in the source folder are skipped,
            # which keeps the cell safe to re-run after a previous move.
            source_path = truth_dir+fNameXml
            if os.path.exists(source_path):
                os.rename(source_path, './train_dir_'+language+'/'+label+'/'+fNameTxt)

## Generate full training set (union of en and es). Then training and validation.



In [None]:
# Generate the full randomized training set.
# Each language is loaded one file per batch, in directory order (no shuffling yet).
batch_size=1
language="es"
es_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir+language,
    batch_size=batch_size,
    shuffle=False
    )

language="en"
en_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir+language,
    batch_size=batch_size,
    shuffle=False
    )

# Union of both languages: all Spanish batches followed by all English batches.
full_train_ds=es_train_ds.concatenate(en_train_ds)
full_train_ds_size=len(full_train_ds)

# Shuffle once, with a fixed seed, and keep that order for every later pass.
# The buffer is sized to the whole dataset: a buffer smaller than the dataset
# only shuffles locally, which would leave the language/class ordering of the
# concatenation partly intact. For a 400-element dataset this equals the
# previously hard-coded buffer of 400.
full_train_ds=full_train_ds.shuffle(full_train_ds_size,seed=1, reshuffle_each_iteration=False)

Found 200 files belonging to 2 classes.
Found 200 files belonging to 2 classes.


In [None]:
# 5-fold cross-validation layout. %T -> training share, %V -> validation share.
#
#   Fold 1 -> 80%T | 20%V
#   Fold 2 -> 60%T | 20%V | 20%T
#   Fold 3 -> 40%T | 20%V | 40%T
#   Fold 4 -> 20%T | 20%V | 60%T
#   Fold 5 ->        20%V | 80%T

train=[]
val=[]

# Validation window, expressed as percentages of full_train_ds. It starts at the
# tail of the dataset and moves towards the head by one window per fold.
val_percentage_start=80
val_percentage_end=100
val_percentage_size=20
total_fold_nr=5

for i in range(total_fold_nr):
  val_begin = int(full_train_ds_size*val_percentage_start/100)
  val_stop = int(full_train_ds_size*val_percentage_end/100)
  val_len = int(full_train_ds_size*val_percentage_size/100)

  # Training data: everything before the window followed by everything after it.
  before_window = full_train_ds.take(val_begin)
  after_window = full_train_ds.skip(val_stop)
  train.append(before_window.concatenate(after_window))

  # Validation data: the window itself.
  val.append(full_train_ds.skip(val_begin).take(val_len))

  # Slide the window for the next fold.
  val_percentage_start-=val_percentage_size
  val_percentage_end-=val_percentage_size

## Functions to pre-process source text. (A detailed discussion is given in our paper.)

In [None]:
# Do-nothing function.
def do_nothing(input_data):
  """Baseline pre-processing: only normalise the author header.

  Each `<author lang=".." class="..">` header (together with the newline and
  tab that follow it) is replaced by a class-free `<author_lang="..">` marker,
  so the class label is no longer part of the text. Nothing else is changed.

  Args:
    input_data: string tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    The string tensor with the header replaced.
  """
  header_rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
  ]

  output_data = input_data
  for header, marker in header_rules:
    output_data = tf.strings.regex_replace(output_data, header, marker)

  return output_data

# Our custom function.
def mareli(input_data):
  """Custom markup clean-up of an author XML file.

  Steps, applied in order with tf.strings.regex_replace:
    1. each `<author lang=".." class="..">` header (plus the following newline
       and tab) becomes a class-free `<author_lang="..">` marker;
    2. the CDATA opener `<![CDATA[` becomes ' <' and the closer (one or more
       ']' followed by '>') is removed;
    3. `<documents>` / `</documents>` tags are removed together with the
       newline and up to two tabs that follow them;
    4. a space right after `<document>` is removed, and the newline and up to
       two tabs after `</document>` are replaced by a single space.

  Args:
    input_data: string tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    The cleaned string tensor.
  """
  # Patterns containing regex escapes (\!, \[, \]) are raw strings: in a normal
  # literal those are invalid escape sequences. Patterns that need a real
  # newline/tab character keep the normal literal form.
  rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'<\!\[CDATA\[', ' <'),
      (r'\]{1,}>', ''),
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ''),
      ('<document> ', '<document>'),
      ('</document>\n(\t){0,2}', '</document> '),
  ]

  output_data = input_data
  for pattern, rewrite in rules:
    output_data = tf.strings.regex_replace(output_data, pattern, rewrite)

  return output_data

# Lowercasing function.
def lowercase(input_data):
  """Normalise the author header, then lowercase the whole text.

  Each `<author lang=".." class="..">` header (with its trailing newline and
  tab) is first replaced by a class-free `<author_lang="..">` marker; the
  result is then passed through tf.strings.lower.

  Args:
    input_data: string tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    The lowercased string tensor.
  """
  header_rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
  ]

  normalised = input_data
  for header, marker in header_rules:
    normalised = tf.strings.regex_replace(normalised, header, marker)

  return tf.strings.lower(normalised)

# Marks, special characters etc... removing function.
def only_alphanumeric(input_data):
  """Normalise the author header, then blank out non-alphanumeric characters.

  After the `<author lang=".." class="..">` header has been replaced by the
  class-free `<author_lang="..">` marker, every character that is not an
  ASCII letter (a-z, A-Z), a digit or whitespace is replaced by one space.
  The marker's own punctuation is blanked out as well.

  Args:
    input_data: string tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    The filtered string tensor.
  """
  rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'[^a-zA-Z0-9\s]', ' '),
  ]

  filtered = input_data
  for pattern, rewrite in rules:
    filtered = tf.strings.regex_replace(filtered, pattern, rewrite)

  return filtered

# Stop word removal function. 
def stop_word_removal(input_data):
  """Remove NLTK stop words from the first document held by `input_data`.

  The `<author lang=".." class="..">` header is first replaced by a class-free
  `<author_lang="..">` marker. The text is then tokenised with TextBlob and
  every token found in the NLTK stop-word list is dropped: the Spanish list is
  used when the `<author_lang="es">` marker is present, the English list
  otherwise. Matching is case-sensitive. Remaining tokens are joined with
  single spaces.

  Args:
    input_data: string tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    * scalar input: a scalar string tensor with the filtered text;
    * batched input: a string tensor of shape (1, 1) built from the first
      element only (other elements are not processed);
    * input whose value cannot be read with `.numpy()`, or an empty batch:
      the header-normalised tensor with stop words left in place.
  """
  header_rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
  ]
  output_data = input_data
  for header, marker in header_rules:
    output_data = tf.strings.regex_replace(output_data, header, marker)

  try:
    raw_text = output_data.numpy()
  except Exception:
    # No concrete value is available (e.g. a symbolic tensor): pass it through.
    return output_data

  # A scalar string tensor converts to `bytes`; a batched one converts to an
  # array that is unwrapped down to its first element.
  is_scalar = isinstance(raw_text, (bytes, str))
  try:
    while not isinstance(raw_text, (bytes, str)):
      raw_text = raw_text[0]
  except (IndexError, TypeError):
    # Empty batch or unexpected container: nothing to filter.
    return output_data

  # Decode the bytes rather than slicing their repr(), which would turn every
  # non-ASCII character, newline and tab into a literal backslash escape.
  if isinstance(raw_text, bytes):
    text = raw_text.decode('utf-8', errors='replace')
  else:
    text = raw_text

  corpus = 'spanish' if '<author_lang="es">' in text else 'english'
  # Build the stop-word set once per call instead of calling
  # stopwords.words() again for every token.
  stop_words = set(stopwords.words(corpus))
  output_string = ' '.join(word for word in TextBlob(text).words if word not in stop_words)

  if is_scalar:
    return tf.constant(output_string)
  return tf.constant([[output_string]])


## Define the 16 combination functions (optimized, so no function calls another function).

In [None]:
def sw_lo(input_data):
  """Combination: lowercasing + stop-word removal.

  The input is lowercased, the `<author lang=".." class="..">` header is
  replaced by a class-free `<author_lang="..">` marker, and the first document
  is tokenised with TextBlob. Tokens found in the NLTK stop-word list are
  dropped (Spanish list when the `<author_lang="es">` marker is present,
  English list otherwise) and the rest are joined with single spaces.

  Args:
    input_data: string tensor (or value accepted by tf.strings.lower).

  Returns:
    * scalar input: a scalar string tensor with the filtered text;
    * batched input: a string tensor of shape (1, 1) built from the first
      element only (other elements are not processed);
    * input whose value cannot be read with `.numpy()`, or an empty batch:
      the lowercased, header-normalised tensor with stop words left in place.
  """
  header_rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
  ]
  output_data = tf.strings.lower(input_data)
  for header, marker in header_rules:
    output_data = tf.strings.regex_replace(output_data, header, marker)

  try:
    raw_text = output_data.numpy()
  except Exception:
    # No concrete value is available (e.g. a symbolic tensor): pass it through.
    return output_data

  # A scalar string tensor converts to `bytes`; a batched one converts to an
  # array that is unwrapped down to its first element.
  is_scalar = isinstance(raw_text, (bytes, str))
  try:
    while not isinstance(raw_text, (bytes, str)):
      raw_text = raw_text[0]
  except (IndexError, TypeError):
    # Empty batch or unexpected container: nothing to filter.
    return output_data

  # Decode the bytes rather than slicing their repr(), which would turn every
  # non-ASCII character, newline and tab into a literal backslash escape.
  if isinstance(raw_text, bytes):
    text = raw_text.decode('utf-8', errors='replace')
  else:
    text = raw_text

  corpus = 'spanish' if '<author_lang="es">' in text else 'english'
  # Build the stop-word set once per call instead of calling
  # stopwords.words() again for every token.
  stop_words = set(stopwords.words(corpus))
  output_string = ' '.join(word for word in TextBlob(text).words if word not in stop_words)

  if is_scalar:
    return tf.constant(output_string)
  return tf.constant([[output_string]])

def sw_lo_ma(input_data):
  """Combination: lowercasing + custom markup clean-up + stop-word removal.

  The input is lowercased, then cleaned with regex replacements: the author
  header becomes a class-free `<author_lang="..">` marker, the CDATA opener
  becomes ' <' and the closer is removed, `<documents>` tags are dropped, and
  whitespace around `<document>` tags is normalised. The first document is
  then tokenised with TextBlob; tokens found in the NLTK stop-word list are
  dropped (Spanish list when the `<author_lang="es">` marker is present,
  English list otherwise) and the rest are joined with single spaces.

  Args:
    input_data: string tensor (or value accepted by tf.strings.lower).

  Returns:
    * scalar input: a scalar string tensor with the filtered text;
    * batched input: a string tensor of shape (1, 1) built from the first
      element only (other elements are not processed);
    * input whose value cannot be read with `.numpy()`, or an empty batch:
      the lowercased, cleaned tensor with stop words left in place.
  """
  # Patterns containing regex escapes (\!, \[, \]) are raw strings: in a normal
  # literal those are invalid escape sequences. Patterns that need a real
  # newline/tab character keep the normal literal form.
  rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'<\!\[CDATA\[', ' <'),
      (r'\]{1,}>', ''),
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ''),
      ('<document> ', '<document>'),
      ('</document>\n(\t){0,2}', '</document> '),
  ]
  output_data = tf.strings.lower(input_data)
  for pattern, rewrite in rules:
    output_data = tf.strings.regex_replace(output_data, pattern, rewrite)

  try:
    raw_text = output_data.numpy()
  except Exception:
    # No concrete value is available (e.g. a symbolic tensor): pass it through.
    return output_data

  # A scalar string tensor converts to `bytes`; a batched one converts to an
  # array that is unwrapped down to its first element.
  is_scalar = isinstance(raw_text, (bytes, str))
  try:
    while not isinstance(raw_text, (bytes, str)):
      raw_text = raw_text[0]
  except (IndexError, TypeError):
    # Empty batch or unexpected container: nothing to filter.
    return output_data

  # Decode the bytes rather than slicing their repr(), which would turn every
  # non-ASCII character, newline and tab into a literal backslash escape.
  if isinstance(raw_text, bytes):
    text = raw_text.decode('utf-8', errors='replace')
  else:
    text = raw_text

  corpus = 'spanish' if '<author_lang="es">' in text else 'english'
  # Build the stop-word set once per call instead of calling
  # stopwords.words() again for every token.
  stop_words = set(stopwords.words(corpus))
  output_string = ' '.join(word for word in TextBlob(text).words if word not in stop_words)

  if is_scalar:
    return tf.constant(output_string)
  return tf.constant([[output_string]])

def oa_ma(input_data):
  """Combination: custom markup clean-up + only-alphanumeric filtering.

  Regex replacements are applied in order: the author header becomes a
  class-free `<author_lang="..">` marker, the CDATA opener becomes ' <' and
  the closer is removed, `<documents>` tags are dropped, whitespace around
  `<document>` tags is normalised, and finally every character that is not an
  ASCII letter, a digit or whitespace is replaced by one space.

  Args:
    input_data: string tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    The cleaned and filtered string tensor.
  """
  # Patterns containing regex escapes (\!, \[, \], \s) are raw strings: in a
  # normal literal those are invalid escape sequences. Patterns that need a
  # real newline/tab character keep the normal literal form.
  rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'<\!\[CDATA\[', ' <'),
      (r'\]{1,}>', ''),
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ''),
      ('<document> ', '<document>'),
      ('</document>\n(\t){0,2}', '</document> '),
      (r'[^a-zA-Z0-9\s]', ' '),
  ]

  output_data = input_data
  for pattern, rewrite in rules:
    output_data = tf.strings.regex_replace(output_data, pattern, rewrite)

  return output_data

def oa_lo(input_data):
  """Combination: lowercasing + only-alphanumeric filtering.

  The input is lowercased, the author header is replaced by a class-free
  `<author_lang="..">` marker, and every character that is not an ASCII
  letter, a digit or whitespace is then replaced by one space.

  Args:
    input_data: string tensor (or value accepted by tf.strings.lower).

  Returns:
    The lowercased and filtered string tensor.
  """
  rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'[^a-zA-Z0-9\s]', ' '),
  ]

  filtered = tf.strings.lower(input_data)
  for pattern, rewrite in rules:
    filtered = tf.strings.regex_replace(filtered, pattern, rewrite)

  return filtered

def oa_lo_ma(input_data):
  """Combination: lowercasing + custom markup clean-up + only-alphanumeric.

  The input is lowercased, then regex replacements are applied in order: the
  author header becomes a class-free `<author_lang="..">` marker, the CDATA
  opener becomes ' <' and the closer is removed, `<documents>` tags are
  dropped, whitespace around `<document>` tags is normalised, and finally
  every character that is not an ASCII letter, a digit or whitespace is
  replaced by one space.

  Args:
    input_data: string tensor (or value accepted by tf.strings.lower).

  Returns:
    The lowercased, cleaned and filtered string tensor.
  """
  # Patterns containing regex escapes (\!, \[, \], \s) are raw strings: in a
  # normal literal those are invalid escape sequences. Patterns that need a
  # real newline/tab character keep the normal literal form.
  rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'<\!\[CDATA\[', ' <'),
      (r'\]{1,}>', ''),
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ''),
      ('<document> ', '<document>'),
      ('</document>\n(\t){0,2}', '</document> '),
      (r'[^a-zA-Z0-9\s]', ' '),
  ]

  output_data = tf.strings.lower(input_data)
  for pattern, rewrite in rules:
    output_data = tf.strings.regex_replace(output_data, pattern, rewrite)

  return output_data

def oa_sw(input_data):
  """Combination: only-alphanumeric filtering + stop-word removal.

  The author header is replaced by a class-free `<author_lang="..">` marker
  and every character that is not an ASCII letter, a digit or whitespace is
  replaced by one space. The first document is then tokenised with TextBlob;
  tokens found in the NLTK stop-word list are dropped and the rest are joined
  with single spaces.

  The stop-word language is read from the text as it was BEFORE the
  alphanumeric filter, because that filter also blanks out the punctuation of
  the `<author_lang="es">` marker and so makes it undetectable afterwards.
  The Spanish list is used when the marker is present, the English list
  otherwise.

  Args:
    input_data: string tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    * scalar input: a scalar string tensor with the filtered text;
    * batched input: a string tensor of shape (1, 1) built from the first
      element only (other elements are not processed);
    * input whose value cannot be read with `.numpy()`, or an empty batch:
      the alphanumeric-filtered tensor with stop words left in place.
  """
  header_rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
  ]
  tagged = input_data
  for header, marker in header_rules:
    tagged = tf.strings.regex_replace(tagged, header, marker)

  output_data = tf.strings.regex_replace(tagged, r'[^a-zA-Z0-9\s]', ' ')

  def _as_text(value):
    # Decode the bytes rather than slicing their repr(), which would turn
    # newlines and tabs into literal backslash escapes.
    if isinstance(value, bytes):
      return value.decode('utf-8', errors='replace')
    return value

  try:
    raw_text = output_data.numpy()
    raw_tagged = tagged.numpy()
  except Exception:
    # No concrete value is available (e.g. a symbolic tensor): pass it through.
    return output_data

  # A scalar string tensor converts to `bytes`; a batched one converts to an
  # array that is unwrapped down to its first element. Both tensors have the
  # same shape, so they are unwrapped in lock-step.
  is_scalar = isinstance(raw_text, (bytes, str))
  try:
    while not isinstance(raw_text, (bytes, str)):
      raw_text, raw_tagged = raw_text[0], raw_tagged[0]
  except (IndexError, TypeError):
    # Empty batch or unexpected container: nothing to filter.
    return output_data

  text = _as_text(raw_text)
  corpus = 'spanish' if '<author_lang="es">' in _as_text(raw_tagged) else 'english'
  # Build the stop-word set once per call instead of calling
  # stopwords.words() again for every token.
  stop_words = set(stopwords.words(corpus))
  output_string = ' '.join(word for word in TextBlob(text).words if word not in stop_words)

  if is_scalar:
    return tf.constant(output_string)
  return tf.constant([[output_string]])

def oa_sw_ma(input_data):
  """Combination: markup clean-up + only-alphanumeric + stop-word removal.

  Regex replacements are applied in order: the author header becomes a
  class-free `<author_lang="..">` marker, the CDATA opener becomes ' <' and
  the closer is removed, `<documents>` tags are dropped, whitespace around
  `<document>` tags is normalised, and every character that is not an ASCII
  letter, a digit or whitespace is replaced by one space. The first document
  is then tokenised with TextBlob; tokens found in the NLTK stop-word list
  are dropped and the rest are joined with single spaces.

  The stop-word language is read from the text as it was BEFORE the
  alphanumeric filter, because that filter also blanks out the punctuation of
  the `<author_lang="es">` marker and so makes it undetectable afterwards.
  The Spanish list is used when the marker is present, the English list
  otherwise.

  Args:
    input_data: string tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    * scalar input: a scalar string tensor with the filtered text;
    * batched input: a string tensor of shape (1, 1) built from the first
      element only (other elements are not processed);
    * input whose value cannot be read with `.numpy()`, or an empty batch:
      the cleaned, alphanumeric-filtered tensor with stop words left in place.
  """
  # Patterns containing regex escapes (\!, \[, \]) are raw strings: in a normal
  # literal those are invalid escape sequences. Patterns that need a real
  # newline/tab character keep the normal literal form.
  markup_rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'<\!\[CDATA\[', ' <'),
      (r'\]{1,}>', ''),
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ''),
      ('<document> ', '<document>'),
      ('</document>\n(\t){0,2}', '</document> '),
  ]
  tagged = input_data
  for pattern, rewrite in markup_rules:
    tagged = tf.strings.regex_replace(tagged, pattern, rewrite)

  output_data = tf.strings.regex_replace(tagged, r'[^a-zA-Z0-9\s]', ' ')

  def _as_text(value):
    # Decode the bytes rather than slicing their repr(), which would turn
    # newlines and tabs into literal backslash escapes.
    if isinstance(value, bytes):
      return value.decode('utf-8', errors='replace')
    return value

  try:
    raw_text = output_data.numpy()
    raw_tagged = tagged.numpy()
  except Exception:
    # No concrete value is available (e.g. a symbolic tensor): pass it through.
    return output_data

  # A scalar string tensor converts to `bytes`; a batched one converts to an
  # array that is unwrapped down to its first element. Both tensors have the
  # same shape, so they are unwrapped in lock-step.
  is_scalar = isinstance(raw_text, (bytes, str))
  try:
    while not isinstance(raw_text, (bytes, str)):
      raw_text, raw_tagged = raw_text[0], raw_tagged[0]
  except (IndexError, TypeError):
    # Empty batch or unexpected container: nothing to filter.
    return output_data

  text = _as_text(raw_text)
  corpus = 'spanish' if '<author_lang="es">' in _as_text(raw_tagged) else 'english'
  # Build the stop-word set once per call instead of calling
  # stopwords.words() again for every token.
  stop_words = set(stopwords.words(corpus))
  output_string = ' '.join(word for word in TextBlob(text).words if word not in stop_words)

  if is_scalar:
    return tf.constant(output_string)
  return tf.constant([[output_string]])

def oa_sw_lo(input_data):
  """Combination: lowercasing + only-alphanumeric + stop-word removal.

  The input is lowercased, the author header is replaced by a class-free
  `<author_lang="..">` marker, and every character that is not an ASCII
  letter, a digit or whitespace is replaced by one space. The first document
  is then tokenised with TextBlob; tokens found in the NLTK stop-word list
  are dropped and the rest are joined with single spaces.

  The stop-word language is read from the text as it was BEFORE the
  alphanumeric filter, because that filter also blanks out the punctuation of
  the `<author_lang="es">` marker and so makes it undetectable afterwards.
  The Spanish list is used when the marker is present, the English list
  otherwise.

  Args:
    input_data: string tensor (or value accepted by tf.strings.lower).

  Returns:
    * scalar input: a scalar string tensor with the filtered text;
    * batched input: a string tensor of shape (1, 1) built from the first
      element only (other elements are not processed);
    * input whose value cannot be read with `.numpy()`, or an empty batch:
      the lowercased, alphanumeric-filtered tensor with stop words left in
      place.
  """
  header_rules = [
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
  ]
  tagged = tf.strings.lower(input_data)
  for header, marker in header_rules:
    tagged = tf.strings.regex_replace(tagged, header, marker)

  output_data = tf.strings.regex_replace(tagged, r'[^a-zA-Z0-9\s]', ' ')

  def _as_text(value):
    # Decode the bytes rather than slicing their repr(), which would turn
    # newlines and tabs into literal backslash escapes.
    if isinstance(value, bytes):
      return value.decode('utf-8', errors='replace')
    return value

  try:
    raw_text = output_data.numpy()
    raw_tagged = tagged.numpy()
  except Exception:
    # No concrete value is available (e.g. a symbolic tensor): pass it through.
    return output_data

  # A scalar string tensor converts to `bytes`; a batched one converts to an
  # array that is unwrapped down to its first element. Both tensors have the
  # same shape, so they are unwrapped in lock-step.
  is_scalar = isinstance(raw_text, (bytes, str))
  try:
    while not isinstance(raw_text, (bytes, str)):
      raw_text, raw_tagged = raw_text[0], raw_tagged[0]
  except (IndexError, TypeError):
    # Empty batch or unexpected container: nothing to filter.
    return output_data

  text = _as_text(raw_text)
  corpus = 'spanish' if '<author_lang="es">' in _as_text(raw_tagged) else 'english'
  # Build the stop-word set once per call instead of calling
  # stopwords.words() again for every token.
  stop_words = set(stopwords.words(corpus))
  output_string = ' '.join(word for word in TextBlob(text).words if word not in stop_words)

  if is_scalar:
    return tf.constant(output_string)
  return tf.constant([[output_string]])

def oa_sw_lo_ma(input_data):
  """Apply markup cleanup (MA), lowercasing (LO), alphanumeric filtering (OA) and stop-word removal (SW).

  The tensor-side steps (MA, LO, OA) always run. Stop-word removal is done in
  Python and therefore needs a concrete string value: when the tensor cannot
  be converted with `.numpy()` (presumably a symbolic/graph tensor, e.g. while
  the vectorization layer is being adapted - TODO confirm), the tensor-side
  result is returned as is.

  Args:
    input_data: a string tensor (or anything `tf.strings` ops accept, such as
      raw bytes). Only the first element of an indexable input is processed.

  Returns:
    A scalar string tensor when the input could not be indexed, a `[[text]]`
    string tensor when it could, or the MA+LO+OA tensor unchanged when no
    concrete value is available.
  """
  # MA runs on the original casing: the CDATA pattern is upper-case and would
  # never match text that has already been lowercased.
  markup_rewrites = (
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'<\!\[CDATA\[', ' <'),
      (r'\]{1,}>', ''),
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ''),
      ('<document> ', '<document>'),
      ('</document>\n(\t){0,2}', '</document> '),
  )
  tagged = input_data
  for pattern, rewrite in markup_rewrites:
    tagged = tf.strings.regex_replace(tagged, pattern, rewrite)

  # LO
  tagged = tf.strings.lower(tagged)

  # OA: every character that is neither alphanumeric nor whitespace becomes a space.
  # This also destroys the language marker, so `tagged` is kept for the language check.
  output_data = tf.strings.regex_replace(tagged, r'[^a-zA-Z0-9\s]', ' ')

  def _first_text(item):
    # Extract the first concrete string held by `item`; raises when `item`
    # has no concrete value.
    raw = item.numpy()
    if not isinstance(raw, bytes):
      raw = raw[0]
    # Decode instead of slicing str(bytes): the latter leaves "\\n"/"\\t"
    # escape text glued to the neighbouring tokens.
    return raw.decode('utf-8', errors='replace')

  # Indexing fails on a scalar tensor; that tells the two call shapes apart.
  try:
    stripped_item = output_data[0]
    tagged_item = tagged[0]
    indexable = True
  except Exception:
    stripped_item = output_data
    tagged_item = tagged
    indexable = False

  try:
    input_string = _first_text(stripped_item)
    tagged_string = _first_text(tagged_item)
  except Exception:
    # No concrete value available: skip the Python-side stop-word removal.
    return output_data

  # The language marker only survives in the text taken before the OA step.
  language = 'spanish' if '<author_lang="es">' in tagged_string else 'english'
  # Build the stop-word set once instead of re-reading the list for every token.
  stop_words = set(stopwords.words(language))

  kept_words = [word for word in TextBlob(input_string).words if word not in stop_words]
  output_string = ' '.join(kept_words)

  if indexable:
    return tf.constant([[output_string]])
  return tf.constant(output_string)

def lo_ma(input_data):
  """Apply markup cleanup (MA) followed by lowercasing (LO).

  Each (pattern, replacement) pair below is applied in order with
  `tf.strings.regex_replace`; the result is lowercased at the very end so the
  upper-case CDATA pattern can still match.

  Args:
    input_data: string tensor (or any value `tf.strings` ops accept).

  Returns:
    The cleaned, lowercased string tensor.
  """
  markup_rewrites = [
      # Collapse the author header into a compact language marker.
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      # Rewrite the CDATA opening marker and drop the closing one.
      (r'<\!\[CDATA\[', ' <'),
      (r'\]{1,}>', ''),
      # Drop the <documents> wrapper together with its trailing newline/tabs.
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ''),
      # Normalise whitespace around the <document> tags.
      ('<document> ', '<document>'),
      ('</document>\n(\t){0,2}', '</document> '),
  ]

  cleaned = input_data
  for pattern, replacement in markup_rewrites:
    cleaned = tf.strings.regex_replace(cleaned, pattern, replacement)

  return tf.strings.lower(cleaned)

def sw_ma(input_data):
  """Apply markup cleanup (MA) followed by stop-word removal (SW).

  The markup rewrites run on the tensor. Stop-word removal is done in Python
  and needs a concrete string value: when the tensor cannot be converted with
  `.numpy()` (presumably a symbolic/graph tensor, e.g. while the vectorization
  layer is being adapted - TODO confirm), the markup-cleaned tensor is
  returned as is.

  Args:
    input_data: a string tensor (or anything `tf.strings` ops accept, such as
      raw bytes). Only the first element of an indexable input is processed.

  Returns:
    A scalar string tensor when the input could not be indexed, a `[[text]]`
    string tensor when it could, or the markup-cleaned tensor unchanged when
    no concrete value is available.
  """
  markup_rewrites = (
      ('<author lang="es" class="1">\n\t', '<author_lang="es">'),
      ('<author lang="es" class="0">\n\t', '<author_lang="es">'),
      ('<author lang="en" class="1">\n\t', '<author_lang="en">'),
      ('<author lang="en" class="0">\n\t', '<author_lang="en">'),
      (r'<\!\[CDATA\[', ' <'),
      (r'\]{1,}>', ''),
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ''),
      ('<document> ', '<document>'),
      ('</document>\n(\t){0,2}', '</document> '),
  )
  output_data = input_data
  for pattern, rewrite in markup_rewrites:
    output_data = tf.strings.regex_replace(output_data, pattern, rewrite)

  # Indexing fails on a scalar tensor; that tells the two call shapes apart.
  try:
    first_item = output_data[0]
    indexable = True
  except Exception:
    first_item = output_data
    indexable = False

  try:
    raw = first_item.numpy()
    # A rank-1 input already yields bytes here; deeper inputs yield an array.
    if not isinstance(raw, bytes):
      raw = raw[0]
    # Decode instead of slicing str(bytes): the latter turns newlines, tabs
    # and non-ASCII characters into escape text glued to the tokens.
    input_string = raw.decode('utf-8', errors='replace')
  except Exception:
    # No concrete value available: skip the Python-side stop-word removal.
    return output_data

  language = 'spanish' if '<author_lang="es">' in input_string else 'english'
  # Build the stop-word set once instead of re-reading the list for every token.
  stop_words = set(stopwords.words(language))

  kept_words = [word for word in TextBlob(input_string).words if word not in stop_words]
  output_string = ' '.join(kept_words)

  if indexable:
    return tf.constant([[output_string]])
  return tf.constant(output_string)


## Get the length of the longest sample in the training set, then adapt the text vectorization layer.



In [None]:
def preprocess_and_adapt_ts(preprocessing_function,training_set,max_sequence_length=10000):
  """Build a TextVectorization layer sized to the longest training sample.

  A first layer with a deliberately large output width is adapted and used to
  tokenize every training sample; the position of the last non-zero token id
  gives each sample's real length. A second layer, whose output width equals
  the longest length found, is then adapted and returned.

  Args:
    preprocessing_function: callable passed as `standardize` to
      TextVectorization.
    training_set: dataset of (text, label) elements; it must support `.map`
      and iteration.
    max_sequence_length: output width of the probing layer. Samples with more
      tokens than this are truncated by the probe, so the returned width never
      exceeds it.

  Returns:
    A TextVectorization layer adapted on the texts of `training_set`.
  """

  def _adapted_layer(sequence_length):
    # Create a layer with the given output width and fit its vocabulary.
    layer = TextVectorization(
        standardize=preprocessing_function,
        output_mode='int',
        output_sequence_length=sequence_length)
    layer.adapt(training_set.map(lambda x, y: x))
    return layer

  probe_layer = _adapted_layer(max_sequence_length)

  probe_model = tf.keras.models.Sequential()
  probe_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  probe_model.add(probe_layer)

  longest_sample_length = 1

  for element in training_set:
    authorDocument = element[0]
    token_ids = probe_model(authorDocument).numpy()

    # Inspect every row of the batch, not just the first one.
    for row in np.atleast_2d(token_ids):
      used_positions = np.flatnonzero(row)
      # An all-zero row (no tokens at all) contributes no length.
      if used_positions.size:
        sample_length = int(used_positions[-1]) + 1
        longest_sample_length = max(longest_sample_length, sample_length)

  # The final layer is just wide enough for the longest tokenized sample.
  return _adapted_layer(longest_sample_length)

## Define a dictionary mapping each preprocessing name to its function (name → function), and a dictionary to store the model results.




In [None]:
# Accuracy per preprocessing name and per fold; filled in by the evaluation loop.
model_results = {}

# The four elementary preprocessing steps plus the no-op baseline.
prepro_functions_dict_base = dict(
    DO_NOTHING=do_nothing,
    MARELI=mareli,                        # MA
    LOWERCASE=lowercase,                  # LO
    STOP_WORD=stop_word_removal,          # SW
    ONLY_ALPHANUMERIC=only_alphanumeric,  # OA
)

# Four on/off steps give 16 combinations; 0000 is DO_NOTHING.
#
#   FUNCS ->  OA  SW  LO  MA
#   BITS  ->   0   0   0   0
#
# The number in each trailing comment is the value of that bit pattern.
prepro_functions_dict_comb = dict([
    ('DO_NOTHING', do_nothing),                # 0.  0000
    ('MARELI', mareli),                        # 1.  0001  MA
    ('LOWERCASE', lowercase),                  # 2.  0010  LO
    ('STOP_WORD', stop_word_removal),          # 4.  0100  SW
    ('ONLY_ALPHANUMERIC', only_alphanumeric),  # 8.  1000  OA
    ('LO+MA', lo_ma),                          # 3.  0011
    ('SW+MA', sw_ma),                          # 5.  0101
    ('SW+LO', sw_lo),                          # 6.  0110
    ('SW+LO+MA', sw_lo_ma),                    # 7.  0111
    ('OA+MA', oa_ma),                          # 9.  1001
    ('OA+LO', oa_lo),                          # 10. 1010
    ('OA+LO+MA', oa_lo_ma),                    # 11. 1011
    ('OA+SW', oa_sw),                          # 12. 1100
    ('OA+SW+MA', oa_sw_ma),                    # 13. 1101
    ('OA+SW+LO', oa_sw_lo),                    # 14. 1110
    ('OA+SW+LO+MA', oa_sw_lo_ma),              # 15. 1111
])

# List the configured names and give each one a zeroed score for folds 1..5.
for key in prepro_functions_dict_comb:
  print(key)
  model_results[key] = dict.fromkeys(range(1, 6), 0)

DO_NOTHING
MARELI


## Model definition and evaluation.




In [None]:
def _bag_of_words(dataset, text_vect_layer_model, max_features):
  """Convert every (text, label) element of `dataset` into a token-count vector.

  Args:
    dataset: iterable of (text, label) elements; only the first row of each
      element is used.
    text_vect_layer_model: callable mapping the text tensor to integer token ids.
    max_features: length of each count vector.

  Returns:
    (samples, labels) as NumPy arrays; row i of `samples` holds, at index t,
    how many times token id t occurs in sample i (id 0 included).
  """
  samples = []
  labels = []
  for element in dataset:
    authorDocument = element[0]
    label = element[1]

    token_ids = text_vect_layer_model(authorDocument)[0].numpy()
    # Same counts as incrementing a zero vector token by token, kept as floats.
    samples.append(np.bincount(token_ids, minlength=max_features).astype(float))
    labels.append(label[0].numpy())
  return np.array(samples), np.array(labels)


for fold_nr in range(0,total_fold_nr):
  print("\n* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *")

  print("FOLD NR.: ", str(fold_nr+1))

  for key in prepro_functions_dict_comb:
    print("\n* * * * EVALUATION USING", key, "AS PREPROCESSING FUNCTION * * * *")

    # Preprocess the training set of this fold to build the vocabulary.
    vectorize_layer = preprocess_and_adapt_ts(prepro_functions_dict_comb[key],train[fold_nr])

    print("\n\n***** FINITO DI PROCESSARE E ADATTARE IL TRAINING SET, INIZIA LA SIMULAZIONE *******")
    # Print one raw sample next to its preprocessed form.
    for element in train[fold_nr]:
      authorDocument=element[0]
      label=element[1]

      print("Sample considered is: ", authorDocument[0])
      print("Preprocessed: ", str(prepro_functions_dict_comb[key](authorDocument[0].numpy())))
      break

    # # # - - - - - MODELS DEFINITION AND EVALUATION - - - - - # # #

    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)

    # --- SVM SECTION START --- #

    # Built once per configuration: it does not depend on the sample.
    text_vect_layer_model = tf.keras.Model(inputs=model.input,
                                        outputs=model.layers[0].output)

    max_features=len(vectorize_layer.get_vocabulary()) + 1

    training_samples, training_labels = _bag_of_words(
        train[fold_nr], text_vect_layer_model, max_features)
    test_samples, test_labels = _bag_of_words(
        val[fold_nr], text_vect_layer_model, max_features)

    print("\n\nPerformance with SVM:",)

    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(training_samples,training_labels)

    # Predict the labels of the validation fold and store its accuracy.
    predictions_SVM = SVM.predict(test_samples)
    model_results[key][(fold_nr+1)]=SVM.score(test_samples,test_labels)
    print("SVM Accuracy Score on Test set -> ",model_results[key][(fold_nr+1)])

    # --- SVM SECTION END --- #

    # # # - - - - - MODEL DEFINITION AND EVALUATION END  - - - - - # # #


* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
FOLD NR.:  1

* * * * EVALUATION USING DO_NOTHING AS PREPROCESSING FUNCTION * * * *


***** FINITO DI PROCESSARE E ADATTARE IL TRAINING SET, INIZIA LA SIMULAZIONE *******


Performance with SVM:
SVM Accuracy Score on Test set ->  0.625

* * * * EVALUATION USING MARELI AS PREPROCESSING FUNCTION * * * *


***** FINITO DI PROCESSARE E ADATTARE IL TRAINING SET, INIZIA LA SIMULAZIONE *******


Performance with SVM:
SVM Accuracy Score on Test set ->  0.625

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
FOLD NR.:  2

* * * * EVALUATION USING DO_NOTHING AS PREPROCESSING FUNCTION * * * *


***** FINITO DI PROCESSARE E ADATTARE IL TRAINING SET, INIZIA LA SIMULAZIONE *******


Performance with SVM:
SVM Accuracy Score on Test set ->  0.6375

* * * * EVALUATION USING MARELI AS PREPROCESSING FUNCTION * * * *


***** FINITO DI PROCESSARE E ADATTARE IL TRAINING SET, INIZIA LA SIMULAZIONE *******


Performance with SVM:


## Now show compact results in a table.

In [None]:
# Header row: one column per validation fold.
table_header = " VAL_FOLD_NR / PREPRO_FUNC\t    |  1st FOLD   |    2nd FOLD    |    3rd FOLD   |    4th FOLD    |   5th FOLD    |"
print(table_header,end = '')

print("\n")
# One row per preprocessing configuration, in the dictionary's insertion order.
for prepro_func in prepro_functions_dict_comb:
  fold_scores = model_results[prepro_func]
  # The name is left-aligned and padded to 15 characters.
  print(prepro_func.ljust(15),"\t\t\t",end='')
  for model_set in fold_scores:
    rounded_score = round(fold_scores[model_set],4)
    print('{:.4f}'.format(rounded_score),"\t\t",end='')
  print("\n\n")
 

 VAL_FOLD_NR / PREPRO_FUNC	    |  1st FOLD   |    2nd FOLD    |    3rd FOLD   |    4th FOLD    |   5th FOLD    |

DO_NOTHING      			0.6250 		0.6375 		0.6250 		0.6875 		0.6375 		


MARELI          			0.6250 		0.6375 		0.6250 		0.6875 		0.6375 		


