<a href="https://colab.research.google.com/github/marco-siino/text_preprocessing_impact/blob/main/PCL_DS/BiLSTM_PCL_TextPreProImpact_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text preprocessing worth the time: A comparative survey on the impact of common techniques on NLP model performances. 
- - - 
BiLSTM ON PCL DS EXPERIMENTS NOTEBOOK 
- - -
Bidirectional Long Short-Term Memory Network on Patronizing and Condescending Language Dataset.
Code by M. Siino. 

From the paper: "Text preprocessing worth the time: A comparative survey on the impact of common techniques on NLP model performances." by M.Siino et al.

## Importing modules.

In [None]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np
import pandas as pd
import nltk

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from keras.models import Model
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from io import open
from pathlib import Path
from urllib import request

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from textblob import TextBlob
# Download the NLTK resources requested here: the 'stopwords' corpus (read by
# the stop-word removal function via `stopwords.words('english')`) and 'punkt'.
nltk.download('stopwords')
nltk.download('punkt')

# Explicitly set both TensorFlow determinism switches to 'false'.
# NOTE(review): these are assigned after `import tensorflow` has already run —
# confirm TensorFlow still reads them at this point.
os.environ['TF_CUDNN_DETERMINISTIC']='false'
os.environ['TF_DETERMINISTIC_OPS']='false'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Domenico\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Domenico\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Fetch all the files needed.

In [None]:
# Google Drive direct-download links, one per dataset file.
urlDontPatronizeMe_PCL_tsv = "https://drive.google.com/uc?export=download&id=1KAncWruZ4OkKlvEGnxLkNbR1sxAErG8_"
urlDontPatronizeMe_categories_tsv = "https://drive.google.com/uc?export=download&id=1KMZJsskzKLbM-kgYiwXIF0h3Shj4F0mM"
urlTestSet_csv = "https://drive.google.com/uc?export=download&id=161-_6MH16_UHtLTqt0nd09l68pP5MEbQ"
urlDevSet_csv = "https://drive.google.com/uc?export=download&id=1KNuZ_h7NXTSwEz3_0XkaEd4DUAyxojAU"
urlTrainSet_csv = "https://drive.google.com/uc?export=download&id=1KVRrMC9UVwtQE9QfcLv8b11P5t2BhA8I"

# (local file name, source URL) pairs, fetched in this order.
dataset_files = (
    ("dontpatronizeme_pcl.tsv", urlDontPatronizeMe_PCL_tsv),
    ("dontpatronizeme_categories.tsv", urlDontPatronizeMe_categories_tsv),
    ("task4_test.csv", urlTestSet_csv),
    ("dev_semeval_parids-labels.csv", urlDevSet_csv),
    ("train_semeval_parids-labels.csv", urlTrainSet_csv),
)

# Every file is requested with cache_dir='.' and an empty cache_subdir and is
# not extracted; `tmp` ends up holding the value returned for the last file.
for local_name, source_url in dataset_files:
  tmp = tf.keras.utils.get_file(local_name, source_url,
                                extract=False, cache_dir='.',
                                cache_subdir='')


## Fetch Don't Patronize Me! data manager module




In [None]:
# NOTE(review): this downloads Python code from the `master` branch of a
# third-party repository and a later cell imports it; consider pinning the URL
# to a specific commit so the executed code cannot change silently.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
# The local file name is the last path component of the URL.
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Write the downloaded bytes unchanged. Decoding to text and writing in text
# mode would re-encode with the platform's default encoding and translate
# newlines, which can fail or alter the file on systems whose default is not
# UTF-8.
with request.urlopen(module_url) as response, open(module_name,'wb') as outf:
  outf.write(response.read())

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


## Import and load dpm

In [None]:
# Import the data manager class from the `dont_patronize_me.py` file that the
# previous cell wrote to the working directory.
from dont_patronize_me import DontPatronizeMe
# Both constructor arguments point at the current directory, where the dataset
# files were downloaded.
dpm = DontPatronizeMe('.', '.')
# Later cells read `dpm.train_task1_df` (presumably populated by load_task1 —
# confirm against the module).
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


## Load paragraph IDs

In [None]:
# Paragraph-ID files downloaded earlier: `trids` comes from the train split
# file, `teids` from the dev split file (used below to build the test set).
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

In [None]:
# Sanity check: preview the first train-split rows and print the number of
# rows in each split.
print(trids.head())
print(len(trids))
print(len(teids))

   par_id                  label
0    4341  [1, 0, 0, 1, 0, 0, 0]
1    4136  [0, 1, 0, 0, 0, 0, 0]
2   10352  [1, 0, 0, 0, 0, 1, 0]
3    8279  [0, 0, 0, 1, 0, 0, 0]
4    1164  [1, 0, 0, 1, 1, 1, 0]
8375
2094


In [None]:
# Convert the paragraph IDs to strings; the following cells compare them with
# `dpm.train_task1_df.par_id` (presumably stored as strings there — confirm).
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

## Rebuild training set (Task 1)

In [None]:
# Rebuild the Task 1 training rows (par_id, text, binary label) in the order
# given by the train split IDs in `trids`.
#
# The lookup table keeps only the first occurrence of every par_id (the same
# row the previous `.values[0]` selection picked) and is indexed by par_id, so
# each paragraph needs one lookup instead of two full boolean scans of
# `dpm.train_task1_df`.
# A par_id missing from `dpm.train_task1_df` raises KeyError.
task1_by_parid = (
    dpm.train_task1_df
    .drop_duplicates(subset='par_id', keep='first')
    .set_index('par_id')
)

rows = [] # will contain par_id, label and text
for parid in trids.par_id:
  rows.append({
      'par_id':parid,
      'text':task1_by_parid.at[parid, 'text'],
      'label':task1_by_parid.at[parid, 'label']
  })
  

In [None]:
# Turn the rebuilt training rows into a DataFrame (columns: par_id, text, label).
trdf1 = pd.DataFrame(rows)

In [None]:
# Number of rebuilt training rows.
len(rows)

8375

In [None]:
# Print the rebuilt training DataFrame for a visual check.
print(trdf1)

     par_id                                               text  label
0      4341  The scheme saw an estimated 150,000 children f...      1
1      4136  Durban 's homeless communities reconciliation ...      1
2     10352  The next immediate problem that cropped up was...      1
3      8279  Far more important than the implications for t...      1
4      1164  To strengthen child-sensitive social protectio...      1
...     ...                                                ...    ...
8370   8380  Rescue teams search for survivors on the rubbl...      0
8371   8381  The launch of ' Happy Birthday ' took place la...      0
8372   8382  The unrest has left at least 20,000 people dea...      0
8373   8383  You have to see it from my perspective . I may...      0
8374   8384  Yet there was one occasion when we went to the...      0

[8375 rows x 3 columns]


In [None]:
# Balance the DS by keeping only the leading rows of the training DataFrame:
# as many rows as twice the number of class 1 samples.
# The cut-off is derived from the data instead of a hard-coded row count, so
# re-running this cell on the already balanced DataFrame leaves it unchanged
# (a fixed-size drop would remove more rows on every re-run).
# Assumes all class 1 rows come before the class 0 rows — the counts printed
# below make a violation of this assumption visible.
n_positive = int((trdf1.label == 1).sum())
bal_trdf1 = trdf1.iloc[:2 * n_positive].copy()

print("Total number of samples is:",len(bal_trdf1))
print("Total number of class 1 samples is:",sum(bal_trdf1.label == 1))
print("Total number of class 0 samples is:",sum(bal_trdf1.label == 0))

trdf1 = bal_trdf1

Total number of samples is: 1588
Total number of class 1 samples is: 794
Total number of class 0 samples is: 794


## Rebuild test set (Task 1)

In [None]:
# Rebuild the Task 1 test rows (par_id, text, binary label) in the order given
# by the dev split IDs in `teids`. Texts and labels are read from
# `dpm.train_task1_df`.
#
# The lookup table keeps only the first occurrence of every par_id (the same
# row the previous `.values[0]` selection picked) and is indexed by par_id, so
# each paragraph needs one lookup instead of two full boolean scans.
# A par_id missing from `dpm.train_task1_df` raises KeyError.
task1_by_parid = (
    dpm.train_task1_df
    .drop_duplicates(subset='par_id', keep='first')
    .set_index('par_id')
)

rows = [] # will contain par_id, label and text
for parid in teids.par_id:
  rows.append({
      'par_id':parid,
      'text':task1_by_parid.at[parid, 'text'],
      'label':task1_by_parid.at[parid, 'label']
  })
   

In [None]:
# Number of rebuilt test rows.
len(rows)

2094

In [None]:
# Turn the rebuilt test rows into a DataFrame (columns: par_id, text, label).
tedf1 = pd.DataFrame(rows)

In [None]:
# Balance the DS by keeping only the leading rows of the test DataFrame: as
# many rows as twice the number of class 1 samples.
# The cut-off is derived from the data instead of a hard-coded row count, so
# re-running this cell on the already balanced DataFrame leaves it unchanged
# (a fixed-size drop would remove more rows on every re-run).
# Assumes all class 1 rows come before the class 0 rows — the counts printed
# below make a violation of this assumption visible.
n_positive = int((tedf1.label == 1).sum())
bal_tedf1 = tedf1.iloc[:2 * n_positive].copy()

print("Total number of samples is:",len(bal_tedf1))
print("Total number of class 1 samples is:",sum(bal_tedf1.label == 1))
print("Total number of class 0 samples is:",sum(bal_tedf1.label == 0))

tedf1 = bal_tedf1

Total number of samples is: 398
Total number of class 1 samples is: 199
Total number of class 0 samples is: 199


## Preprocessing functions definitions

In [None]:
# Do-Nothing preprocessing function.
def DON(input_data):
  """Baseline ("do nothing") preprocessing: only markup remnants are replaced.

  Six regex replacements are applied in order with `tf.strings.regex_replace`:
  the CDATA opening marker, the closing bracket(s) followed by '>', the
  '<author lang="en">' tag and the '</author>' tag are each replaced by a
  space; '<documents>' followed by a newline and up to two tabs is removed;
  '</documents>' followed by a newline and up to two tabs is replaced by a
  space.

  Args:
    input_data: string input accepted by `tf.strings.regex_replace`.

  Returns:
    The result of the last `tf.strings.regex_replace` call.
  """
  # The first two patterns contain regex escapes and are written as raw
  # strings so Python does not treat the backslashes as (invalid) string
  # escapes. The last two deliberately stay non-raw: they embed literal
  # newline and tab characters, exactly as before.
  substitutions = (
      (r'<\!\[CDATA\[', ' '),
      (r'\]{1,}>', ' '),
      ('<author lang="en">', ' '),
      ('</author>', ' '),
      ('<documents>\n(\t){0,2}', ''),
      ('</documents>\n(\t){0,2}', ' '),
  )
  output_data = input_data
  for pattern, replacement in substitutions:
    output_data = tf.strings.regex_replace(output_data, pattern, replacement)
  return output_data

# Lowercasing preprocessing function.
def LOW(input_data):
  """Clean the input with DON, then lower-case it with `tf.strings.lower`."""
  cleaned = DON(input_data)
  return tf.strings.lower(cleaned)

# Removing Stop Words function.
def _rewrite_first_sample(input_data, rewrite_words):
  """Shared implementation of RSW and STM.

  The input is first cleaned with DON. If a concrete value can be read from
  the cleaned tensor with `.numpy()`, its text is split into words with
  TextBlob, the words are passed through `rewrite_words`, joined with single
  spaces and wrapped in a new string tensor. If `.numpy()` fails, the
  DON-cleaned input is returned unchanged, i.e. no word-level rewriting is
  applied (presumably the case for symbolic / graph-mode tensors — verify).

  Args:
    input_data: string input accepted by DON.
    rewrite_words: callable taking the TextBlob word list and returning the
      list of words to keep.

  Returns:
    `tf.constant([[text]])` when the cleaned tensor can be indexed (only its
    first element is rewritten), `tf.constant(text)` when it cannot, or the
    DON-cleaned input when no concrete value is available.
  """
  cleaned = DON(input_data)

  # Indexable input: according to the original notes this is the training-set
  # adaptation case. Non-indexable input: the actual simulation case.
  # `except Exception` replaces the former bare `except:` so that
  # KeyboardInterrupt and SystemExit are no longer swallowed.
  try:
    sample = cleaned[0]
    indexable = True
  except Exception:
    sample = cleaned
    indexable = False

  try:
    raw_value = sample.numpy()
  except Exception:
    # No concrete value can be extracted: fall back to the DON output.
    return cleaned

  # NOTE(review): the text is taken from the printed representation of the
  # value with the first two and the last character stripped. This is kept
  # as-is to preserve the experiment's behaviour; it leaves escape sequences
  # of non-ASCII bytes in the text, so decoding the bytes may be preferable.
  text = str(raw_value)[2:-1]

  kept_words = rewrite_words(TextBlob(text).words)
  output_string = ' '.join(kept_words)

  if indexable:
    return tf.constant([[output_string]])
  return tf.constant(output_string)


def RSW(input_data):
  """Removing Stop Words: drop the NLTK English stop words from the text.

  See `_rewrite_first_sample` for the input handling and the fallback to the
  plain DON output.
  """
  def drop_stopwords(words):
    # Load the stop-word list once per call instead of once per word.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

  return _rewrite_first_sample(input_data, drop_stopwords)

# Porter Stemmer preprocessing function.
def STM(input_data):
  """Stemming: replace every word by its Porter stem.

  See `_rewrite_first_sample` for the input handling and the fallback to the
  plain DON output.
  """
  stemmer = PorterStemmer()

  def stem_words(words):
    return [stemmer.stem(word) for word in words]

  return _rewrite_first_sample(input_data, stem_words)

## Define the combined preprocessing functions. (The base functions are: DON, LOW, RSW and STM).

In [None]:
## SECTION WITH PAIRS OF PREPRO FUNCTIONS. APPLICATION ORDER MATTERS (...IN FOLLOWING SECTIONS TOO).
#...5
def LOW_RSW(input_data):
  """Combination 5: LOW, then RSW."""
  stage = LOW(input_data)
  return RSW(stage)

def LOW_STM(input_data):
  """Combination 6: LOW, then STM."""
  stage = LOW(input_data)
  return STM(stage)

def RSW_LOW(input_data):
  """Combination 7: RSW, then LOW."""
  stage = RSW(input_data)
  return LOW(stage)

def RSW_STM(input_data):
  """Combination 8: RSW, then STM."""
  stage = RSW(input_data)
  return STM(stage)

def STM_LOW(input_data):
  """Combination 9: STM, then LOW."""
  stage = STM(input_data)
  return LOW(stage)

def STM_RSW(input_data):
  """Combination 10: STM, then RSW."""
  stage = STM(input_data)
  return RSW(stage)

def LOW_STM_RSW(input_data):
  """Combination 11: LOW, then STM, then RSW."""
  stage = LOW(input_data)
  stage = STM(stage)
  return RSW(stage)

def LOW_RSW_STM(input_data):
  """Combination 12: LOW, then RSW, then STM."""
  stage = LOW(input_data)
  stage = RSW(stage)
  return STM(stage)

def STM_LOW_RSW(input_data):
  """Combination 13: STM, then LOW, then RSW."""
  stage = STM(input_data)
  stage = LOW(stage)
  return RSW(stage)

def STM_RSW_LOW(input_data):
  """Combination 14: STM, then RSW, then LOW."""
  stage = STM(input_data)
  stage = RSW(stage)
  return LOW(stage)

def RSW_LOW_STM(input_data):
  """Combination 15: RSW, then LOW, then STM."""
  stage = RSW(input_data)
  stage = LOW(stage)
  return STM(stage)

def RSW_STM_LOW(input_data):
  """Combination 16: RSW, then STM, then LOW."""
  stage = RSW(input_data)
  stage = STM(stage)
  return LOW(stage)

## Build a Tensorflow DS

In [None]:
tf.random.set_seed(1)

def _frame_to_dataset(frame):
  """Build a tf.data.Dataset of (text, label) pairs from a DataFrame.

  Reads the 'text' and 'label' columns of `frame`. Every text is prefixed with
  a single space. Each dataset element is a pair of a string tensor of shape
  (1,) and an int32 tensor of shape (1,), in the row order of `frame`.

  The dataset is created with one `from_tensor_slices` call instead of
  concatenating one single-element dataset per row, and the columns are read
  positionally, so the DataFrame does not need a 0..n-1 index.
  """
  texts = [' ' + text for text in frame['text']]
  labels = frame['label'].to_numpy()
  return tf.data.Dataset.from_tensor_slices(
      (
          tf.expand_dims(tf.constant(texts, dtype=tf.string), axis=-1),
          tf.expand_dims(tf.cast(labels, tf.int32), axis=-1)
      )
  )

# Shuffle the training set once, with a buffer covering the whole dataset; the
# same order is reused on every iteration.
train_ds = _frame_to_dataset(trdf1)
train_ds = train_ds.shuffle(len(train_ds), reshuffle_each_iteration=False)

# The test set keeps the row order of `tedf1`.
test_ds = _frame_to_dataset(tedf1)

# Show the first (shuffled) training element and the dataset description.
authorDocument, label = next(iter(train_ds))
print(authorDocument)
print(label)

print(train_ds)


tf.Tensor([b' LUSAKA Zambia ( Xinhua ) -- Zambia ? s Immigration Department said Wednesday that it had arrested at least 45 illegal immigrants in Lusaka , the country ? s capital .'], shape=(1,), dtype=string)
tf.Tensor([0], shape=(1,), dtype=int32)
<ShuffleDataset element_spec=(TensorSpec(shape=(1,), dtype=tf.string, name=None), TensorSpec(shape=(1,), dtype=tf.int32, name=None))>


## Get the length of the longest sample in the training set, then adapt the text vectorization layer.



In [None]:
def preprocess_and_adapt_ts(preprocessing_function,training_set,probe_sequence_length=20000):
  """Return a TextVectorization layer adapted to `training_set`.

  The layer's output length equals the length (in tokens) of the longest
  training sample, measured after applying `preprocessing_function`.

  A first "probe" layer with a very large output length is adapted and applied
  to every sample; the position of the last non-zero token ID gives the
  sample's length. A second layer is then created with the longest length
  found and adapted to the same texts.

  Args:
    preprocessing_function: callable passed as `standardize` to
      TextVectorization.
    training_set: dataset of (text, label) pairs; only the texts are used.
    probe_sequence_length: output length of the probe layer. Samples longer
      than this are measured as this length.

  Returns:
    The adapted TextVectorization layer. Its output length is at least 1.
  """
  probe_layer = TextVectorization(
      standardize=preprocessing_function,
      output_mode='int',
      output_sequence_length=probe_sequence_length)

  train_text = training_set.map(lambda x, y: x)
  probe_layer.adapt(train_text)

  model = tf.keras.models.Sequential()
  model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  model.add(probe_layer)

  longest_sample_length = 1
  for authorDocument, _ in training_set:
    # Convert the output once per sample; the sample length is the index of
    # the last non-zero token ID plus one.
    token_ids = model(authorDocument).numpy()[0]
    used_positions = np.flatnonzero(token_ids)
    # A sample without any non-zero token has no length to contribute; it is
    # skipped instead of scanning past the start of the array.
    if used_positions.size:
      longest_sample_length = max(longest_sample_length,
                                  int(used_positions[-1]) + 1)

  # After tokenization longest_sample_length covers all the document lengths
  # in the dataset.
  vectorize_layer = TextVectorization(
      standardize=preprocessing_function,
      output_mode='int',
      output_sequence_length=longest_sample_length)

  # Finally adapt the vectorize layer.
  vectorize_layer.adapt(train_text)
  return vectorize_layer

## Define a dictionary that maps each function name to its preprocessing function, and a dictionary to store the model results.




In [None]:
# Maps every preprocessing function name to its list of results.
model_results = dict()

# The four base preprocessing functions:
# do nothing, lowercasing, removing stop words, Porter stemming.
prepro_functions_dict_base = {
    'DON': DON,
    'LOW': LOW,
    'RSW': RSW,
    'STM': STM,
}

# 3 prepro functions = 15 combs...+1 for do_nothing

# The base functions (entries 1-4) followed by the ordered combinations
# (entries 5-16); each name lists the functions in application order.
prepro_functions_dict_comb = {
    **prepro_functions_dict_base,
    # Pairs.
    'LOW_RSW': LOW_RSW,
    'LOW_STM': LOW_STM,
    'RSW_LOW': RSW_LOW,
    'RSW_STM': RSW_STM,
    'STM_LOW': STM_LOW,
    'STM_RSW': STM_RSW,
    # Triples.
    'LOW_STM_RSW': LOW_STM_RSW,
    'LOW_RSW_STM': LOW_RSW_STM,
    'STM_LOW_RSW': STM_LOW_RSW,
    'STM_RSW_LOW': STM_RSW_LOW,
    'RSW_LOW_STM': RSW_LOW_STM,
    'RSW_STM_LOW': RSW_STM_LOW,
}

# Print every function name and start it with an empty result list.
for key in prepro_functions_dict_comb.keys():
  print(key)
  model_results[key] = list()

DON
LOW
RSW
STM
LOW_RSW
LOW_STM
RSW_LOW
RSW_STM
STM_LOW
STM_RSW
LOW_STM_RSW
LOW_RSW_STM
STM_LOW_RSW
STM_RSW_LOW
RSW_LOW_STM
RSW_STM_LOW


## Some training hyperparameters...

In [None]:
# Word embedding dimensions.
embedding_dim = 100

# Number of training runs performed for every preprocessing function.
num_runs = 5 
# No need to go over the 20th epoch...Overfitting begins.
num_epochs_per_run = 20

# RMSprop with its default settings. The evaluation cell below takes its
# optimizer from this object.
opt = tf.keras.optimizers.RMSprop()

## Models definition and evaluation.




In [None]:
tf.random.set_seed(0)
# This switch is an environment variable; assigning it to a Python variable of
# the same name has no effect.
# NOTE(review): it is set after TensorFlow was imported — confirm it is still
# picked up. It should only matter when deterministic ops are enabled.
os.environ['TF_DISABLE_SEGMENT_REDUCTION_OP_DETERMINISM_EXCEPTIONS']='1'

# Reset model_results list.
for key in prepro_functions_dict_comb:
  model_results[key]=[]

# For every preprocessing function: adapt the vectorization layer, train
# `num_runs` fresh models and summarise the best validation accuracy per run.
for key in prepro_functions_dict_comb:
  runs_accuracy = []

  print("\n\n* * * * EVALUATION USING", key, "AS PREPROCESSING FUNCTION * * * *")

  # Preprocess training set to build a dictionary.
  vectorize_layer = preprocess_and_adapt_ts(prepro_functions_dict_comb[key],train_ds)

  max_features=len(vectorize_layer.get_vocabulary()) + 1
  print("Vocabulary size is:", max_features)

  for run in range(1,(num_runs+1)):
    epochs_accuracy=[]
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(1,), dtype=tf.string),
        vectorize_layer,
        layers.Embedding(max_features + 1, embedding_dim),
        layers.Dropout(0.8),

        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1)
    ])
    # Every model gets its own optimizer, created from the configuration of
    # `opt`, so that no optimizer instance is shared between models.
    model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
                  optimizer=type(opt).from_config(opt.get_config()),
                  metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

    # Train one epoch at a time to record the validation accuracy per epoch.
    for epoch in range (0,num_epochs_per_run):
        history = model.fit(
          train_ds,
          validation_data = test_ds,
          epochs=1,
          shuffle=False,
          )
        accuracy = history.history['val_binary_accuracy']
        print("Run: ",run,"/ Accuracy at epoch ",epoch," is: ", accuracy[0],"\n")
        epochs_accuracy.append(accuracy[0])

    print("Accuracies over epochs:",epochs_accuracy,"\n\n")
    # A run is summarised by its best epoch.
    runs_accuracy.append(max(epochs_accuracy))

  runs_accuracy.sort()
  # Middle element of the sorted run results: the median for an odd number of
  # runs (index 2 for 5 runs), the upper of the two middle values otherwise.
  median_accuracy = runs_accuracy[len(runs_accuracy)//2]
  print("\n\n Over all runs maximum accuracies on English are:", runs_accuracy)
  print("The median for English is:",median_accuracy,"\n\n\n")

  # Largest distance from the median to either the worst or the best run.
  max_range_from_median = max(median_accuracy-runs_accuracy[0],
                              runs_accuracy[-1]-median_accuracy)
  final_result = str(median_accuracy)+" +/- "+ str(max_range_from_median)
  model_results[key].append(final_result)
  print("BiLSTM Accuracy Score on Test set -> ",model_results[key])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Run:  4 / Accuracy at epoch  13  is:  0.7085427045822144 

Run:  4 / Accuracy at epoch  14  is:  0.6809045076370239 

Run:  4 / Accuracy at epoch  15  is:  0.6984924674034119 

Run:  4 / Accuracy at epoch  16  is:  0.713567852973938 

Run:  4 / Accuracy at epoch  17  is:  0.6206030249595642 

Run:  4 / Accuracy at epoch  18  is:  0.6683416962623596 

Run:  4 / Accuracy at epoch  19  is:  0.6708542704582214 

Accuracies over epochs: [0.5025125741958618, 0.5929648280143738, 0.6683416962623596, 0.6532663106918335, 0.6758794188499451, 0.7185929417610168, 0.7160804271697998, 0.7185929417610168, 0.7311557531356812, 0.7261306643486023, 0.7361809015274048, 0.733668327331543, 0.7211055159568787, 0.7085427045822144, 0.6809045076370239, 0.6984924674034119, 0.713567852973938, 0.6206030249595642, 0.6683416962623596, 0.6708542704582214] 


Run:  5 / Accuracy at epoch  0  is:  0.5 

Run:  5 / Accuracy at epoch  1  is:  0.653266310691833

## Now show compact results in a table.

In [None]:
# Header row followed by an empty line.
table_header = " PREPRO FUNCTION    |  Test Accuracy   |"
print(table_header + "\n")

# One row per preprocessing function: the name padded to 27 characters, then
# the first stored result padded to 12 characters, then two empty lines.
for prepro_func in prepro_functions_dict_comb:
  result = model_results[prepro_func][0]
  table_row = '{:27}{:12}'.format(prepro_func, result)
  print(table_row + "\n\n")

 PREPRO FUNCTION    |  Test Accuracy   |

DON                        0.7462311387062073 +/- 0.02010047435760498


LOW                        0.7487437129020691 +/- 0.015075385570526123


RSW                        0.7562814354896545 +/- 0.03517591953277588


STM                        0.7487437129020691 +/- 0.022613048553466797


LOW_RSW                    0.7512562870979309 +/- 0.007537722587585449


LOW_STM                    0.7487437129020691 +/- 0.02010047435760498


RSW_LOW                    0.7437185645103455 +/- 0.007537662982940674


RSW_STM                    0.7512562870979309 +/- 0.020100533962249756


STM_LOW                    0.7462311387062073 +/- 0.012562811374664307


STM_RSW                    0.7487437129020691 +/- 0.01005023717880249


LOW_STM_RSW                0.7487437129020691 +/- 0.007537722587585449


LOW_RSW_STM                0.7512562870979309 +/- 0.012562811374664307


STM_LOW_RSW                0.7512562870979309 +/- 0.005025148391723633


STM_RSW_LOW  