<a href="https://colab.research.google.com/github/marco-siino/DA-ESWA/blob/main/code/evaluation/fns/CNN_FNS_augmented_IT_DE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Investigating text data augmentation using back translation for author profiling
- - - 
CNN ON FNS DS EXPERIMENTS NOTEBOOK 
- - -
Convolutional Neural Network on Fake News Spreaders Dataset augmented with both IT and DE backtranslation.
Code by M. Siino. 

From the paper: "Investigating text data augmentation using back translation for author profiling" by M.Siino et al.



## Importing modules.

In [1]:
import matplotlib.pyplot as plt
import os
import random
import re
import shutil
import string
import tensorflow as tf

import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from keras.models import Model
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Set both TensorFlow determinism flags to 'true' in the process environment.
os.environ.update({
    'TF_CUDNN_DETERMINISTIC': 'true',
    'TF_DETERMINISTIC_OPS': 'true',
})

## Downloading the datasets and extracting them into the current working directory.

In [2]:
# Options shared by the three downloads: unpack each zip archive and keep
# both the archive and its extracted content in the current directory.
_archive_options = dict(extract=True, archive_format='zip',
                        cache_dir='.', cache_subdir='')

# Training set augmented with the IT and DE back-translations.
urlTrainingSet = "https://github.com/marco-siino/DA-ESWA/raw/main/data/fns/fns-training-augmented-IT-DE.zip"
training_set = tf.keras.utils.get_file(
    "pan20-author-profiling-training-2020-02-23-augmented.zip",
    urlTrainingSet,
    **_archive_options)

# Original (non-augmented) test set.
urlTestSet="https://github.com/marco-siino/DA-ESWA/raw/main/data/fns/fns-test-original.zip"
test_set = tf.keras.utils.get_file(
    "pan20-author-profiling-test-2020-02-23.zip",
    urlTestSet,
    **_archive_options)

# Augmented test set.
urlTestSetAug="https://github.com/marco-siino/DA-ESWA/raw/main/data/fns/fns-test-augmented-it-de.zip"
test_set_aug = tf.keras.utils.get_file(
    "pan20-author-profiling-test-2020-02-23-augmented.zip",
    urlTestSetAug,
    **_archive_options)

Downloading data from https://github.com/marco-siino/DA-ESWA/raw/main/data/fns/fns-training-augmented-IT-DE.zip
Downloading data from https://github.com/marco-siino/DA-ESWA/raw/main/data/fns/fns-test-original.zip
Downloading data from https://github.com/marco-siino/DA-ESWA/raw/main/data/fns/fns-test-augmented-it-de.zip


In [3]:
def _folder_next_to(archive_path, folder_name):
  """Return the path of `folder_name` located in the same directory as `archive_path`."""
  return os.path.join(os.path.dirname(archive_path), folder_name)

# Folders holding the extracted archives, each resolved relative to its downloaded zip file.
training_set_dir = _folder_next_to(training_set, 'pan20-author-profiling-training-2020-02-23-augmented')
test_set_dir = _folder_next_to(test_set, 'pan20-author-profiling-test-2020-02-23')
test_set_aug_dir = _folder_next_to(test_set_aug, 'pan20-author-profiling-test-2020-02-23-augmented')

!ls -A

.config
pan20-author-profiling-test-2020-02-23
pan20-author-profiling-test-2020-02-23-augmented
pan20-author-profiling-test-2020-02-23-augmented.zip
pan20-author-profiling-test-2020-02-23.zip
pan20-author-profiling-training-2020-02-23-augmented
pan20-author-profiling-training-2020-02-23-augmented.zip
sample_data


## Build the folder hierarchy expected by the Keras `text_dataset_from_directory` function.



In [4]:
### Training, test and augmented-test folders. ###

# Root-folder prefixes; the language code is appended wherever they are used.
train_dir = 'train_dir_'
test_dir = 'test_dir_'
test_aug_dir = 'test_aug_dir_'

# Create "<prefix>en/0" and "<prefix>en/1" for every split: one sub-folder per
# class label. os.makedirs also creates the missing first-level folder, and
# exist_ok=True makes the cell safe to re-run without a separate existence check.
for _root_prefix in (train_dir, test_dir, test_aug_dir):
    for _class_label in ('0', '1'):
        os.makedirs(os.path.join(_root_prefix + 'en', _class_label), exist_ok=True)

!ls -A

.config
pan20-author-profiling-test-2020-02-23
pan20-author-profiling-test-2020-02-23-augmented
pan20-author-profiling-test-2020-02-23-augmented.zip
pan20-author-profiling-test-2020-02-23.zip
pan20-author-profiling-training-2020-02-23-augmented
pan20-author-profiling-training-2020-02-23-augmented.zip
sample_data
test_aug_dir_en
test_dir_en
train_dir_en


## Set language and directory paths.

In [5]:
# Language sub-folder of the datasets that this notebook works on.
language='en'

# Training set: truth.txt is read from inside the language sub-folder.
truth_file_training_dir_en = f"{training_set_dir}/{language}/"
truth_file_training_path_en = f"{truth_file_training_dir_en}truth.txt"

# Original test set: truth.txt is read from the dataset root folder.
truth_file_test_dir = test_set_dir
truth_file_test_path_en = f"{truth_file_test_dir}/truth.txt"

# Augmented test set: truth.txt is read from the dataset root folder.
truth_file_test_aug_dir = test_set_aug_dir
truth_file_test_aug_path_en = f"{truth_file_test_aug_dir}/truth.txt"

## Read truth.txt to organize training and test dataset folders.

In [6]:
def _move_samples_to_label_folders(truth_path, source_dir, target_root):
  """Move every sample listed in a truth file into its class-label folder.

  Each useful line of the truth file has the form "<author id>:::<label>...".
  For such a line the file "<source_dir><author id>.xml" is moved, when it
  exists, to "<target_root>/<label>/<author id>.txt".

  Args:
    truth_path: path of the truth.txt file to read.
    source_dir: folder holding the .xml samples; must end with a path separator
      because the file name is appended to it directly.
    target_root: folder that contains one sub-folder per class label.
  """
  # The context manager closes the truth file even if a move fails.
  with open(truth_path, "r") as truth_file:
    for line in truth_file:
      fields = line.split(":::")
      # Skip blank or malformed lines instead of failing with an IndexError.
      if len(fields) < 2 or not fields[1]:
        continue
      author_id = fields[0]
      # Only the first character is the label; the rest is the line ending.
      label = fields[1][0]

      source_path = source_dir + author_id + '.xml'
      # Samples that are missing (e.g. already moved by a previous run) are skipped.
      if os.path.exists(source_path):
        os.rename(source_path, target_root + '/' + label + '/' + author_id + '.txt')


# Training set.
_move_samples_to_label_folders(
    truth_file_training_path_en,
    truth_file_training_dir_en,
    './train_dir_' + language)

# Original test set.
_move_samples_to_label_folders(
    truth_file_test_path_en,
    truth_file_test_dir + '/' + language + '/',
    './test_dir_' + language)

# Augmented test set.
_move_samples_to_label_folders(
    truth_file_test_aug_path_en,
    truth_file_test_aug_dir + '/' + language + '/',
    './test_aug_dir_' + language)

## Generate full dataset.

In [7]:
# Every folder is read in directory order, one sample per batch; the
# randomization is applied afterwards with a fixed seed.
batch_size=1
_reader_options = dict(batch_size=batch_size, shuffle=False)

en_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir+language, **_reader_options)

en_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir+language, **_reader_options)

en_test_aug_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_aug_dir+language, **_reader_options)

# Shuffle each split with seed 1; reshuffle_each_iteration=False keeps the
# resulting order identical on every pass over the dataset.
_fixed_order = dict(seed=1, reshuffle_each_iteration=False)
train_ds = en_train_ds.shuffle(300, **_fixed_order)
test_ds = en_test_ds.shuffle(200, **_fixed_order)
test_aug_ds = en_test_aug_ds.shuffle(200, **_fixed_order)

# Number of batches in each split.
train_ds_size, test_ds_size, test_aug_ds_size = len(train_ds), len(test_ds), len(test_aug_ds)

Found 300 files belonging to 2 classes.
Found 200 files belonging to 2 classes.
Found 200 files belonging to 2 classes.


## Functions to pre-process source text. 



In [8]:
# Preprocessing function to remove some noise due to the translation.
def clean_samples(input_data):
  """Strip XML wrapper noise from raw author documents.

  Applies a fixed sequence of regex replacements to the input string tensor:
  the `<author lang="en">`, `<documents>` and `</documents>` tags are removed,
  `<Document>` / `</Document>` are rewritten in lower case, the CDATA opening
  marker is replaced by a space and the CDATA closing marker by ' >'.

  Args:
    input_data: string tensor holding the raw text.

  Returns:
    A string tensor with the replacements applied.
  """
  # (pattern, replacement) pairs, applied in this order. Patterns are raw
  # strings so that the regex escapes (\!, \[, \]) reach the regex engine
  # unchanged instead of being invalid Python escape sequences.
  replacements = (
      (r'<author lang="en">', ''),
      (r'<Document>', '<document>'),
      (r'</Document>', '</document>'),
      (r'<documents>', ''),
      (r'<\!\[CDATA\[', ' '),
      (r'\]\]>', ' >'),
      (r'</documents>', ''),
  )
  output_data = input_data
  for pattern, rewrite in replacements:
    output_data = tf.strings.regex_replace(output_data, pattern, rewrite)
  return output_data

## Get the length of the longest sample in the training set, then adapt the text vectorization layer.



In [9]:
def preprocess_and_adapt_ts(training_set):
  """Return a TextVectorization layer adapted to `training_set` and sized to its longest sample.

  Two passes are made over the data:
    1. a probe layer with a large fixed output length (20000 tokens) is adapted
       and used to tokenize every element; the largest token count left after
       trimming trailing zeros is recorded and printed;
    2. a new layer whose output_sequence_length equals that count is adapted
       on the same text and returned.

  Args:
    training_set: dataset yielding (text, label) pairs.

  Returns:
    The adapted TextVectorization layer from the second pass.

  NOTE(review): only index 0 of each element's output is measured, so every
  sample is covered only when an element holds a single sample; confirm the
  batch size used by the caller.
  """
  def _adapted_layer(output_length):
    # Build a layer with the requested output length and fit it on the text only.
    layer = TextVectorization(
        standardize=clean_samples,
        output_mode='int',
        output_sequence_length=output_length)
    layer.adapt(training_set.map(lambda x, y: x))
    return layer

  # Pass 1: probe with a length assumed large enough to hold any sample.
  probe_model = tf.keras.models.Sequential()
  probe_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  probe_model.add(_adapted_layer(20000))

  longest_sample_length = 1
  for author_document, _ in training_set:
    # Trailing zeros are stripped so only the actual tokens are counted.
    token_ids = np.trim_zeros(probe_model(author_document).numpy()[0], 'b')
    longest_sample_length = max(longest_sample_length, len(token_ids))

  print("Length of the longest sample is:", longest_sample_length)

  # Pass 2: the final layer, sized to the longest sample found above.
  return _adapted_layer(longest_sample_length)

## Some training hyperparameters...

In [10]:
# Size of each word-embedding vector.
embedding_dim = 100

# Number of independent training runs; a new model is built for each run.
num_runs = 5 
# Epochs per run. No need to go beyond the 20th epoch: overfitting begins.
num_epochs_per_run = 20

# Unused alternative: an explicit optimizer object (the model cell passes the string 'RMSprop' instead).
#opt = tf.keras.optimizers.RMSprop()

## Vectorization




In [11]:
print("\n\n* * * * VECTORIZATION STARTED * * * *")

# Adapt the vectorization layer on the training set; the helper also prints
# the length of the longest training sample.
vectorize_layer = preprocess_and_adapt_ts(train_ds)

# Vocabulary length plus one. The model cell adds one more when it sizes the embedding layer.
# NOTE(review): the number printed below is therefore one larger than len(get_vocabulary()).
max_features=len(vectorize_layer.get_vocabulary()) + 1
print("Vocabulary size is:", max_features)


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089




* * * * VECTORIZATION STARTED * * * *
Length of the longest sample is: 7175
Vocabulary size is: 98112


## Models definition, training and evaluation on original and on augmented test set.




In [12]:
# Seed TensorFlow's global random generator once, before the first model is built.
tf.random.set_seed(1)


def _median_and_max_range(sorted_scores):
    """Return (median, largest distance from the median to either extreme).

    Args:
      sorted_scores: non-empty list of scores in ascending order.

    Returns:
      A tuple (median, spread). The median is the middle element (the upper
      middle one when the list has an even length); the spread is the larger
      of median - minimum and maximum - median.
    """
    median = sorted_scores[len(sorted_scores) // 2]
    spread = max(median - sorted_scores[0], sorted_scores[-1] - median)
    return median, spread


# Best accuracy reached in each run, on the original and on the augmented test set.
runs_accuracy = []
aug_runs_accuracy = []

for run in range(1, num_runs + 1):
    epochs_accuracy = []
    aug_epochs_accuracy = []

    # A new model is built for every run; the adapted vectorization layer is shared.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(1,), dtype=tf.string),
        vectorize_layer,
        layers.Embedding(max_features + 1, embedding_dim),
        layers.Dropout(0.8),

        layers.Conv1D(256, 16, activation='relu'),
        layers.MaxPooling1D(),
        layers.Dropout(0.6),

        layers.Dense(512, activation='relu'),

        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(1)
    ])
    # The model outputs a single logit, hence from_logits=True and a decision threshold of 0.0.
    model.compile(loss=losses.BinaryCrossentropy(from_logits=True), optimizer='RMSprop', metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

    # One fit() call per epoch, so that the augmented test set can be
    # evaluated after every single epoch as well.
    for epoch in range(0, num_epochs_per_run):
        history = model.fit(
            train_ds,
            validation_data=test_ds,
            epochs=1,
            shuffle=False,
        )
        accuracy = history.history['val_binary_accuracy']
        aug_score = model.evaluate(test_aug_ds, verbose=0)
        aug_accuracy = aug_score[1]
        print("Run: ",run,"/ Accuracy on test ORIGINAL at epoch ",epoch," is: ", accuracy[0],"\n")
        print("Run: ",run,"/ Accuracy on test AUGMENTED at epoch ",epoch," is: ", aug_accuracy,"\n")
        epochs_accuracy.append(accuracy[0])
        aug_epochs_accuracy.append(aug_accuracy)

    # NOTE(review): the score kept for a run is the maximum accuracy over its
    # epochs, i.e. the best epoch is selected on the test set itself.
    print("Accuracies on ORIGINAL over epochs:",epochs_accuracy,"\n")
    runs_accuracy.append(max(epochs_accuracy))
    print("Accuracies on AUGMENTED over epochs:",aug_epochs_accuracy,"\n\n")
    aug_runs_accuracy.append(max(aug_epochs_accuracy))

runs_accuracy.sort()
aug_runs_accuracy.sort()

# Median and spread are derived from the list length, so they stay correct
# when num_runs is changed (the original indices assumed exactly 5 runs).
median_accuracy, max_range_from_median = _median_and_max_range(runs_accuracy)
aug_median_accuracy, aug_max_range_from_median = _median_and_max_range(aug_runs_accuracy)

print("\n\n Over all runs maximum accuracies on ORIGINAL test set are:", runs_accuracy)
print("The median for English is:",median_accuracy,"\n\n\n")

print("\n\n Over all runs maximum accuracies on AUGMENTED test set are:", aug_runs_accuracy)
print("The median for English is:",aug_median_accuracy,"\n\n\n")

# Final Result on Original Test set
final_result = str(median_accuracy)+" +/- "+ str(max_range_from_median)
print("CNN Accuracy Score on Original Test set -> ",final_result)

# Final result on AUGMENTED test set.
max_range_from_median = aug_max_range_from_median
final_result = str(aug_median_accuracy)+" +/- "+ str(max_range_from_median)
print("CNN Accuracy Score on AUGMENTED Test set -> ",final_result)

Run:  1 / Accuracy on test ORIGINAL at epoch  0  is:  0.5 

Run:  1 / Accuracy on test AUGMENTED at epoch  0  is:  0.5 

Run:  1 / Accuracy on test ORIGINAL at epoch  1  is:  0.625 

Run:  1 / Accuracy on test AUGMENTED at epoch  1  is:  0.5950000286102295 

Run:  1 / Accuracy on test ORIGINAL at epoch  2  is:  0.6600000262260437 

Run:  1 / Accuracy on test AUGMENTED at epoch  2  is:  0.6449999809265137 

Run:  1 / Accuracy on test ORIGINAL at epoch  3  is:  0.6549999713897705 

Run:  1 / Accuracy on test AUGMENTED at epoch  3  is:  0.6700000166893005 

Run:  1 / Accuracy on test ORIGINAL at epoch  4  is:  0.6600000262260437 

Run:  1 / Accuracy on test AUGMENTED at epoch  4  is:  0.6899999976158142 

Run:  1 / Accuracy on test ORIGINAL at epoch  5  is:  0.6949999928474426 

Run:  1 / Accuracy on test AUGMENTED at epoch  5  is:  0.6800000071525574 

Run:  1 / Accuracy on test ORIGINAL at epoch  6  is:  0.6700000166893005 

Run:  1 / Accuracy on test AUGMENTED at epoch  6  is:  0.65499