<a href="https://colab.research.google.com/github/marco-siino/DA-ESWA/blob/main/code/evaluation/iss/SVM_ISS_not_augmented.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Investigating text data augmentation using back translation for author profiling
- - - 
SVM ON ISS DS EXPERIMENTS NOTEBOOK 
- - -
SVM on the Irony and Stereotype Spreaders (ISS) dataset, not augmented (original data, no back translation).
Code by M. Siino. 

From the paper: "Investigating text data augmentation using back translation for author profiling" by M.Siino et al.

## Importing modules.

In [1]:
import matplotlib.pyplot as plt
import os
import random
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from google.colab import files
from io import open
from numpy.random import seed
import numpy as np
from pathlib import Path
from sklearn import svm

# Ask TensorFlow / cuDNN for deterministic op implementations through
# environment flags, so that repeated runs are comparable.
# NOTE(review): these are assigned after `import tensorflow` above — confirm
# that TensorFlow still honours them when set at this point.
os.environ['TF_CUDNN_DETERMINISTIC']='true'
os.environ['TF_DETERMINISTIC_OPS']='true'

## Download the dataset and extract it into the current working directory.

In [2]:
# Options shared by both downloads: store the archive in the current working
# directory (cache_dir='.' with an empty cache_subdir) and unzip it there.
_archive_options = dict(
    extract=True,
    archive_format='zip',
    cache_dir='.',
    cache_subdir='',
)

# Training archive.
urlTrainingSet = "https://github.com/marco-siino/DA-ESWA/raw/main/data/iss/iss-training-original.zip"
training_set = tf.keras.utils.get_file(
    "pan22-author-profiling-training-2022-03-29.zip",
    urlTrainingSet,
    **_archive_options,
)

# Test archive.
urlTestSet="https://github.com/marco-siino/DA-ESWA/raw/main/data/iss/iss-test-original.zip"
test_set = tf.keras.utils.get_file(
    "pan22-author-profiling-test-2022-04-22-without_truth.zip",
    urlTestSet,
    **_archive_options,
)

Downloading data from https://github.com/marco-siino/DA-ESWA/raw/main/data/iss/iss-training-original.zip
Downloading data from https://github.com/marco-siino/DA-ESWA/raw/main/data/iss/iss-test-original.zip


In [3]:
# The extracted folders live next to the downloaded .zip files; derive their
# paths from the directory of each archive.
_training_archive_parent = os.path.dirname(training_set)
_test_archive_parent = os.path.dirname(test_set)

training_set_dir = os.path.join(
    _training_archive_parent, 'pan22-author-profiling-training-2022-03-29')
test_set_dir = os.path.join(
    _test_archive_parent, 'pan22-author-profiling-test-2022-04-22-without_truth')

!ls -A

.config
pan22-author-profiling-test-2022-04-22-without_truth
pan22-author-profiling-test-2022-04-22-without_truth.zip
pan22-author-profiling-training-2022-03-29
pan22-author-profiling-training-2022-03-29.zip
sample_data


## Build the folder hierarchy required by the Keras directory-loading function.

In [4]:
### Training and test folders. ###

# Create one sub-folder per class label ('0' and '1') under each split root.
# `exist_ok=True` makes the cell safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`; os.makedirs also
# creates the first-level directory when it is missing.
for _split_root in ('train_dir_en', 'test_dir_en'):
    for _class_label in ('0', '1'):
        os.makedirs(os.path.join(_split_root, _class_label), exist_ok=True)

# Directory-name prefixes; the language code is appended where they are used.
train_dir='train_dir_'
test_dir='test_dir_'

!ls -A

.config
pan22-author-profiling-test-2022-04-22-without_truth
pan22-author-profiling-test-2022-04-22-without_truth.zip
pan22-author-profiling-training-2022-03-29
pan22-author-profiling-training-2022-03-29.zip
sample_data
test_dir_en
train_dir_en


## Set language and directory paths.


In [5]:
# Language subset handled by this notebook (only English is configured here).
language = 'en'

# Training split: truth.txt is read from the per-language sub-folder.
truth_file_training_dir_en = f"{training_set_dir}/{language}/"
truth_file_training_path_en = f"{truth_file_training_dir_en}truth.txt"

# Test split: truth.txt is read from the root of the extracted test folder.
truth_file_test_dir = test_set_dir
truth_file_test_path_en = f"{truth_file_test_dir}/truth.txt"

## Read truth.txt to organize training and test dataset folders.



In [6]:
def _sort_authors_by_label(truth_path, source_dir, dest_root):
    """Move each author's XML file into the folder of its class label.

    Every line of the truth file is expected to look like
    ``<author_id>:::<label>``. The first character of the label selects the
    class: 'I' -> '1', 'N' -> '0'; any other character is used unchanged.

    Args:
        truth_path: Path of the truth.txt file to read.
        source_dir: Directory (with trailing '/') holding the `<author_id>.xml` files.
        dest_root: Root directory containing one sub-folder per class label.
    """
    label_to_class = {'I': '1', 'N': '0'}

    # `with` guarantees the file handle is closed, even if a move fails.
    with open(truth_path, "r") as truth_file:
        for line in truth_file:
            fields = line.strip().split(":::")
            # Skip blank or malformed lines instead of raising IndexError.
            if len(fields) < 2 or not fields[1]:
                continue

            author_id = fields[0]
            # Only the first character of the label field is significant.
            first_char = fields[1][0]
            label = label_to_class.get(first_char, first_char)

            source_path = source_dir + author_id + '.xml'
            # Files already moved by a previous run no longer exist at the
            # source, so they are left alone.
            if os.path.exists(source_path):
                # The file is renamed to .txt while being moved.
                os.rename(source_path, dest_root + '/' + label + '/' + author_id + '.txt')


# Training split: XML files sit next to truth.txt in the language folder.
_sort_authors_by_label(
    truth_file_training_path_en,
    truth_file_training_dir_en,
    './train_dir_' + language,
)

# Test split: truth.txt is at the root, XML files are in the language folder.
_sort_authors_by_label(
    truth_file_test_path_en,
    truth_file_test_dir + '/' + language + '/',
    './test_dir_' + language,
)
     

## Generate full training set.



In [7]:
# Build the training and test datasets, one document per batch.
batch_size=1


def _load_text_split(directory):
  """Load a labelled text dataset from `directory` in its on-disk order."""
  return tf.keras.preprocessing.text_dataset_from_directory(
      directory,
      batch_size=batch_size,
      shuffle=False,
  )


en_train_ds = _load_text_split(train_dir+language)
en_test_ds = _load_text_split(test_dir+language)

# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the
# same order on every pass over the dataset.
train_ds = en_train_ds.shuffle(300, seed=1, reshuffle_each_iteration=False)
test_ds = en_test_ds.shuffle(200, seed=1, reshuffle_each_iteration=False)

train_ds_size = len(train_ds)
test_ds_size = len(test_ds)

Found 420 files belonging to 2 classes.
Found 180 files belonging to 2 classes.


## Functions to pre-process the source text. (See our paper for a detailed discussion.)

In [8]:
# Preprocessing function to remove some noise due to the translation.
def clean_samples(input_data):
  """Strip CDATA markers from a string tensor.

  Args:
    input_data: String tensor (or value accepted by tf.strings.regex_replace).

  Returns:
    The input with every '<![CDATA[' replaced by ' ' and every ']]>'
    replaced by ' >'.
  """
  # NOTE(review): the four replacements with an empty pattern and an empty
  # rewrite leave the text unchanged. They look like tag-stripping steps whose
  # patterns were lost (variable names suggest the author/documents tags) —
  # confirm against the original notebook before relying on them.
  tag_author_lang_en_removed = tf.strings.regex_replace(input_data,'', '')
  tag_opening_documents = tf.strings.regex_replace(tag_author_lang_en_removed,'', '')
  # Raw strings keep the regex bytes identical while avoiding Python's
  # invalid-escape-sequence warnings for '\!', '\[' and '\]'.
  tag_opening_cdata_removed = tf.strings.regex_replace(tag_opening_documents,r'<\!\[CDATA\[', ' ')
  tag_closing_cdata_removed = tf.strings.regex_replace(tag_opening_cdata_removed,r'\]\]>', ' >')
  tag_closing_documents = tf.strings.regex_replace(tag_closing_cdata_removed,'', '')
  output_data = tf.strings.regex_replace(tag_closing_documents,'', '')
  return output_data

## Get the length of the longest sample in training set. Then adapt text.



In [10]:
def preprocess_and_adapt_ts(training_set):
  """Adapt a TextVectorization layer sized to the longest training document.

  A first, over-sized layer is adapted only to measure how many tokens the
  longest document produces. A second layer is then created with exactly that
  output length and adapted on the same text.

  Args:
    training_set: Dataset yielding (text, label) batches. Only the first
      element of each batch is measured, so batches are assumed to hold a
      single document.

  Returns:
    The adapted TextVectorization layer whose output_sequence_length equals
    the measured longest document length.
  """
  # Upper bound used only while measuring. A document with more tokens than
  # this is cut by the probe layer, so the measured maximum is capped here.
  probe_sequence_length = 30000

  # Text-only view of the dataset, used by both adapt() calls.
  train_text = training_set.map(lambda x, y: x)

  probe_layer = TextVectorization(
      standardize=clean_samples,
      output_mode='int',
      output_sequence_length=probe_sequence_length)
  probe_layer.adapt(train_text)

  probe_model = tf.keras.models.Sequential()
  probe_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  probe_model.add(probe_layer)

  longest_sample_length = 1
  for author_document, _ in training_set:
    token_ids = probe_model(author_document).numpy()[0]
    # Drop trailing zeros (padding) so only real tokens are counted.
    token_ids = np.trim_zeros(token_ids, 'b')
    longest_sample_length = max(longest_sample_length, len(token_ids))

  print("Length of the longest sample is:", longest_sample_length)

  # After tokenization, longest_sample_length covers every document length
  # measured above, so it is used as the final output length.
  vectorize_layer = TextVectorization(
      standardize=clean_samples,
      output_mode='int',
      output_sequence_length=longest_sample_length)

  # Finally adapt the vectorize layer.
  vectorize_layer.adapt(train_text)
  return vectorize_layer

## Vectorization




In [11]:
print("\n\n* * * * VECTORIZATION STARTED * * * *")

# Build the vocabulary by adapting the vectorization layer on the training set.
vectorize_layer = preprocess_and_adapt_ts(train_ds)

# Feature-vector width used downstream: one slot more than the vocabulary size.
_vocabulary = vectorize_layer.get_vocabulary()
max_features = len(_vocabulary) + 1
print("Vocabulary size is:", max_features)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089




* * * * VECTORIZATION STARTED * * * *
Length of the longest sample is: 9808
Vocabulary size is: 187022


## Models definition and evaluation.




In [12]:
print("\n\n***** FINITO DI PROCESSARE E ADATTARE IL TRAINING SET, INIZIA LA SIMULAZIONE *******")
# Print a raw and a preprocessed sample as a sanity check.
for element in train_ds:
  authorDocument=element[0]
  label=element[1]

  print("Sample considered is: ", authorDocument[0])
  print("Preprocessed: ", str(clean_samples(authorDocument[0].numpy())))
  break

# # # - - - - - MODELS DEFINITION AND EVALUATION - - - - - # # #

# String-input model wrapping the adapted vectorization layer.
model = tf.keras.models.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorize_layer)

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# --- SVM SECTION START --- #

max_features=len(vectorize_layer.get_vocabulary()) + 1

# Built once and reused for every document: constructing a Keras Model per
# sample is loop-invariant work.
text_vect_layer_model = tf.keras.Model(inputs=model.input,
                                    outputs=model.layers[0].output)


def _to_token_count_features(dataset):
  """Turn a (text, label) dataset into token-count vectors and labels.

  Only the first document of each batch is used, so batches are assumed to
  hold a single document.

  Args:
    dataset: Dataset yielding (text, label) batches.

  Returns:
    (samples, labels): `samples` is a float array with one row per document
    and `max_features` columns, where column i counts occurrences of token
    id i in the vectorized document; `labels` is the matching label array.
  """
  samples = []
  labels = []
  for documents, batch_labels in dataset:
    token_ids = text_vect_layer_model(documents)[0].numpy()
    labels.append(batch_labels[0].numpy())
    # np.bincount performs the same per-id counting as a Python loop doing
    # `sample[token] += 1`. Token id 0 is counted as well, so column 0 also
    # reflects zero-padding added by the vectorization layer.
    samples.append(
        np.bincount(token_ids, minlength=max_features).astype(np.float64))
  return np.array(samples), np.array(labels)


training_samples, training_labels = _to_token_count_features(train_ds)
test_samples, test_labels = _to_token_count_features(test_ds)

# NOTE(review): per the scikit-learn docs `gamma` only applies to the
# rbf/poly/sigmoid kernels, so it should have no effect with kernel='linear'.
SVM = svm.SVC(C=0.5, kernel='linear', gamma='auto')
SVM.fit(training_samples,training_labels)

# Mean accuracy on the training set.
result=SVM.score(training_samples,training_labels)
print("SVM Accuracy Score on Training set -> ",result)

# Predict the labels of the test set.
predictions_SVM = SVM.predict(test_samples)
# Mean accuracy on the test set.
result=SVM.score(test_samples,test_labels)
print("SVM Accuracy Score on Test set -> ",result)

# --- SVM SECTION END --- #

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# # # - - - - - MODEL DEFINITION AND EVALUATION END  - - - - - # # #



***** FINITO DI PROCESSARE E ADATTARE IL TRAINING SET, INIZIA LA SIMULAZIONE *******
Sample considered is:  tf.Tensor(b'<author lang="en">\n\t<documents>\n\t\t<document><![CDATA[#USER# Obviously. He will definitely charge. The world is witness.]]></document>\n\t\t<document><![CDATA[You Know I m A Ho (Clean) Feat. Ice Cube by Master P now playing on #URL# #HASHTAG# #HASHTAG# #URL#]]></document>\n\t\t<document><![CDATA[Sometimes only accepting the reality ease ur suffering n pain...]]></document>\n\t\t<document><![CDATA[#USER# #USER# But he didn\'t ask for prayers for the devil, he asked for prayers Putin, for God to change his heart from doing evil.  That is how we pray for bad people, for God to change their hearts so they will turn away from evil. \nOur Lord Jesus died for sinners, that didn\'t make him evil.]]></document>\n\t\t<document><![CDATA[i been too nice lately. y\xe2\x80\x99all must\xe2\x80\x99ve forgot im the devil \xf0\x9f\x99\x83]]></document>\n\t\t<document><![CDATA[#US