<a href="https://colab.research.google.com/github/marco-siino/DA-ESWA/blob/main/code/augmentation/HSS_TrainingSet_Augmentation_DE_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing modules.

In [1]:
import os
import os.path
from os import path
import re
import shutil
import string
import tensorflow as tf

!pip install -U deep-translator
from deep_translator import GoogleTranslator
from io import open
from pathlib import Path
from google.colab import files

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deep-translator
  Downloading deep_translator-1.10.1-py3-none-any.whl (35 kB)
Collecting beautifulsoup4<5.0.0,>=4.9.1
  Downloading beautifulsoup4-4.11.2-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting soupsieve>1.2
  Downloading soupsieve-2.4-py3-none-any.whl (37 kB)
Installing collected packages: soupsieve, beautifulsoup4, deep-translator
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed beautifulsoup4-4.11.2 deep-translator-1.10.1 soupsieve-2.4


In [2]:
!pip install xmltodict
import xmltodict 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


## Downloading the dataset and extracting it into the current working directory.

In [3]:
# Download the cleaned HSS training archive from the DA-ESWA GitHub repository.
# With cache_dir='.' and cache_subdir='' the archive is stored (and unzipped) in the
# current working directory under the file name passed as `fname`.
# NOTE(review): an earlier comment referenced the Google Drive file
# https://drive.google.com/file/d/19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP/ as the origin of
# the URL; the URL actually used is the GitHub one below.
train_set_url = 'https://github.com/marco-siino/DA-ESWA/raw/main/data/hss/hss-training-cleaned.zip'  # train FNS
train_set_path = tf.keras.utils.get_file(
    fname="pan21-author-profiling-training-2021-03-14-augmented.zip",
    origin=train_set_url,
    extract=True,
    archive_format='zip',
    cache_dir='.',
    cache_subdir='',
)

# Rebuilt from the download folder plus the archive file name.
# NOTE(review): despite its name this is the path of the zip file, i.e. the same
# location as train_set_path - confirm whether the extracted folder was intended.
train_set_dir = os.path.join(
    os.path.dirname(train_set_path),
    'pan21-author-profiling-training-2021-03-14-augmented.zip',
)

# Show both locations, one per line.
print(train_set_path, train_set_dir, sep='\n')

Downloading data from https://github.com/marco-siino/DA-ESWA/raw/main/data/hss/hss-training-cleaned.zip
./pan21-author-profiling-training-2021-03-14-augmented.zip
./pan21-author-profiling-training-2021-03-14-augmented.zip


## Function to pre-process source text.

In [4]:
def custom_standardization(text):
  """Strip the XML wrapper markup from a raw sample and lowercase what is left.

  The substitutions are applied one after the other, each on the result of the
  previous one, so that a complete `<author lang="en">` tag is removed as a
  whole and no stray `>` is left behind.

  Args:
    text: Raw content of one sample file, as a string.

  Returns:
    The text with CDATA delimiters, `<author ...>` / `</author>` tags, the
    `class="0">` / `class="1">` attribute remainders and the
    `<documents>` / `</documents>` tags replaced by a space or removed,
    converted to lowercase.
  """
  # (pattern, replacement) pairs; the order matters because later patterns clean
  # up what is left over by earlier ones.
  substitutions = (
      (r'<\!\[CDATA\[', ' '),             # opening CDATA delimiter
      (r'\]{1,}>', ' '),                  # closing CDATA delimiter
      (r'<author lang="en">', ' '),       # complete author tag without attributes
      (r'<author lang="en"', ' '),        # author tag followed by further attributes
      # only for test remove if needed
      (r'class="0">', ' '),
      (r'class="1">', ' '),
      ######
      (r'</author>', ''),
      (r'</author', ''),
      (r'<documents>\n(\t){0,2}', ''),
      (r'</documents>\n(\t){0,2}', ' '),
  )

  cleaned = text
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.lower()

## Data Augmentation function.

In [5]:
def chunkstring(string, length):
  """Cut `string` into consecutive pieces of at most `length` characters.

  Args:
    string: The text to split.
    length: Maximum number of characters per piece.

  Returns:
    A list with the pieces in their original order; the last piece may be
    shorter than `length`. An empty string yields an empty list.
  """
  pieces = []
  for start in range(0, len(string), length):
    pieces.append(string[start:start + length])
  return pieces

def enhance_one_sample(sample, TARGET='it', return_both=True):
  """Augment one sample by translating it to `TARGET` and back to English.

  Args:
    sample: Raw text of one sample; it is cleaned with custom_standardization first.
    TARGET: Language code of the pivot language used for the round trip.
    return_both: When equal to False only the back-translated text is returned;
      otherwise the cleaned text followed by the back-translated text.

  Returns:
    The augmented sample as a single string.
  """
  cleaned_text = custom_standardization(sample)

  # The text is sent in 4000-character pieces to avoid character limits.
  pieces = chunkstring(cleaned_text, 4000)

  # English -> pivot language, then pivot language -> English.
  pivot_pieces = GoogleTranslator(source='en', target=TARGET).translate_batch(pieces)
  back_pieces = GoogleTranslator(source=TARGET, target='en').translate_batch(pivot_pieces)
  back_translated = ''.join(back_pieces)

  if return_both == False:
    return back_translated
  return cleaned_text + back_translated

## Generating the augmented training set.

In [6]:
## Make sure the folder tree of the augmented training set exists.

# The tree is the one that gets packed into
# pan21-author-profiling-training-2021-03-14-augmented.zip.
for _needed_dir in ('pan21-author-profiling-training-2021-03-14-augmented',
                    'pan21-author-profiling-training-2021-03-14-augmented/en/'):
    # Create the folder only when it is not there yet.
    if not os.path.exists(_needed_dir):
        os.makedirs(_needed_dir)

In [7]:
# Copy the ground truth file from the 'en' folder to the root of the augmented set.
# Paths are relative to the working directory, like in every other cell, so the
# copy also works when the notebook is not running from /content.
truth_source = 'pan21-author-profiling-training-2021-03-14-augmented/en/truth.txt'
truth_target = 'pan21-author-profiling-training-2021-03-14-augmented/truth.txt'
# Guard on the file itself: an existing folder without truth.txt must not raise.
if os.path.isfile(truth_source):
    shutil.copyfile(truth_source, truth_target)
else:
    print('Ground truth file not found, nothing copied:', truth_source)

In [8]:
# Back-translate (en -> de -> en) every author file and append the result to it.
file_nr = 0
count_errors = 0
samples_dir = 'pan21-author-profiling-training-2021-03-14-augmented/en/'
for filename in os.listdir(samples_dir):
  # Only the .xml files are samples. Other entries of this folder (e.g. the
  # truth.txt ground-truth file) must not be translated, otherwise an extra
  # "<name>.xml" pseudo-sample would be created for them.
  author_id, extension = os.path.splitext(filename)
  if extension.lower() != '.xml':
    continue

  file_nr += 1
  print("File nr.:", file_nr)
  print("Filename:", filename)

  # Read with an explicit encoding and close the handle right away.
  with open(os.path.join(samples_dir, filename), 'r', encoding='utf-8') as source_file:
    text = source_file.read()

  # The augmented text is appended ("a") to <author_id>.xml in the same folder.
  target_path = os.path.join(samples_dir, author_id + ".xml")
  try:
    enhanced_sample = enhance_one_sample(text, TARGET='de')
    #de_back = enhance_one_sample(text, TARGET='de', return_both=False)
    #enhanced_sample = enhanced_sample + de_back
    with open(target_path, "a", encoding='utf-8') as f:
      f.write(enhanced_sample)
    print('Succes')
  except Exception as error:
    # Best effort: when the translation (or the write) fails, fall back to
    # appending the pre-processed text only. `Exception` instead of a bare
    # `except` keeps Ctrl-C / kernel interrupts working.
    print('!!! FAILED !!!!')
    print('Reason:', repr(error))
    count_errors += 1
    preprocessed_text = custom_standardization(text)
    with open(target_path, "a", encoding='utf-8') as f:
      f.write(preprocessed_text)

# Number of files for which the fallback above was used.
print('count err', count_errors)
# Pack the whole augmented folder tree into a zip archive (IPython shell escape).
# NOTE(review): the archive name is the same file name the dataset was downloaded
# to at the top of the notebook, so zip presumably updates that existing archive
# instead of creating a fresh one - confirm this is intended.
!zip -r pan21-author-profiling-training-2021-03-14-augmented.zip pan21-author-profiling-training-2021-03-14-augmented
# Ask the browser to download the archive from the Colab runtime.
# If automatic download doesn't start, open the directory browser on the left menu and download the zip file manually.
files.download("pan21-author-profiling-training-2021-03-14-augmented.zip")

File nr.: 1
Filename: 7f269488a6576c9dc21085c1e2854142.xml
Succes
File nr.: 2
Filename: bbd46b0659de5a173339aaebac2523fa.xml
Succes
File nr.: 3
Filename: 62a6b5a0c5f53790c114639c7ec0a3ab.xml
Succes
File nr.: 4
Filename: 98e4c6520892b0218ab13ca7369785be.xml
Succes
File nr.: 5
Filename: d15ec49115f2e8febee7fda9d1893fa4.xml
Succes
File nr.: 6
Filename: b496caf332cb0ba97d2acefc44f153ac.xml
Succes
File nr.: 7
Filename: f00627537c48b43bf2045b98b3508d94.xml
Succes
File nr.: 8
Filename: 1a91d52030d1a433d35055fbeb6bdf3b.xml
Succes
File nr.: 9
Filename: 9d08913250938aadcf6c18d0c89a0d14.xml
Succes
File nr.: 10
Filename: 78b27238932fc8e666c5bf84681b460e.xml
Succes
File nr.: 11
Filename: 6f964e5458b4879513ebd14784180798.xml
Succes
File nr.: 12
Filename: 6fe4ed1ee7f13da10a62d3cd1a6cba07.xml
Succes
File nr.: 13
Filename: 5b5c08b665052e695064f2f2b04bdea7.xml
Succes
File nr.: 14
Filename: db83cb088416759db148c17b256a9652.xml
Succes
File nr.: 15
Filename: 10b2d013382e1fb3c9414ea28329f258.xml
Succes
File

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>