<a href="https://colab.research.google.com/github/marco-siino/DA-ESWA/blob/main/code/augmentation/FNS_TestSet_Augmentation_IT_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing modules.

In [1]:
import os
import os.path
from os import path
import re
import shutil
import string
import tensorflow as tf

!pip install -U deep-translator
from deep_translator import GoogleTranslator
from io import open
from pathlib import Path
from google.colab import files

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deep-translator
  Downloading deep_translator-1.10.1-py3-none-any.whl (35 kB)
Collecting beautifulsoup4<5.0.0,>=4.9.1
  Downloading beautifulsoup4-4.11.2-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 KB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting soupsieve>1.2
  Downloading soupsieve-2.4-py3-none-any.whl (37 kB)
Installing collected packages: soupsieve, beautifulsoup4, deep-translator
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed beautifulsoup4-4.11.2 deep-translator-1.10.1 soupsieve-2.4


In [2]:
!pip install xmltodict
import xmltodict 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


## Importing DS and extract in current working directory.

In [3]:
# Url obtained starting from this: https://drive.google.com/file/d/19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP/ and forcing export=download.
train_set_url = 'https://github.com/marco-siino/DA-ESWA/raw/main/data/fns/fns-test-original.zip'  # train FNS
train_set_path = tf.keras.utils.get_file("pan20-author-profiling-test-2020-02-23.zip", train_set_url,
                                    extract=True, archive_format='zip',cache_dir='.',
                                    cache_subdir='')

train_set_dir = os.path.join(os.path.dirname(train_set_path), 'pan20-author-profiling-test-2020-02-23.zip')

print(train_set_path)
print(train_set_dir)

Downloading data from https://github.com/marco-siino/DA-ESWA/raw/main/data/fns/fns-test-original.zip
./pan20-author-profiling-test-2020-02-23.zip
./pan20-author-profiling-test-2020-02-23.zip


## Function to pre-process source text.

In [4]:
def custom_standardization(text):
  tag_open_CDATA_removed = re.sub('<\!\[CDATA\[', ' ',text)
  tag_closed_CDATA_removed = re.sub('\]{1,}>', ' ',tag_open_CDATA_removed)
  tag_author_lang_en_removed = re.sub('<author lang="en">', ' ',tag_closed_CDATA_removed)
  tag_author_lang_en_removed = re.sub('<author lang="en"', ' ',tag_closed_CDATA_removed)
  #only for test remove if needed
  tag_author_lang_en_removed = re.sub('class="0">', ' ', tag_author_lang_en_removed )
  tag_author_lang_en_removed = re.sub('class="1">', ' ', tag_author_lang_en_removed )
  ######
  tag_closed_author_removed = re.sub('</author>', '', tag_author_lang_en_removed)
  tag_closed_author_removed = re.sub('</author', '', tag_closed_author_removed)
  tag_open_documents_removed = re.sub('<documents>\n(\t){0,2}', '',tag_closed_author_removed)
  tag_closed_documents_removed = re.sub('</documents>\n(\t){0,2}', ' ',tag_open_documents_removed)
  lowercased_text = tag_closed_documents_removed.lower()
  return lowercased_text

## Data Augmentation function.

In [5]:
def chunkstring(string, length):
  res = list((string[0+i:length+i] for i in range(0, len(string), length)))
  return res

def enhance_one_sample(sample, TARGET='it', return_both=True):  
  preprocessed_text = custom_standardization(sample)

  #chunk to avoid character limits  
  TOBETRANS = chunkstring(preprocessed_text, 4000)
  translated_it = GoogleTranslator(source='en', target=TARGET).translate_batch(TOBETRANS)
  reversed_trans = GoogleTranslator(source=TARGET, target='en').translate_batch(translated_it)
  merged_chunks =''.join(reversed_trans)
  enhanced_sample = preprocessed_text+merged_chunks
  if return_both == False:
    enhanced_sample = merged_chunks
  return enhanced_sample

# Generating augmented train set.

In [6]:
## Generate the augmented training set as a zip file.

# pan22-author-profiling-training-2022-03-29-augmented.zip has to be created.
if not os.path.exists('pan20-author-profiling-test-2020-02-23-augmented'):
    os.makedirs('pan20-author-profiling-test-2020-02-23-augmented')
if not os.path.exists('pan20-author-profiling-test-2020-02-23-augmented/en/'):
    os.makedirs('pan20-author-profiling-test-2020-02-23-augmented/en/')

In [8]:
# Copy the ground truth file.
if os.path.exists('/content/pan20-author-profiling-test-2020-02-23'):
        shutil.copyfile('/content/pan20-author-profiling-test-2020-02-23/truth.txt', '/content/pan20-author-profiling-test-2020-02-23-augmented/truth.txt')

In [9]:
file_nr = 0
count_errors = 0
for filename in os.listdir('pan20-author-profiling-test-2020-02-23/en/'):
  file_nr += 1
  x = filename.split(".")      
  author_id = x[0]
  print("File nr.:", file_nr)
  print("Filename:", filename)
  text = open('pan20-author-profiling-test-2020-02-23/en/'+filename, 'r').read()
  try:
    enhanced_sample = enhance_one_sample(text, TARGET='it')
    #de_back = enhance_one_sample(text, TARGET='de', return_both=False)
    #enhanced_sample = enhanced_sample + de_back    
    f = open("pan20-author-profiling-test-2020-02-23-augmented/en/"+author_id+".xml", "a")
    f.write(enhanced_sample)
    f.close()
    print('Succes')
  except:
    print('!!! FAILED !!!!')
    count_errors += 1
    f = open("pan20-author-profiling-test-2020-02-23-augmented/en/"+author_id+".xml", "a")
    preprocessed_text = custom_standardization(text)
    f.write(preprocessed_text)
    f.close()

print('cunt err', count_errors)
!zip -r pan20-author-profiling-test-2020-02-23-augmented.zip pan20-author-profiling-test-2020-02-23-augmented
# If automatic download doesn't start, open the directory browser on the left menu and download the zip file manually.
files.download("pan20-author-profiling-test-2020-02-23-augmented.zip")

File nr.: 1
Filename: 66xfnef04078fl2cjtpl75lcoa3urlw5.xml
Succes
File nr.: 2
Filename: x8hzj2iwaod69ghfm2trykq4nda7wsya.xml
Succes
File nr.: 3
Filename: g8f4l0wj2xptb483ivgytzcslwnj24e4.xml
Succes
File nr.: 4
Filename: 6cbcvkpdf0hhh7or4xo9in3iwex4lq0s.xml
Succes
File nr.: 5
Filename: ldrqcujt9ytmblge312p0wi64abt0vdj.xml
Succes
File nr.: 6
Filename: pzzy8v2i20vq0cpijs6l3081938r7eev.xml
Succes
File nr.: 7
Filename: pg8ox5qkjf6dwapf1nhei3tf1ohn6fln.xml
Succes
File nr.: 8
Filename: 8p6ioi78mxizzakpagdmvmlphhwd0dgs.xml
Succes
File nr.: 9
Filename: l2hpq42zdxr794yeebxxd33lo4ytfupc.xml
Succes
File nr.: 10
Filename: riecahk95ttxibfy7oh1v0bxkiighjei.xml
Succes
File nr.: 11
Filename: f4bepnrk5576eun0bpmpvx0frd05ih54.xml
Succes
File nr.: 12
Filename: kgbdfnx8y7a9zxtvopmqesmcu91u0t48.xml
Succes
File nr.: 13
Filename: 8h0mkknjo83qd8bdazzb0bmo619c4juk.xml
Succes
File nr.: 14
Filename: 9uoaby907nei5mdzukbhspwi1q9l9k1h.xml
Succes
File nr.: 15
Filename: q7fvurr0w3b8q8pykkk0mq1phokumf01.xml
Succes
File

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>