<a href="https://colab.research.google.com/github/marco-siino/G-Lab_ISS-PanClef2022/blob/main/G-Lab_ISS2022_TrainSet_AugmentationNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing modules.

In [None]:
import os
import os.path
from os import path
import re
import shutil
import string
import tensorflow as tf

!pip install -U deep-translator
from deep_translator import GoogleTranslator
from io import open
from pathlib import Path
from google.colab import files

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Downloading the dataset and extracting it into the current working directory.

In [None]:
# Fetch the training archive and unpack it next to this notebook
# (cache_dir='.' combined with an empty cache_subdir).
# NOTE(review): an earlier note cited the Google Drive file
# https://drive.google.com/file/d/19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP/ as the
# origin; the URL actually used below is a GitHub raw link.
train_set_url = "https://github.com/marco-siino/iss/raw/main/pan22-author-profiling-training-2022-03-29.zip"

train_set_path = tf.keras.utils.get_file(
    fname="pan22-author-profiling-training-2022-03-29.zip",
    origin=train_set_url,
    cache_dir='.',
    cache_subdir='',
    extract=True,
    archive_format='zip',
)

# The extracted folder is expected to sit beside the downloaded zip file.
download_dir = os.path.dirname(train_set_path)
train_set_dir = os.path.join(download_dir, 'pan22-author-profiling-training-2022-03-29')

# Show where the archive and the extracted directory live.
print(train_set_path, train_set_dir, sep='\n')


Downloading data from https://github.com/marco-siino/iss/raw/main/pan22-author-profiling-training-2022-03-29.zip
./pan22-author-profiling-training-2022-03-29.zip
./pan22-author-profiling-training-2022-03-29


## Function to pre-process source text.

In [None]:
def custom_standardization(text):
  """Strip the XML wrapper markup from one author file and lowercase the result.

  Removed, in this order: CDATA openers, CDATA closers (one or more ']'
  followed by '>'), the '<author lang="en">' and '</author>' tags, and the
  '<documents>' / '</documents>' tags together with the newline and up to two
  tabs that follow them. Every removed token is replaced by a single space,
  except the opening '<documents>' tag, which is dropped without replacement.
  Any other markup (e.g. '<document>' tags) is left untouched.

  Args:
    text: content of one author XML file, as a str.

  Returns:
    The cleaned text, lowercased.
  """
  # Raw strings keep the regex escapes away from Python's own string-escape
  # processing; the previous plain literals contained invalid string escapes
  # that emit warnings on recent Python versions.
  substitutions = (
      (r'<\!\[CDATA\[', ' '),
      (r'\]{1,}>', ' '),
      (r'<author lang="en">', ' '),
      (r'</author>', ' '),
      (r'<documents>\n(\t){0,2}', ''),
      (r'</documents>\n(\t){0,2}', ' '),
  )
  cleaned = text
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.lower()

## Data Augmentation function.

In [None]:
def chunkstring(string, length):
  """Split `string` into consecutive slices of at most `length` characters.

  The final slice holds whatever remains and may be shorter. An empty input
  yields an empty list.
  """
  pieces = []
  for start in range(0, len(string), length):
    pieces.append(string[start:start + length])
  return pieces

def enhance_one_sample(sample, chunk_size=4000):
  """Augment one sample by appending its English -> Italian -> English back-translation.

  Args:
    sample: raw text of one author file; it is cleaned with
      custom_standardization() before translation.
    chunk_size: maximum number of characters per translated chunk. Defaults to
      4000, the value that was previously hard-coded.

  Returns:
    The pre-processed text immediately followed by its back-translated version
    (no separator is inserted between the two). If pre-processing leaves an
    empty string, that empty string is returned and no translation is requested.
  """
  preprocessed_text = custom_standardization(sample)

  # With nothing left to translate, chunkstring() would return an empty list.
  # deep-translator is understood to reject an empty batch - TODO confirm.
  # Either way there is no point in calling the service.
  if not preprocessed_text:
    return preprocessed_text

  # Chunk the text to stay below the translation service's character limit.
  chunks = chunkstring(preprocessed_text, chunk_size)
  translated_it = GoogleTranslator(source='en', target='it').translate_batch(chunks)
  reversed_trans = GoogleTranslator(source='it', target='en').translate_batch(translated_it)
  merged_chunks = ''.join(reversed_trans)
  return preprocessed_text + merged_chunks

## Generating the augmented training set.

In [None]:
## Generate the augmented training set as a zip file.

# pan22-author-profiling-training-2022-03-29-augmented.zip has to be created.
if not os.path.exists('pan22-author-profiling-training-2022-03-29-augmented'):
    os.makedirs('pan22-author-profiling-training-2022-03-29-augmented')
if not os.path.exists('pan22-author-profiling-training-2022-03-29-augmented/en/'):
    os.makedirs('pan22-author-profiling-training-2022-03-29-augmented/en/')

# Copy the ground truth file.
if os.path.exists('pan22-author-profiling-training-2022-03-29-augmented/en/'):
        shutil.copyfile('pan22-author-profiling-training-2022-03-29/en.txt', 'pan22-author-profiling-training-2022-03-29-augmented/en.txt')
file_nr = 0
for filename in os.listdir('pan22-author-profiling-training-2022-03-29/en/'):
  file_nr += 1
  x = filename.split(".")      
  author_id = x[0]
  print("File nr.:", file_nr)
  print("Filename:", filename)
  text = open('pan22-author-profiling-training-2022-03-29/en/'+filename, 'r').read()
  enhanced_sample = enhance_one_sample(text)
  f = open("pan22-author-profiling-training-2022-03-29-augmented/en/"+author_id+".xml", "a")
  f.write(enhanced_sample)
  f.close()

!zip -r pan22-author-profiling-training-2022-03-29-augmented.zip pan22-author-profiling-training-2022-03-29-augmented
# If automatic download doesn't start, open the directory browser on the left menu and download the zip file manually.
files.download("pan22-author-profiling-training-2022-03-29-augmented.zip")

File nr.: 1
Filename: 8366580547454f40202845e40937d8b3.xml
File nr.: 2
Filename: a8a32c9c58c0f9c471a21bff57c00cc1.xml
File nr.: 3
Filename: a768d66447525a6076095d1bc5993fb6.xml
File nr.: 4
Filename: 9e87910ee9abaa291831ae0361665a28.xml
File nr.: 5
Filename: 2abe4cc70bbbe8c5a95ff3a1c6f9ecf6.xml
File nr.: 6
Filename: cd2746d3bc250be7f8420e243b1824a8.xml
File nr.: 7
Filename: 4cf96aa55124ecf055f0cd8d47276f7.xml
File nr.: 8
Filename: 6efbf0c5385bde90583d02f14750f7d9.xml
File nr.: 9
Filename: 642f8facac770ae79b052260fdd18608.xml
File nr.: 10
Filename: 78402653a876735397c89cf7a6f5ac7e.xml
File nr.: 11
Filename: a5223473202c91bd25d53dabdedcdc76.xml
File nr.: 12
Filename: 19ae31d4a3786435f6cd55e4afd928c8.xml
File nr.: 13
Filename: b585c4415b1fe50f2c31fa1698b271b7.xml
File nr.: 14
Filename: 83205437d862bc13b42a6d2dd91fcbe4.xml
File nr.: 15
Filename: 3282625174626c99a268f788082a73b0.xml
File nr.: 16
Filename: 9f5175aa2c26a77842a6bd9bc57d3e21.xml
File nr.: 17
Filename: fbd731341643aeac385338c74c1

UnicodeDecodeError: ignored

In [None]:
## Generate the augmented training set as a zip file.

# pan22-author-profiling-training-2022-03-29-augmented.zip has to be created.
if not os.path.exists('pan22-author-profiling-training-2022-03-29-augmented'):
    os.makedirs('pan22-author-profiling-training-2022-03-29-augmented')
if not os.path.exists('pan22-author-profiling-training-2022-03-29-augmented/en/'):
    os.makedirs('pan22-author-profiling-training-2022-03-29-augmented/en/')

# Copy the ground truth file.
if os.path.exists('pan22-author-profiling-training-2022-03-29-augmented/en/'):
        shutil.copyfile('pan22-author-profiling-training-2022-03-29/en.txt', 'pan22-author-profiling-training-2022-03-29-augmented/en.txt')
file_nr = 0
for filename in os.listdir('pan22-author-profiling-training-2022-03-29/en/'):
  file_nr += 1
  x = filename.split(".")      
  author_id = x[0]
  print("File nr.:", file_nr)
  print("Filename:", filename)
  if(path.exists("pan22-author-profiling-training-2022-03-29-augmented/en/"+author_id+".xml")):
    print("Skip! Already augmented!")
  else:
    if(filename!=".DS_Store"):
      text = open('pan22-author-profiling-training-2022-03-29/en/'+filename, 'r').read()
      enhanced_sample = enhance_one_sample(text)
      f = open("pan22-author-profiling-training-2022-03-29-augmented/en/"+author_id+".xml", "a")
      f.write(enhanced_sample)
      f.close()

!zip -r pan22-author-profiling-training-2022-03-29-augmented.zip pan22-author-profiling-training-2022-03-29-augmented
# If automatic download doesn't start, open the directory browser on the left menu and download the zip file manually.
files.download("pan22-author-profiling-training-2022-03-29-augmented.zip")

File nr.: 1
Filename: 8366580547454f40202845e40937d8b3.xml
Skip! Already augmented!
File nr.: 2
Filename: a8a32c9c58c0f9c471a21bff57c00cc1.xml
Skip! Already augmented!
File nr.: 3
Filename: a768d66447525a6076095d1bc5993fb6.xml
Skip! Already augmented!
File nr.: 4
Filename: 9e87910ee9abaa291831ae0361665a28.xml
Skip! Already augmented!
File nr.: 5
Filename: 2abe4cc70bbbe8c5a95ff3a1c6f9ecf6.xml
Skip! Already augmented!
File nr.: 6
Filename: cd2746d3bc250be7f8420e243b1824a8.xml
Skip! Already augmented!
File nr.: 7
Filename: 4cf96aa55124ecf055f0cd8d47276f7.xml
Skip! Already augmented!
File nr.: 8
Filename: 6efbf0c5385bde90583d02f14750f7d9.xml
Skip! Already augmented!
File nr.: 9
Filename: 642f8facac770ae79b052260fdd18608.xml
Skip! Already augmented!
File nr.: 10
Filename: 78402653a876735397c89cf7a6f5ac7e.xml
Skip! Already augmented!
File nr.: 11
Filename: a5223473202c91bd25d53dabdedcdc76.xml
Skip! Already augmented!
File nr.: 12
Filename: 19ae31d4a3786435f6cd55e4afd928c8.xml
Skip! Already a

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>