<a href="https://colab.research.google.com/github/marco-siino/G-Lab_ISS-PanClef2022/blob/main/G-Lab_ISS2022_DataEnhancement%2BCNN_ModelNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing modules.

In [None]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np
import pandas as pd

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from google.colab import files
from io import open
from pathlib import Path

## Download the dataset and extract it into the current working directory.

In [None]:
# Url obtained starting from this: https://drive.google.com/file/d/19ZcqEv88euKB71HfAWjTGN3uCKp2qsfP/ and forcing export=download.
train_set_url = "https://github.com/marco-siino/iss/raw/main/pan22-author-profiling-training-2022-03-29-augmented.zip"
test_set_url="https://github.com/marco-siino/iss/raw/main/pan22-author-profiling-test-2022-04-22-without_truth-augmented.zip"

train_set_path = tf.keras.utils.get_file("pan22-author-profiling-training-2022-03-29-augmented.zip", train_set_url,
                                    extract=True, archive_format='zip',cache_dir='.',
                                    cache_subdir='')
test_set_path = tf.keras.utils.get_file("pan22-author-profiling-test-2022-04-22-without_truth-augmented.zip", test_set_url,
                                  extract=True, archive_format='zip',cache_dir='.',
                                  cache_subdir='')

train_set_dir = os.path.join(os.path.dirname(train_set_path), 'pan22-author-profiling-training-2022-03-29-augmented')
test_set_dir = os.path.join(os.path.dirname(test_set_path), 'pan22-author-profiling-test-2022-04-22-without_truth-augmented')

print(train_set_path)
print(train_set_dir)

!ls -A

Downloading data from https://github.com/marco-siino/iss/raw/main/pan22-author-profiling-training-2022-03-29-augmented.zip
Downloading data from https://github.com/marco-siino/iss/raw/main/pan22-author-profiling-test-2022-04-22-without_truth-augmented.zip
./pan22-author-profiling-training-2022-03-29-augmented.zip
./pan22-author-profiling-training-2022-03-29-augmented
.config
pan22-author-profiling-test-2022-04-22-without_truth-augmented
pan22-author-profiling-test-2022-04-22-without_truth-augmented.zip
pan22-author-profiling-training-2022-03-29-augmented
pan22-author-profiling-training-2022-03-29-augmented.zip
sample_data


## Build the folder hierarchy required by the Keras `text_dataset_from_directory` function.

In [None]:
### Training Folders. ###
# First level directory.
if not os.path.exists('train_dir'):
    os.makedirs('train_dir')

# Class labels directory.
if not os.path.exists('train_dir/0'):
    os.makedirs('train_dir/0')
if not os.path.exists('train_dir/1'):
    os.makedirs('train_dir/1')

!ls -A

.config
pan22-author-profiling-test-2022-04-22-without_truth-augmented
pan22-author-profiling-test-2022-04-22-without_truth-augmented.zip
pan22-author-profiling-training-2022-03-29-augmented
pan22-author-profiling-training-2022-03-29-augmented.zip
sample_data
train_dir


## Set language and directory paths.


In [None]:
# Folder that holds the training truth file (trailing slash included so that
# file names can be appended directly).
truth_file_train_dir = f"{train_set_dir}/"
# Truth file for the English training set.
truth_file_training_path = f"{truth_file_train_dir}en.txt"

## Read the truth file (`en.txt`) to organize the training dataset folders.



In [None]:
# Read the truth file and move every listed author document into the class
# folder that matches its label.
# Each line is split at ":::" into "<author id>:::<label>"; a label starting
# with 'I' becomes class 1, anything else (NI) becomes class 0.
# The `with` block guarantees the truth file is closed, even on errors.
with open(truth_file_training_path, "r") as f:
    for line in f:
        # Split line at :::
        x = line.split(":::")
        # Skip blank or malformed lines instead of crashing or mislabelling.
        if len(x) < 2 or not x[1].strip():
            continue

        fNameXml = x[0]+'.xml'
        fNameTxt = x[0]+'.txt'

        # Only the first character of the label field is inspected, so the
        # trailing newline is ignored. Change classes NI->0 and I->1.
        label = 1 if x[1][0]=='I' else 0

        # Now move the file to the right folder (renamed from .xml to .txt).
        # Files that were already moved by a previous run are skipped.
        source_path = truth_file_train_dir+'en/'+fNameXml
        if os.path.exists(source_path):
            os.rename(source_path, './train_dir/'+str(label)+'/'+fNameTxt)

## Building the dataset.

In [None]:
# One document per batch: later cells read the single sample of each batch
# through index [0].
batch_size = 1

# Files are loaded in directory order and then shuffled exactly once with a
# fixed seed. The buffer (420) matches the number of files found, and
# reshuffle_each_iteration=False keeps the order stable across iterations,
# which the later take()/skip() based splits depend on.
full_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'train_dir',
    batch_size=batch_size,
    shuffle=False,
    seed=1,
).shuffle(
    buffer_size=420,
    seed=1,
    reshuffle_each_iteration=False,
)

# Number of batches (equal to the number of samples, since batch_size is 1).
full_train_ds_size = len(full_train_ds)

Found 420 files belonging to 2 classes.


## First model's layer: Text Vectorization.

In [None]:
# Function to generate a text_vectorization_layer.
def gen_text_vectorization_layer(train_set):
    """Build a TextVectorization layer sized to fit `train_set`.

    A first, oversized vectorization layer is adapted on the training texts
    and used to tokenize every sample. From those tokenized samples the
    function derives the length of the longest sample and the largest token
    id, then builds and adapts the final layer with exactly those limits.

    Args:
        train_set: dataset yielding (text, label) pairs; only the text part
            is used.

    Returns:
        A tuple (vectorize_layer, max_nr_dictionary_entry): the adapted
        TextVectorization layer and the dictionary size, i.e. the largest
        token id observed plus one.
    """
    # Set a very large sequence length to find the longest sample.
    # Samples with more tokens than this are truncated by the layer.
    probe_sequence_length = 30000
    probe_layer = TextVectorization(
        standardize=None,
        output_mode='int',
        output_sequence_length=probe_sequence_length)

    train_text = train_set.map(lambda x, y: x)
    probe_layer.adapt(train_text)

    # Minimal model that only tokenizes raw strings.
    probe_model = tf.keras.models.Sequential()
    probe_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    probe_model.add(probe_layer)

    longest_sample_length = 1
    max_nr_dictionary_entry = 1

    for element in train_set:
        author_documents = element[0]

        # Convert to NumPy once per batch; rows are samples, columns are
        # token positions, and 0 marks padding.
        token_ids = probe_model(author_documents).numpy()

        # Columns holding at least one real token. The last one gives the
        # length of the longest sample in this batch.
        used_columns = np.flatnonzero(token_ids.any(axis=0))
        if used_columns.size == 0:
            # Batch made only of empty documents: nothing to measure.
            continue

        longest_sample_length = max(longest_sample_length,
                                    int(used_columns[-1]) + 1)

        # Get greater token value (to set dictionary size).
        max_nr_dictionary_entry = max(max_nr_dictionary_entry,
                                      int(token_ids.max()))

    # Be sure to include each ngram.
    max_nr_dictionary_entry += 1

    print("Length of the longest sample in train set is:",longest_sample_length)
    print("Dictionary size is:",max_nr_dictionary_entry)

    # Final layer: no wasted padding, vocabulary capped at the observed size.
    vectorize_layer = TextVectorization(
        standardize=None,
        max_tokens=max_nr_dictionary_entry,
        output_mode='int',
        output_sequence_length=longest_sample_length)
    vectorize_layer.adapt(train_text)

    return vectorize_layer,max_nr_dictionary_entry

## Split the full training set to perform 5-fold cross-validation.




In [None]:
# 5-fold cross-validation layout (V marks the validation slice of train_ds):
#
#   fold 1 -> 80% train | 20% V
#   fold 2 -> 60% train | 20% V | 20% train
#   fold 3 -> 40% train | 20% V | 40% train
#   fold 4 -> 20% train | 20% V | 60% train
#   fold 5 ->             20% V | 80% train

train = []
val = []
test = []

# Development / hold-out split of the full (already shuffled) dataset:
#
#         380           40        TOT = 420
# |_________________| |_____|
#       train_ds      test_set
#
# train_ds (train + val) is used for model development; test_set is kept for
# the end of the development phase (a little less than 10% of the 420 samples).
full_train_ds_size = len(full_train_ds)
train_ds = full_train_ds.take(380)
test_set = full_train_ds.skip(380)

# The validation window is expressed in percent of train_ds. It starts on the
# last 20% and slides 20 points towards the beginning at every fold
# (each fold: 304 training samples, 76 validation samples).
fold_nr = 5
val_percentage_size = 20
val_percentage_start = 80
val_percentage_end = 100

for fold_index in range(fold_nr):
  window_first = int(len(train_ds)*val_percentage_start/100)
  window_last = int(len(train_ds)*val_percentage_end/100)
  window_length = int(len(train_ds)*val_percentage_size/100)

  # Training data: everything before the window plus everything after it.
  before_window = train_ds.take(window_first)
  after_window = train_ds.skip(window_last)
  train.append(before_window.concatenate(after_window))

  # Validation data: the window itself.
  val.append(train_ds.skip(window_first).take(window_length))

  # Slide the window for the next fold.
  val_percentage_start -= val_percentage_size
  val_percentage_end -= val_percentage_size

## CNN model definition and training.

In [9]:
# Train one CNN per cross-validation fold and collect, for each fold, the
# mean of its accuracy on the held-out test_set and on its validation split.

# Word embedding dimensions.
embedding_dim = 100
# Number of training epochs for every fold.
epochs = 5

cnn = []
max_features = []

cnn_preds_results =[]

for current_fold in range(0,fold_nr):
  print("\n\nFold nr.: ", current_fold)

  # The vectorization layer is fitted on this fold's training split only.
  vectorize_layer, max_features_tmp = gen_text_vectorization_layer(train[current_fold])
  max_features.append(max_features_tmp)

  cnn.append(tf.keras.Sequential([
                              tf.keras.Input(shape=(1,), dtype=tf.string),
                              vectorize_layer,
                              layers.Embedding(max_features[current_fold] + 1, embedding_dim),
                              layers.Dropout(0.4),

                              layers.Conv1D(64,16),
                              layers.MaxPooling1D(),
                              layers.Dropout(0.5),

                              layers.GlobalAveragePooling1D(),
                              # Single logit output (no activation).
                              layers.Dense(1)
    ]))

  opt = tf.keras.optimizers.Adam()
  # The model emits logits, hence from_logits=True and a decision threshold
  # of 0.0 for the accuracy metric.
  cnn[current_fold].compile(
      loss=losses.BinaryCrossentropy(from_logits=True),
      optimizer=opt,
      metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])

  history = cnn[current_fold].fit(
      train[current_fold],
      validation_data=val[current_fold],
      epochs=epochs,
      shuffle=False,
      # Comment the following line to do not save and download the model.
      #callbacks=[callbacks]
      )

  # Evaluate each set exactly once and reuse the result; evaluate() returns
  # [loss, accuracy], so index 1 is the accuracy.
  test_accuracy = cnn[current_fold].evaluate(test_set)[1]
  val_accuracy = cnn[current_fold].evaluate(val[current_fold])[1]
  avg_accuracy = (test_accuracy + val_accuracy) / 2

  print("AVG accuracy on Val and Test set is:", str(avg_accuracy))
  cnn_preds_results.append(avg_accuracy)

print("\n\n******************************************************")
print("\n\nOver five folds AVG accuracy is:", str(sum(cnn_preds_results)/fold_nr))



Fold nr.:  0
Length of the longest sample in train set is: 20382
Dictionary size is: 165944
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
AVG accuracy on Val and Test set is: 0.8973684012889862


Fold nr.:  1
Length of the longest sample in train set is: 20382
Dictionary size is: 165152
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
AVG accuracy on Val and Test set is: 0.8717105388641357


Fold nr.:  2
Length of the longest sample in train set is: 20153
Dictionary size is: 165393
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
AVG accuracy on Val and Test set is: 0.9230263233184814


Fold nr.:  3
Length of the longest sample in train set is: 20382
Dictionary size is: 165420
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
AVG accuracy on Val and Test set is: 0.8967105448246002


Fold nr.:  4
Length of the longest sample in train set is: 20382
Dictionary size is: 167808
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
AVG accuracy on Val and Test set is: 0.9164473712444305


In [None]:
## Generate predictions and write into separate XML files. (1 file -> 1 author prediction)

# Output layout: glab_iss2022task_predictions/<author id>.xml
predictions_dir = 'glab_iss2022task_predictions'
if not os.path.exists(predictions_dir):
    os.makedirs(predictions_dir)

# Folder with the (unlabelled) test documents, one file per author.
test_docs_dir = 'pan22-author-profiling-test-2022-04-22-without_truth-augmented/en/'
# Scratch folder used to feed one author at a time to Keras.
tmp_root_dir = 'tmp_test_author_dir'

# NOTE(review): only the model trained on fold 1 is used for the final
# predictions -- confirm this is the intended model.
prediction_model = cnn[1]

languages=['en']
prediction_counter=0
for language in languages:
  for filename in os.listdir(test_docs_dir):
    if filename!=".txt":
      prediction_counter+=1
      print("\nFile nr.: ", prediction_counter)

      # The author id is the part of the file name before the first dot.
      author_id = filename.split(".")[0]
      print("Filename:", filename)

      # text_dataset_from_directory expects <root>/<class folder>/<file>.txt,
      # so the author's document is copied into a folder of its own.
      author_tmp_dir = tmp_root_dir+'/'+author_id
      if not os.path.exists(author_tmp_dir):
        os.makedirs(author_tmp_dir)

      model_output = None
      try:
        if os.path.exists(test_docs_dir+filename):
          shutil.copyfile(test_docs_dir+filename, author_tmp_dir+'/'+author_id+'.txt')

        test_ds = tf.keras.preprocessing.text_dataset_from_directory(
          tmp_root_dir,
          batch_size=1,
          shuffle=False
          )

        # Predict on the author's own document (the test dataset built just
        # above), running the model once per sample.
        for current_sample in test_ds:
          model_output = prediction_model.predict(current_sample[0])
      finally:
        # Always remove the scratch folder so it cannot leak into the
        # dataset built for the next author.
        shutil.rmtree(author_tmp_dir)

      if model_output is None:
        raise RuntimeError("No document could be loaded for author " + author_id)

      # The model outputs a logit: positive means class I, otherwise NI.
      if model_output[0][0]>0.0:
        prediction="I"
      else:
        prediction="NI"

      print("Author id:",author_id)
      print("Language:", language)
      print("Class predicted:", prediction)
      print("Model output: ", model_output)
      xml_content= "<author id=\"" + author_id + "\" lang=\"" + language + "\" type=\"" + str(prediction) + "\" />"

      # Write mode (not append) so re-running the cell replaces the file
      # instead of duplicating the XML element.
      with open(predictions_dir+"/"+author_id+".xml", "w") as xml_file:
        xml_file.write(xml_content)

## Zip and Download the predictions (remember to set Callback to use this!).

# Pack the whole predictions folder (recursively) into a single zip archive
# in the current working directory.
!zip -r glab_iss2022task_predictions.zip glab_iss2022task_predictions
# If automatic download doesn't start, open the directory browser on the left menu and download the zip file manually.
# `files` is google.colab.files (imported in the first cell), so this line only works inside Colab.
files.download("glab_iss2022task_predictions.zip")