## Install packages

In [1]:
# !pip install xmltodict
# !pip install untangle
# !pip install pandas

## Import libraries

In [1]:
import json, xmltodict, os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Functions

In [2]:
# Root folders shared by every dataset path defined in this cell
_DATASETS_ROOT = '../Author profiling/PAN19-Author-Profiling/Datasets/'
_TRAIN_ROOT = _DATASETS_ROOT + 'pan19-author-profiling-training-2019-02-18/'
_TEST_ROOT = _DATASETS_ROOT + 'pan19-author-profiling-test-2019-04-29/'

# Training split: one folder and one labels file per language
path_train_en = _TRAIN_ROOT + 'en/'
path_train_es = _TRAIN_ROOT + 'es/'
path_train_labels_en = _TRAIN_ROOT + 'en.txt'
path_train_labels_es = _TRAIN_ROOT + 'es.txt'

# Test split: same layout as the training split
path_test_en = _TEST_ROOT + 'en/'
path_test_es = _TEST_ROOT + 'es/'
path_test_labels_en = _TEST_ROOT + 'en.txt'
path_test_labels_es = _TEST_ROOT + 'es.txt'

# Folder where the generated CSV files are written
path_to_save = _DATASETS_ROOT + 'CSV/'

In [3]:
# TODO: remove this cell
# Early-bird dataset: source folders, JSON output folders and label files.
_EARLYBIRDS = 'pan19-author-profiling-earlybirds-20190320/'

path_en = _EARLYBIRDS + 'en/'
path_es = _EARLYBIRDS + 'es/'
path_to_save_en = 'JSON/' + _EARLYBIRDS + 'en/'
path_to_save_es = 'JSON/' + _EARLYBIRDS + 'es/'
path_labels_en = _EARLYBIRDS + 'en.txt'
path_labels_es = _EARLYBIRDS + 'es.txt'

In [4]:
def get_list_names(path):
    """Return the extension-less names of the data files located directly in `path`.

    Sub-directories are skipped, and so is every file whose name contains
    '.txt'.

    Parameters
    ----------
    path : str
        Folder to scan (not searched recursively).

    Returns
    -------
    list of str
        File names with their last extension removed, sorted alphabetically.
    """
    file_names = []

    for filename in os.listdir(path):
        if not os.path.isfile(os.path.join(path, filename)):
            continue
        if '.txt' in filename:
            continue
        # splitext drops only the final extension, so a name such as
        # 'user.v2.xml' keeps its inner dot instead of being cut to 'user'.
        file_names.append(os.path.splitext(filename)[0])

    # os.listdir returns entries in arbitrary order; sorting makes every
    # downstream result reproducible between runs and machines.
    return sorted(file_names)

In [5]:
# Collect the extension-less file names found in each dataset folder

# Training split (English, Spanish)
file_names_training_en, file_names_training_es = [
    get_list_names(folder) for folder in (path_train_en, path_train_es)
]

# Test split (English, Spanish)
file_names_test_en, file_names_test_es = [
    get_list_names(folder) for folder in (path_test_en, path_test_es)
]

In [6]:
# Report how many files were found for each split and language
print(
    'Total data',
    f'Training --> (EN): {len(file_names_training_en)}, (ES): {len(file_names_training_es)}',
    f'Test --> (EN): {len(file_names_test_en)}, (ES): {len(file_names_test_es)}',
    sep='\n',
)

Total data
Training --> (EN): 4120, (ES): 3000
Test --> (EN): 2640, (ES): 1800


In [7]:
def check_path(file_name):
    """Return `file_name` unchanged.

    This is a pass-through hook kept so existing callers keep working. The
    previous version branched on whether the first character was a digit, but
    both branches returned the same value, and indexing `file_name[0]` raised
    IndexError for an empty string.

    Parameters
    ----------
    file_name : str
        File name without extension.

    Returns
    -------
    str
        The same `file_name` that was passed in.
    """
    return file_name

In [8]:
def xml_to_json(path, file_names, path_to_save):
    """Convert `<name>.xml` files found in `path` into `<name>.json` files.

    Each XML file is parsed with `xmltodict` and the resulting mapping is
    written as indented JSON under the same base name in `path_to_save`.

    Parameters
    ----------
    path : str
        Folder containing the XML files.
    file_names : iterable of str
        Base names (without extension) of the files to convert.
    path_to_save : str
        Folder that receives the JSON files; created if it does not exist.

    Returns
    -------
    None
    """
    # Create the output folder up front so a fresh checkout does not fail on
    # the first write. An empty string means "current directory".
    if path_to_save:
        os.makedirs(path_to_save, exist_ok=True)

    for file_name in file_names:
        file_name = check_path(file_name)

        # os.path.join gives the same result as plain concatenation when the
        # folder ends with a separator, and still works when it does not.
        source_path = os.path.join(path, f'{file_name}.xml')
        with open(source_path, encoding='utf-8') as xml_file:
            parsed = xmltodict.parse(xml_file.read())

        # Serialise the parsed document as indented JSON text
        json_text = json.dumps(parsed, indent=2)

        # Explicit encoding keeps the output independent of the platform default
        saving_path = os.path.join(path_to_save, f'{file_name}.json')
        with open(saving_path, 'w', encoding='utf-8') as json_file:
            json_file.write(json_text)

In [15]:
# Convert the early-bird XML files to JSON.
# The file-name lists are built here from the source folders because
# `file_names_en` / `file_names_es` are not defined by any visible cell, which
# made this cell fail with NameError after Restart Kernel -> Run All.
xml_to_json(path_en, get_list_names(path_en), path_to_save_en)
xml_to_json(path_es, get_list_names(path_es), path_to_save_es)

In [9]:
def get_labels(labels_path, file_name):
    """Look up the labels stored for `file_name` in a ':::'-separated labels file.

    Every line of the file is read as three fields, named `id`, `author` and
    `gender` in that order.

    Parameters
    ----------
    labels_path : str
        Path of the labels file.
    file_name : str
        Identifier to look up in the `id` field.

    Returns
    -------
    tuple
        `(author, gender)` taken from the first line whose id matches.

    Raises
    ------
    IndexError
        If no line of the labels file has `file_name` as its id.
    """
    # dtype=str keeps ids as text: without it an all-digit id column is parsed
    # as integers, so the comparison with the string `file_name` never matches
    # and zero-padded ids such as '001' lose their padding.
    df_labels = pd.read_csv(labels_path, sep=':::', names=['id', 'author', 'gender'],
                            engine='python', dtype=str)

    matches = df_labels.loc[df_labels['id'] == file_name, ['author', 'gender']]
    if matches.empty:
        # Same exception type as before, but with a message that names the cause
        raise IndexError(f'id {file_name!r} not found in labels file {labels_path!r}')

    author, gender = matches.iloc[0]
    return author, gender

In [10]:
def json_to_csv(path, labels_path, file_names):
    """Build one DataFrame with a row per document of every listed JSON file.

    For each name the file `<name>.json` is read from `path`, its entries
    under `author -> documents -> document` become the `tweet` column, and the
    labels returned by `get_labels` are repeated on every row of that file.

    Parameters
    ----------
    path : str
        Folder containing the JSON files.
    labels_path : str
        Labels file handed to `get_labels`.
    file_names : iterable of str
        Base names (without extension) of the JSON files to load.

    Returns
    -------
    pandas.DataFrame
        Columns `id`, `tweet`, `author`, `gender` with a fresh 0..n-1 index;
        empty (columns only) when `file_names` is empty.
    """
    columns = ['id', 'tweet', 'author', 'gender']
    frames = []

    for file_name in file_names:
        # Normalise the file name
        file_name = check_path(file_name)

        # Load the JSON file that belongs to this name
        with open(os.path.join(path, f'{file_name}.json'), encoding='utf-8') as f:
            json_file = json.load(f)

        documents = json_file['author']['documents']['document']
        # A single child element is not wrapped in a list; wrap it here so the
        # row count is the number of documents, not the length of a string.
        if not isinstance(documents, list):
            documents = [documents]
        file_len = len(documents)

        # Labels stored for this file
        author, gender = get_labels(labels_path, file_name)

        frames.append(pd.DataFrame({'id': [file_name] * file_len,
                                    'tweet': documents,
                                    'author': [author] * file_len,
                                    'gender': [gender] * file_len},
                                   columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)

    # One concat at the end replaces the per-file DataFrame.append, which was
    # removed in pandas 2.0 and copied the whole accumulated frame every time.
    return pd.concat(frames, ignore_index=True)

In [11]:
# Folders holding the JSON version of each dataset split
_JSON_ROOT = '../Author profiling/PAN19-Author-Profiling/Datasets/JSON/'
_JSON_TRAIN = _JSON_ROOT + 'pan19-author-profiling-training-2019-02-18/'
_JSON_TEST = _JSON_ROOT + 'pan19-author-profiling-test-2019-04-29/'

json_train_path_en = _JSON_TRAIN + 'en/'
json_train_path_es = _JSON_TRAIN + 'es/'

json_test_path_en = _JSON_TEST + 'en/'
json_test_path_es = _JSON_TEST + 'es/'

# Parallel lists: the same position in every list describes the same split
json_paths = [
    json_train_path_en,
    json_train_path_es,
    json_test_path_en,
    json_test_path_es,
]
labels = [
    path_train_labels_en,
    path_train_labels_es,
    path_test_labels_en,
    path_test_labels_es,
]
filenames = [
    file_names_training_en,
    file_names_training_es,
    file_names_test_en,
    file_names_test_es,
]
json_path_names = ['Train-EN', 'Train-ES', 'Test-EN', 'Test-ES']

In [14]:
# Build one CSV per dataset split and report how many rows each one holds
for i, (json_dir, labels_file, names) in enumerate(zip(json_paths, labels, filenames)):
    data = json_to_csv(json_dir, labels_file, names)
    data.to_csv(f'{path_to_save}{json_path_names[i]}.csv', encoding='utf-8')
    print(f'Total CSV data: {data.shape[0]}')

Total CSV data: 412000
Total CSV data: 300000
Total CSV data: 264000
Total CSV data: 180000
