# Extracting and cleaning data

## Import libraries

In [1]:
import sys
sys.path.append('../../../Scripts/')

In [2]:
import json, xmltodict, os
import text_preprocessing_v3 as tpv3
import numpy as np, pandas as pd
import warnings
warnings.filterwarnings("ignore")

DEPRECATION: https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0-py3-none-any.whl#egg=es_core_news_sm==3.3.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617


Collecting es-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.3.0/es_core_news_sm-3.3.0-py3-none-any.whl (12.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.9/12.9 MB 14.5 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


[nltk_data] Downloading package stopwords to /Users/mash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Extracting data

### Paths

In [2]:
# --- Input corpus (pan20 author-profiling), grouped by language ---

# English: folders with the per-author XML files, plus the label listings
path_train_en = '../Datasets/pan20-author-profiling-training-2020-02-23/en/'
path_test_en = '../Datasets/pan20-author-profiling-test-2020-02-23/en/'
path_train_labels_en = '../Datasets/pan20-author-profiling-training-2020-02-23/en.txt'
path_test_labels_en = '../Datasets/pan20-author-profiling-test-2020-02-23/en.txt'

# Spanish: same layout as English
path_train_es = '../Datasets/pan20-author-profiling-training-2020-02-23/es/'
path_test_es = '../Datasets/pan20-author-profiling-test-2020-02-23/es/'
path_train_labels_es = '../Datasets/pan20-author-profiling-training-2020-02-23/es.txt'
path_test_labels_es = '../Datasets/pan20-author-profiling-test-2020-02-23/es.txt'

# --- Output locations ---
# Intermediate JSON files (one folder per language) and the final CSV tables
path_to_save_en = '../Datasets/JSON/en/'
path_to_save_es = '../Datasets/JSON/es/'
path_to_save = '../Datasets/CSV/'

### Functions

In [3]:
def get_list_names(path):
    """Return the extension-less names of the data files found in `path`.

    Only regular files directly inside `path` are considered. Hidden files
    (names starting with '.', e.g. macOS '.DS_Store') and '.txt' files are
    skipped.

    Parameters
    ----------
    path : str
        Directory to scan.

    Returns
    -------
    list of str
        File names without their final extension, in sorted order.

    Raises
    ------
    OSError
        If `path` cannot be listed (propagated from `os.listdir`).
    """
    file_names = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting list (and anything built from it) reproducible across runs.
    for filename in sorted(os.listdir(path)):
        # Hidden files would otherwise yield an empty or bogus name
        if filename.startswith('.'):
            continue

        if not os.path.isfile(os.path.join(path, filename)):
            continue

        # splitext strips only the last extension, so 'a.b.xml' -> 'a.b'
        stem, extension = os.path.splitext(filename)
        if extension == '.txt':
            continue

        file_names.append(stem)

    return file_names

In [4]:
# Gather the file names (without extension) found in each corpus folder.

# Training split: English, then Spanish
file_names_training_en, file_names_training_es = (
    get_list_names(folder) for folder in (path_train_en, path_train_es)
)

# Test split: English, then Spanish
file_names_test_en, file_names_test_es = (
    get_list_names(folder) for folder in (path_test_en, path_test_es)
)

In [5]:
# Report how many files were found per split and language (one line each)
print(
    'Total data',
    f'Training --> (EN): {len(file_names_training_en)}, (ES): {len(file_names_training_es)}',
    f'Test --> (EN): {len(file_names_test_en)}, (ES): {len(file_names_test_es)}',
    sep='\n',
)

Total data
Training --> (EN): 300, (ES): 300
Test --> (EN): 200, (ES): 200


In [6]:
def check_path(file_name):
    """Return `file_name` unchanged.

    Earlier revisions branched on whether the name starts with a digit, but
    both branches returned the same value, so the check was a no-op that
    only added an IndexError on empty input. The function is kept as a
    pass-through so existing callers keep working.

    Parameters
    ----------
    file_name : str
        File name (without extension).

    Returns
    -------
    str
        The same `file_name`.
    """
    return file_name

In [7]:
def xml_to_json(path, file_names, path_to_save):
    """Convert `<name>.xml` files in `path` to `<name>.json` files in `path_to_save`.

    Each XML file is parsed with `xmltodict.parse` and the result is written
    as JSON with a two-space indent.

    Parameters
    ----------
    path : str
        Directory containing the XML files.
    file_names : iterable of str
        File names without extension.
    path_to_save : str
        Directory for the JSON files; created if it does not exist.

    Raises
    ------
    OSError
        If an XML file cannot be read or a JSON file cannot be written.
    """
    # Make sure the destination exists so the first write does not fail
    if path_to_save:
        os.makedirs(path_to_save, exist_ok=True)

    for file_name in file_names:
        file_name = check_path(file_name)

        # Read the XML file and parse it
        source_path = os.path.join(path, f'{file_name}.xml')
        with open(source_path, encoding='utf-8') as xml_file:
            parsed = xmltodict.parse(xml_file.read())

        # Serialize the parsed content as JSON text
        json_text = json.dumps(parsed, indent=2)

        # Write the JSON file with an explicit encoding instead of the
        # platform default
        target_path = os.path.join(path_to_save, f'{file_name}.json')
        with open(target_path, 'w', encoding='utf-8') as json_file:
            json_file.write(json_text)

In [8]:
# Convert every split/language: (XML folder, file names, JSON output folder)
for xml_dir, names, json_dir in (
    (path_train_en, file_names_training_en, f'{path_to_save_en}train/'),
    (path_train_es, file_names_training_es, f'{path_to_save_es}train/'),
    (path_test_en, file_names_test_en, f'{path_to_save_en}test/'),
    (path_test_es, file_names_test_es, f'{path_to_save_es}test/'),
):
    xml_to_json(xml_dir, names, json_dir)

In [9]:
def get_labels(labels_path, file_name):
    """Look up the label recorded for `file_name` in a ':::'-separated file.

    The file is read as two columns, `id` and `label`, one record per line.

    Parameters
    ----------
    labels_path : str
        Path of the labels file.
    file_name : str
        Identifier to look up in the `id` column.

    Returns
    -------
    The value of the `label` column for the first matching row.

    Raises
    ------
    IndexError
        If no row has `id == file_name`.
    """
    # Read ids as strings: if every id were numeric, pandas would infer an
    # integer column and the comparison with the string file name would
    # never match.
    df_labels = pd.read_csv(labels_path, sep=':::', names=['id', 'label'],
                            engine='python', dtype={'id': str})

    labels = df_labels.loc[df_labels['id'] == file_name, 'label']

    if labels.empty:
        # Same exception type as the bare `values[0]` lookup used to raise,
        # but with a message that says what is missing
        raise IndexError(f'No label found for id {file_name!r} in {labels_path}')

    return labels.values[0]

In [13]:
def json_to_csv(path, labels_path, file_names):
    """Build one table with a row per document from per-author JSON files.

    For each name in `file_names`, `<path>/<name>.json` is loaded, the
    entries under ['author']['documents']['document'] are taken as the
    documents, and the label is looked up with `get_labels`.

    Parameters
    ----------
    path : str
        Directory containing the JSON files.
    labels_path : str
        Path of the labels file passed to `get_labels`.
    file_names : iterable of str
        File names without extension.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'tweet' and 'label', with a fresh 0..n-1 index.
        Empty (same columns) when `file_names` is empty.
    """
    columns = ['id', 'tweet', 'label']
    frames = []

    for file_name in file_names:
        # Normalize the file name
        file_name = check_path(file_name)

        # Load the JSON file that corresponds to this name
        with open(os.path.join(path, f'{file_name}.json'), encoding='utf-8') as f:
            json_file = json.load(f)

        documents = json_file['author']['documents']['document']
        # A lone document is not stored as a list; wrap it so it counts as
        # one row instead of being measured by its own length
        if not isinstance(documents, list):
            documents = [documents]
        file_len = len(documents)

        # Label shared by every document of this file
        label = get_labels(labels_path, file_name)

        frames.append(pd.DataFrame({'id': [file_name] * file_len,
                                    'tweet': documents,
                                    'label': [label] * file_len},
                                   columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once: DataFrame.append no longer exists in pandas 2.x and
    # growing a frame inside the loop copies it on every iteration
    return pd.concat(frames, ignore_index=True)

In [14]:
# Parallel lists: position k in each list describes the same split/language
# (JSON folder, labels file, file names, name of the CSV to write).
json_paths = [
    f'{path_to_save_en}train/',
    f'{path_to_save_es}train/',
    f'{path_to_save_en}test/',
    f'{path_to_save_es}test/',
]
labels = [
    path_train_labels_en,
    path_train_labels_es,
    path_test_labels_en,
    path_test_labels_es,
]
filenames = [
    file_names_training_en,
    file_names_training_es,
    file_names_test_en,
    file_names_test_es,
]
json_path_names = [
    'Train-EN',
    'Train-ES',
    'Test-EN',
    'Test-ES',
]

In [15]:
# Build and write one CSV per split/language, reporting its row count
for json_dir, labels_file, names, csv_name in zip(json_paths, labels, filenames, json_path_names):
    data = json_to_csv(json_dir, labels_file, names)
    data.to_csv(f'{path_to_save}{csv_name}.csv', encoding='utf-8')
    print(f'Total CSV data: {data.shape[0]}')

Total CSV data: 30000
Total CSV data: 30000
Total CSV data: 20000
Total CSV data: 20000


## Cleaning data

### Paths

In [3]:
# Folder holding the CSV tables
main_path = '../Datasets/CSV/'

# Test split
data_test_en_path = f'{main_path}Test-EN.csv'
data_test_es_path = f'{main_path}Test-ES.csv'

# Training split
data_training_en_path = f'{main_path}Train-EN.csv'
data_training_es_path = f'{main_path}Train-ES.csv'

### Data

In [4]:
# Load the four CSV tables into DataFrames.

# Training split
data_training_en = pd.read_csv(data_training_en_path)
data_training_es = pd.read_csv(data_training_es_path)

# Test split
data_test_en = pd.read_csv(data_test_en_path)
data_test_es = pd.read_csv(data_test_es_path)

### Preprocess data v3.0

In [7]:
# One preprocessor per language, built in the order English, Spanish
prep_en, prep_es = (
    tpv3.Preprocessing(language=language) for language in ('english', 'spanish')
)

#### English

In [8]:
# English training data: preprocess the 'tweet' column with lemmatize
# enabled and remove_stop_words disabled.
# NOTE: the result is bound to the same name as the input, so re-running
# this cell passes the previous result back in.
data_training_en = prep_en.main_preprocess(
    data=data_training_en,
    is_dataframe=True,
    column='tweet',
    tweet=True,
    lemmatize=True,
    remove_stop_words=False,
    emoji_path=None,
)

In [None]:
# English test data: preprocess the 'tweet' column with lemmatize enabled
# and remove_stop_words disabled.
# NOTE: the result is bound to the same name as the input, so re-running
# this cell passes the previous result back in.
data_test_en = prep_en.main_preprocess(
    data=data_test_en,
    is_dataframe=True,
    column='tweet',
    tweet=True,
    lemmatize=True,
    remove_stop_words=False,
    emoji_path=None,
)

#### Spanish

In [None]:
# Spanish training data: preprocess the 'tweet' column with lemmatize
# enabled and remove_stop_words disabled.
# NOTE: the result is bound to the same name as the input, so re-running
# this cell passes the previous result back in.
data_training_es = prep_es.main_preprocess(
    data=data_training_es,
    is_dataframe=True,
    column='tweet',
    tweet=True,
    lemmatize=True,
    remove_stop_words=False,
    emoji_path=None,
)

In [None]:
# Spanish test data: preprocess the 'tweet' column with lemmatize enabled
# and remove_stop_words disabled.
# NOTE: the result is bound to the same name as the input, so re-running
# this cell passes the previous result back in.
data_test_es = prep_es.main_preprocess(
    data=data_test_es,
    is_dataframe=True,
    column='tweet',
    tweet=True,
    lemmatize=True,
    remove_stop_words=False,
    emoji_path=None,
)

### Save data

In [None]:
# Parallel lists: each DataFrame and the file name (without extension)
# it will be saved under.
data = [
    data_training_en,
    data_test_en,
    data_training_es,
    data_test_es,
]
data_names = [
    'data_training_en_lemma',
    'data_test_en_lemma',
    'data_training_es_lemma',
    'data_test_es_lemma',
]

In [None]:
# Write each DataFrame to the Clean/ subfolder under its paired name
for frame, frame_name in zip(data, data_names):
    frame.to_csv(f'{main_path}Clean/{frame_name}.csv')