In [1]:
import pandas as pd
from pathlib import Path
import html

#### Data pre-processing

In [2]:
'''Dataset Construction'''
# Parent directories:
#   'original' stores the original chapters, translated by machine or by different translators;
#   'cleaned'  stores the files produced by the data pre-processing step.
ORIGINAL_DIR = Path('original')
CLEANED_DIR = Path('cleaned')

# Children directories for each parent dir (one per English version of the book Solaris).
# Google Translate, from the original (Polish) version
PO_GOO_DIR = Path('po-goo')
# DeepL API, from the original (Polish) version
PO_DL_DIR = Path('po-dl')
# Google Translate, from the translated French version
FR_GOO_DIR = Path('fr-goo')
# DeepL API, from the translated French version
FR_DL_DIR = Path('fr-dl')
# First translated version to English
EN1_DIR = Path('fr-h')
# Second translated version to English
EN2_DIR = Path('po-h')

# List to access children directories
DIRS = [
    PO_GOO_DIR,
    PO_DL_DIR,
    FR_GOO_DIR,
    FR_DL_DIR,
    EN1_DIR,
    EN2_DIR,
]

# Names of all the translations, in the same order as DIRS
BOOKS = [book_dir.name for book_dir in DIRS]

In [3]:
'''Data analysis'''
# Character replacements applied during cleaning. Each mapping was chosen by
# checking the vocabulary of the parallel chapters for confirmation.
TO_REPLACE = {
    'ö': 'o',
    'é': 'e',
    'ë': 'e',
    'ï': 'i',
    'ą': 'a',
    'í': 'i',
    'æ': 'ae',
    'ð': 'ny', # Original - fr-dl[8] 'caðon', changed to 'ny' based on po-en-goo 'Canyon'
    'à': 'a',
    'å': 'a',
    'â': 'a',
    "'": '', # Replace apostrophe with nothing so contractions stay a single token, e.g. don't -> dont
    '’': '', # Typographic apostrophe, removed in the same way
    ' `': '', # NOTE(review): the key is a space followed by a backtick, so only backticks preceded by a space match and that space is removed as well - confirm a bare '`' was not intended
}

In [4]:
# Helper used to find the special characters present in each translated text.
allowed = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 \n"
def get_not_allowed(txt):
    """Return the set of characters of ``txt`` that do not occur in ``allowed``.

    ``allowed`` holds the ASCII letters, the digits, the space and the newline,
    so everything else (punctuation, accented letters, control characters, ...)
    ends up in the result. An empty ``txt`` gives an empty set.
    """
    return {char for char in txt if char not in allowed}

# Collect every character outside `allowed` that appears in any original chapter.
others = set()
for book_dir in DIRS:
    for chapter_path in (ORIGINAL_DIR / book_dir).iterdir(): # Checked by each directory containing target data
        # iterdir() also yields sub-directories (e.g. Jupyter's .ipynb_checkpoints);
        # open() would raise on those, so only regular files are scanned.
        if not chapter_path.is_file():
            continue
        # NOTE(review): no explicit encoding is passed, so open() decodes with the
        # platform default - confirm the encoding of the source files and pin it.
        with open(chapter_path) as f:
            txt = f.read()
        # Change the html punctuation format (character entities) to the normal one
        txt = html.unescape(txt)
        others |= get_not_allowed(txt)
print(others)


{'ą', '—', 'ð', '»', 'ï', ':', ';', 'ö', '?', '"', 'å', '\x0c', 'ë', '³', 'é', 'à', '_', '’', '^', '…', 'â', '(', ')', 'í', '‘', '•', '–', '.', '-', '*', '`', '!', "'", '”', '“', 'æ', ','}


In [5]:
# Proper nouns to exclude from the cleaned text, selected manually from the source.
# They are written in lowercase because they are matched after the text is lowercased.
PROPER_NOUNS = ['harey','snaut','sartorius','kelvin','gibarian','kris']

In [6]:
'''Data Cleaning'''

def _clean_text(txt, replacements, stripped_chars, proper_nouns):
    """Return the cleaned version of one chapter's text.

    Steps, in order:
      1. convert HTML character entities to plain characters;
      2. apply every ``old -> new`` substitution in ``replacements``;
      3. replace each character in ``stripped_chars`` with a single space;
      4. lowercase the text;
      5. replace each entry of ``proper_nouns`` with a single space.

    Args:
        txt: raw chapter text.
        replacements: mapping of substring -> replacement string.
        stripped_chars: iterable of characters to turn into spaces.
        proper_nouns: iterable of lowercase names to blank out.

    Returns:
        The cleaned text as a string.
    """
    # Change the html punctuation format to the normal one
    txt = html.unescape(txt)

    # Replace special characters
    for old, new in replacements.items():
        txt = txt.replace(old, new)

    # Replace the rest of the special characters with a space (not an empty
    # string), so the words on either side stay separated
    for char in stripped_chars:
        txt = txt.replace(char, ' ')

    # Lowercase everything, in case it counts "I" and 'i' as different words
    txt = txt.lower()

    # Exclude proper nouns
    # NOTE(review): this is a plain substring replacement, so a name is also
    # removed inside a longer token (e.g. a possessive whose apostrophe was
    # dropped leaves a stray "s") - confirm that is acceptable.
    for noun in proper_nouns:
        txt = txt.replace(noun, ' ')

    return txt


for book_dir in DIRS:
    cleaned_book_dir = CLEANED_DIR / book_dir
    for chapter_path in (ORIGINAL_DIR / book_dir).iterdir():
        # iterdir() also yields sub-directories (e.g. Jupyter's .ipynb_checkpoints);
        # open() would raise on those, so only regular files are cleaned.
        if not chapter_path.is_file():
            continue

        # NOTE(review): no explicit encoding is passed, so open() decodes with the
        # platform default - confirm the encoding of the source files and pin it.
        with open(chapter_path) as f:
            txt = f.read()

        # `others` is the set of special characters collected by the scan above
        txt = _clean_text(txt, TO_REPLACE, others, PROPER_NOUNS)

        # Create the output directory on first use; a no-op when it already exists
        cleaned_book_dir.mkdir(parents=True, exist_ok=True)

        # Write to the cleaned directory, keeping the original file name
        with open(cleaned_book_dir / chapter_path.name, 'w') as f:
            f.write(txt)
            