<H1 style="color:#961296; text-align:center;">Multilingual translation tool for SimpleSite</H1>
<div style="color:#962412;">
<H3>Purpose</H3>
The aim of this tool is to facilitate continuous updates to the language files for SimpleSite's website creation platform.

<H3>Challenge</H3>
Currently, update files from language supporters containing new keys or modified values are in XML format, while the target language files are in JSON format. That makes it impossible to use merge tools to apply language changes, so the procedure has to be handled manually.

<H3>Proposed solution</H3> 
<H5>Number 1  ~> XML to JSON conversion</H5>
One way is to convert the provided XML file to the exactly corresponding JSON format, thus allowing one to compare new and old values in a merge tool of one's choice.
<img src="http://i.imgur.com/vpv0KFZ.png">
<H5>Number 2 ~> Automated merge</H5>
The ideal way would be to attempt an automated merge of the changes into the language files.

</div>

<H3 style="color:#961296;">XML to JSON conversion</H3> 
<div style="color:#962412;">
<h5>Description</h5>
This procedure converts the XML files provided in the 'XML_files' directory to the desired JSON format and saves each dataset as a text file in the 'Text_files' directory.
<h5>Remarks</h5>
Currently, the files are saved with LF newlines and a white space at the end of most lines, whereas the language files that the changes will be merged into have CRLF newlines.
</div>

In [None]:
import json
from xml.dom import minidom
from pprint import pprint
from collections import OrderedDict
from xml.parsers import expat
import codecs

# Language codes of the files to be updated
langCodes = ['de', 'it', 'fr', 'nl', 'nb', 'sv', 'da', 'ru']

# test file with a few sample items
# NOTE: this assignment overrides the full list above; remove it to process every language
langCodes = ['it']

# there are 4 sets of translation files; the first key of a unit id selects the set
transtationFolders = ['base', 'home', 'login', 'signup']


def _set_nested(tree, keys, value):
    """Store `value` in the nested dictionary `tree` under the path `keys`.

    Intermediate dictionaries are created when missing. The last key receives
    `value`, replacing whatever was stored there before.

    Raises:
        TypeError: if an intermediate key already holds a non-dictionary value
            (e.g. a translated string), so the path cannot be descended.
    """
    node = tree
    for key in keys[:-1]:
        child = node.setdefault(key, {})
        if not isinstance(child, dict):
            raise TypeError(
                "Cannot set '%s': key '%s' already holds a value, not a nested set"
                % ('.'.join(keys), key))
        node = child
    node[keys[-1]] = value


# Iterate through selected languages
for lc in langCodes:

    # Parse XML file with current language to minidom format
    xmldoc = minidom.parse('XML_files/translation_' + lc + '.xml')

    # Extract all items by their name 'unit'
    itemlist = xmldoc.getElementsByTagName('unit')

    # Read every unit once (instead of once per translation set).
    # Units are visited in reverse document order, so when an id occurs more
    # than once the occurrence that comes first in the XML file wins.
    updates = []
    for item in reversed(itemlist):
        sourceNode = item.getElementsByTagName('source').item(0)
        if sourceNode is None:
            # unit without a <source> element carries no value to merge
            continue
        # an empty <source/> has no child node; treat it as an empty string
        textNode = sourceNode.firstChild
        sourceVal = textNode.nodeValue if textNode is not None else ''
        # item's id is a set of nested keys f.ex. "HOME.DESIGN_EDITOR.BACKGROUND"
        keys = item.attributes['id'].value.split('.')
        updates.append((keys, sourceVal))

    # iterate through all translation sets
    for tf in transtationFolders:

        # open current translation file to be updated; read it with the same
        # encoding it is written with so non-ASCII characters survive the round trip
        with codecs.open('original_json_files/' + tf + '/' + lc + '.json', 'r', encoding="UTF-8") as json_data:
            translations = json.load(json_data)

        # merge every unit whose first key names the current translation set
        for keys, sourceVal in updates:
            if keys[0].lower() == tf:
                _set_nested(translations, keys, sourceVal)

        # save each language data to a file encoded in UTF-8
        with codecs.open('updated_json_files/' + tf + '/updated_' + lc + '.json', 'w', encoding="UTF-8") as f:
            # sort each nested dictionary alphabetically and apply a new line with 2 space indent
            json.dump(translations, f, indent=2, sort_keys=True, ensure_ascii=False)


<H3 style="color:#961296;">Spreadsheet to JSON conversion</H3> 
<img src="https://i.imgur.com/dkHI1F9.png">

In [2]:
import json
from xml.dom import minidom
from pprint import pprint
from collections import OrderedDict
from xml.parsers import expat
import codecs

lang_files_path = 'C:/123/frontendprototype/app/src/translations/'
items_delimiter = '\n'
keys_delimiter = '\t'
encoding_key = 'UTF-8'


def _set_nested(tree, keys, value):
    """Store `value` in the nested dictionary `tree` under the path `keys`.

    Intermediate dictionaries are created when missing. The last key receives
    `value`, replacing whatever was stored there before.

    Raises:
        TypeError: if an intermediate key already holds a non-dictionary value
            (e.g. a translated string), so the path cannot be descended.
    """
    node = tree
    for key in keys[:-1]:
        child = node.setdefault(key, {})
        if not isinstance(child, dict):
            raise TypeError(
                "Cannot set '%s': key '%s' already holds a value, not a nested set"
                % ('.'.join(keys), key))
        node = child
    node[keys[-1]] = value


# read the tab-separated export; the context manager closes the file handle
with open('TXT_Files/all.txt', 'r', encoding=encoding_key) as file:
    items = file.read().split(items_delimiter)

# take language keys from the first row and remove first index of that row
langs = items[0].split(keys_delimiter)
langs.pop(0)

# Split every data row once. The header row is skipped, and so are blank
# lines (a trailing newline in the file yields an empty last item).
rows = []
for item in items[1:]:
    if not item.strip():
        continue
    item_elements = item.split(keys_delimiter)
    # Get keys f.ex. "HOME.DESIGN_EDITOR.BACKGROUND" and split by the '.' to individual keys
    rows.append((item_elements[0].split('.'), item_elements))

# loop through languages; lang_idx + 1 is the column holding that language's values
for lang_idx, lang in enumerate(langs):

    lang = lang.strip()
    if not lang:
        # empty header cell (e.g. trailing tab): there is no language file to update
        continue

    # loop through translation sets
    for trans_set in ['base', 'home']:

        # open current translation file to be updated
        with open(lang_files_path + trans_set + '/' + lang + '.json', encoding=encoding_key) as json_data:
            translations = json.load(json_data)

        # merge every row whose first key names the current translation set
        for keys, item_elements in rows:
            if keys[0].lower() != trans_set:
                continue
            if len(item_elements) <= lang_idx + 1:
                # row has no column for this language; keep the existing value
                continue
            _set_nested(translations, keys, item_elements[lang_idx + 1])

        # save each language data to file
        with codecs.open(lang_files_path + trans_set + '/' + lang + '.json', 'w', encoding=encoding_key) as f:
            # sort each nested dictionary alphabetically and apply a new line with 2 space indent
            json.dump(translations, f, indent=2, sort_keys=True, ensure_ascii=False)
