In [4]:
import os
import sys
import requests
import csv
import json


In [51]:
# Build the list of translated example words (`trans`).
#
# If translated.csv already exists it is used as a cache; otherwise every
# example word of every affix is sent to the Yandex Translate API for each
# supported language and the results are written to translated.csv.

# Read Yandex API key
# strip() removes the trailing newline most editors add; left in place it
# would become part of the query string.
with open('.apikey_yandex', 'r') as fh:
    apikey = fh.read().strip()
urlbase = 'https://translate.yandex.net/api/v1.5/tr.json/translate?key={}'.format(apikey)

# Yandex API query string parameters:
# ? [key=<API key>]
# & [text=<text to translate>]
# & [lang=<translation direction>]
# & [format=<text format>]
# & [options=<translation options>]
# & [callback=<callback function name>]

# Read list of affixes; sources:
# https://examples.yourdictionary.com/reference/examples/prefix-examples.html (prefixes)
# https://examples.yourdictionary.com/list-of-suffixes-and-suffix-examples.html (suffixes)
affixes = []
with open('affixes.csv', 'r', newline='') as fh:
    for row in csv.reader(fh):
        # Columns: type, affix, definition, comma-separated examples.
        # Blank or short lines are skipped instead of raising IndexError.
        if len(row) < 4:
            continue
        affixes.append({'type': row[0], 'affix': row[1], 'definition': row[2], 'examples': row[3]})

# List of languages we're interested in
languages = {
    'Basque': 'Basque_Euskara',
    'Danish': 'Danish_Dansk',
    'Dutch': 'Dutch_Nederlands',
    'Catalan': 'Catalan_Catala',
    'English': 'English',
    'Estonian': 'Estonian_Eesti',
    'Finnish': 'Finnish_Suomi',
    'French': 'French_Francais',
    'German': 'German_Deutsch',
    'Hungarian': 'Hungarian_Magyar',
    'Icelandic': 'Icelandic_Yslenska',
    'Irish': 'IrishGaelic_Gaeilge',
    'Italian': 'Italian_Italiano',
    'Latin': 'Latin_Latina',
    'Latvian': 'Latvian',
    'Luxembourg': 'Luxembourgish_Letzebuergeusch',
    'Norwegian': 'Norwegian_Norsk-Bokmal',
    'Portuguese': 'Portuguese_Portugues',
    'Romani': 'Romani',
    'Romansch': 'Rhaeto-Romance_Rumantsch',
    'Scottish': 'ScottishGaelic_GaidhligAlbanach',
    'Spanish': 'Spanish_Espanol',
    'Swedish': 'Swedish_Svenska',
    'Welsh': 'Welsh_Cymraeg'
}

# Read list of Yandex supported languages;
# source: https://yandex.com/dev/translate/doc/dg/concepts/api-overview.html
yandex_langs = {}
with open('yandex_langs.csv', 'r', newline='') as fh:
    for row in csv.reader(fh):
        # Each row will contain a language supported by Yandex and the language code, e.g.:
        # Azerbaijani,az
        if len(row) < 2:
            continue
        yandex_langs[row[0].strip()] = row[1].strip()

# Initialize list to store results
trans = []

# See if the translation file already exists on disk
if os.path.exists('translated.csv'):

    # File exists, so load from disk instead of querying Yandex server
    print('loading translated words from disk')
    with open('translated.csv', 'r', newline='', encoding='latin-1') as fh:
        rdr = csv.reader(fh)
        header = next(rdr, None)    # None when the file is empty
        if header is not None:
            for row in rdr:
                # Skip blank lines and header lines repeated inside the data
                # (the file is also appended to elsewhere in this notebook).
                if not row or row == header:
                    continue
                trans.append(dict(zip(header, row)))
    ct = len(trans)
    print('loaded ' + str(ct) + ' words from disk')

else:

    # File doesn't exist on disk, so load from Yandex server
    # Iterate through list of affixes
    ct = 0
    for affix in affixes:
        print(affix['affix'])
        # The examples column is a comma-separated list; drop the padding
        # around each word and ignore empty entries.
        words = [w.strip() for w in affix['examples'].split(',') if w.strip()]
        for lang in languages:
            if lang == 'English':
                continue    # don't translate English into English!
            if lang not in yandex_langs:
                print('\tlanguage not supported by Yandex:', lang)
                continue
            print('\t' + lang)
            for word in words:
                print('\t\t' + word)
                # Let requests build the query string so the word is
                # URL-encoded; these parameters are appended to the
                # key=... already present in urlbase.
                params = {'lang': 'en-' + yandex_langs[lang], 'text': word}
                try:
                    r = requests.get(urlbase, params=params, timeout=30)
                except requests.RequestException as ex:
                    # Only the exception type is printed: the message can
                    # contain the full URL, including the API key.
                    print('err: request failed (' + type(ex).__name__ + ')')
                    continue
                if r.status_code != 200:
                    print('err: ' + r.text)
                    continue
                print(r.text)
                try:
                    j = json.loads(r.text)
                except ValueError as ex:
                    print('err converting to json: ' + r.text + '; ' + str(ex))
                    continue
                texts = j.get('text') if isinstance(j, dict) else None
                if not texts:
                    print('no translation')
                    continue
                row = {
                    'type': affix['type'],
                    'affix': affix['affix'],
                    'lang': lang,
                    'word': word,
                    'trans': texts[0].strip()
                }
                print(row)
                trans.append(row)
                ct += 1
    print('translations made:', ct)

    # Save to csv
    # The file is written as latin-1 because the load branch above reads it
    # back as latin-1. Rows containing characters outside latin-1 cannot be
    # stored and are counted in errct.
    errct = 0
    if trans:    # nothing to cache if no translation succeeded
        with open('translated.csv', 'w', newline='', encoding='latin-1') as fh:
            dw = csv.DictWriter(fh, trans[0].keys())
            dw.writeheader()
            for e in trans:
                try:
                    dw.writerow(e)
                except UnicodeEncodeError:
                    errct += 1
    print(errct, 'errors writing to csv')


loading translated words from disk
loaded 4672 words from disk


In [5]:
# Translations of the "-ism" example words, one JSON object per line.
s = """{"type": "suffix", "affix": "ism", "lang": "Basque", "word": "criticism", "trans": "kritika"}
{"type": "suffix", "affix": "ism", "lang": "Basque", "word": "humanism", "trans": "humanismoa"}
{"type": "suffix", "affix": "ism", "lang": "Basque", "word": "patriotism", "trans": "patriotismoa"}
{"type": "suffix", "affix": "ism", "lang": "Basque", "word": "professionalism", "trans": "profesionaltasuna"}
{"type": "suffix", "affix": "ism", "lang": "Catalan", "word": "criticism", "trans": "crítiques"}
{"type": "suffix", "affix": "ism", "lang": "Catalan", "word": "humanism", "trans": "humanisme"}
{"type": "suffix", "affix": "ism", "lang": "Catalan", "word": "patriotism", "trans": "patriotisme"}
{"type": "suffix", "affix": "ism", "lang": "Catalan", "word": "professionalism", "trans": "professionalitat"}
{"type": "suffix", "affix": "ism", "lang": "Danish", "word": "criticism", "trans": "kritik"}
{"type": "suffix", "affix": "ism", "lang": "Danish", "word": "humanism", "trans": "humanisme"}
{"type": "suffix", "affix": "ism", "lang": "Danish", "word": "patriotism", "trans": "patriotisme"}
{"type": "suffix", "affix": "ism", "lang": "Danish", "word": "professionalism", "trans": "professionalisme"}
{"type": "suffix", "affix": "ism", "lang": "Dutch", "word": "criticism", "trans": "kritiek"}
{"type": "suffix", "affix": "ism", "lang": "Dutch", "word": "humanism", "trans": "humanisme"}
{"type": "suffix", "affix": "ism", "lang": "Dutch", "word": "patriotism", "trans": "vaderlandsliefde"}
{"type": "suffix", "affix": "ism", "lang": "Dutch", "word": "professionalism", "trans": "professionaliteit"}
{"type": "suffix", "affix": "ism", "lang": "Estonian", "word": "criticism", "trans": "kriitika"}
{"type": "suffix", "affix": "ism", "lang": "Estonian", "word": "humanism", "trans": "humanism"}
{"type": "suffix", "affix": "ism", "lang": "Estonian", "word": "patriotism", "trans": "patriotism"}
{"type": "suffix", "affix": "ism", "lang": "Estonian", "word": "professionalism", "trans": "professionaalsus"}
{"type": "suffix", "affix": "ism", "lang": "Finnish", "word": "criticism", "trans": "kritiikki"}
{"type": "suffix", "affix": "ism", "lang": "Finnish", "word": "humanism", "trans": "humanismi"}
{"type": "suffix", "affix": "ism", "lang": "Finnish", "word": "patriotism", "trans": "isänmaallisuus"}
{"type": "suffix", "affix": "ism", "lang": "Finnish", "word": "professionalism", "trans": "ammattitaito"}
{"type": "suffix", "affix": "ism", "lang": "French", "word": "criticism", "trans": "critique"}
{"type": "suffix", "affix": "ism", "lang": "French", "word": "humanism", "trans": "humanisme"}
{"type": "suffix", "affix": "ism", "lang": "French", "word": "patriotism", "trans": "patriotisme"}
{"type": "suffix", "affix": "ism", "lang": "French", "word": "professionalism", "trans": "professionnalisme"}
{"type": "suffix", "affix": "ism", "lang": "German", "word": "criticism", "trans": "Kritik"}
{"type": "suffix", "affix": "ism", "lang": "German", "word": "humanism", "trans": "Humanismus"}
{"type": "suffix", "affix": "ism", "lang": "German", "word": "patriotism", "trans": "Patriotismus"}
{"type": "suffix", "affix": "ism", "lang": "German", "word": "professionalism", "trans": "Professionalität"}
{"type": "suffix", "affix": "ism", "lang": "Hungarian", "word": "criticism", "trans": "kritika"}
{"type": "suffix", "affix": "ism", "lang": "Hungarian", "word": "humanism", "trans": "humanizmus"}
{"type": "suffix", "affix": "ism", "lang": "Hungarian", "word": "patriotism", "trans": "hazafiság"}
{"type": "suffix", "affix": "ism", "lang": "Hungarian", "word": "professionalism", "trans": "professzionalizmus"}
{"type": "suffix", "affix": "ism", "lang": "Icelandic", "word": "criticism", "trans": "gagnrýni"}
{"type": "suffix", "affix": "ism", "lang": "Icelandic", "word": "humanism", "trans": "húmanismi"}
{"type": "suffix", "affix": "ism", "lang": "Icelandic", "word": "patriotism", "trans": "föðurlandsást"}
{"type": "suffix", "affix": "ism", "lang": "Icelandic", "word": "professionalism", "trans": "fagmennska"}
{"type": "suffix", "affix": "ism", "lang": "Irish", "word": "criticism", "trans": "cáineadh"}
{"type": "suffix", "affix": "ism", "lang": "Irish", "word": "humanism", "trans": "daonnachas"}
{"type": "suffix", "affix": "ism", "lang": "Irish", "word": "patriotism", "trans": "tírghrá"}
{"type": "suffix", "affix": "ism", "lang": "Irish", "word": "professionalism", "trans": "gairmiúlacht"}
{"type": "suffix", "affix": "ism", "lang": "Italian", "word": "criticism", "trans": "critica"}
{"type": "suffix", "affix": "ism", "lang": "Italian", "word": "humanism", "trans": "umanesimo"}
{"type": "suffix", "affix": "ism", "lang": "Italian", "word": "patriotism", "trans": "patriottismo"}
{"type": "suffix", "affix": "ism", "lang": "Italian", "word": "professionalism", "trans": "professionalita"}
{"type": "suffix", "affix": "ism", "lang": "Latin", "word": "criticism", "trans": "reprehensione"}
{"type": "suffix", "affix": "ism", "lang": "Latin", "word": "humanism", "trans": "humanitatis"}
{"type": "suffix", "affix": "ism", "lang": "Latin", "word": "patriotism", "trans": "patriae"}
{"type": "suffix", "affix": "ism", "lang": "Latin", "word": "professionalism", "trans": "phasellus"}
{"type": "suffix", "affix": "ism", "lang": "Latvian", "word": "criticism", "trans": "kritika"}
{"type": "suffix", "affix": "ism", "lang": "Latvian", "word": "humanism", "trans": "humānisms"}
{"type": "suffix", "affix": "ism", "lang": "Latvian", "word": "patriotism", "trans": "patriotisms"}
{"type": "suffix", "affix": "ism", "lang": "Latvian", "word": "professionalism", "trans": "profesionalitāte"}
{"type": "suffix", "affix": "ism", "lang": "Luxembourg", "word": "criticism", "trans": "Kritik"}
{"type": "suffix", "affix": "ism", "lang": "Luxembourg", "word": "humanism", "trans": "Humanismus"}
{"type": "suffix", "affix": "ism", "lang": "Luxembourg", "word": "patriotism", "trans": "Patriotismus"}
{"type": "suffix", "affix": "ism", "lang": "Luxembourg", "word": "professionalism", "trans": "Professionalität"}
{"type": "suffix", "affix": "ism", "lang": "Norwegian", "word": "criticism", "trans": "kritikk"}
{"type": "suffix", "affix": "ism", "lang": "Norwegian", "word": "humanism", "trans": "humanisme"}
{"type": "suffix", "affix": "ism", "lang": "Norwegian", "word": "patriotism", "trans": "patriotisme"}
{"type": "suffix", "affix": "ism", "lang": "Norwegian", "word": "professionalism", "trans": "profesjonalitet"}
{"type": "suffix", "affix": "ism", "lang": "Portuguese", "word": "criticism", "trans": "critico"}
{"type": "suffix", "affix": "ism", "lang": "Portuguese", "word": "humanism", "trans": "humanismo"}
{"type": "suffix", "affix": "ism", "lang": "Portuguese", "word": "patriotism", "trans": "patriotismo"}
{"type": "suffix", "affix": "ism", "lang": "Portuguese", "word": "professionalism", "trans": "profissionalismo"}
{"type": "suffix", "affix": "ism", "lang": "Scottish", "word": "criticism", "trans": "a bhith a"}
{"type": "suffix", "affix": "ism", "lang": "Scottish", "word": "humanism", "trans": "humanism"}
{"type": "suffix", "affix": "ism", "lang": "Scottish", "word": "patriotism", "trans": "patriotism"}
{"type": "suffix", "affix": "ism", "lang": "Scottish", "word": "professionalism", "trans": "tha proifeiseantachd"}
{"type": "suffix", "affix": "ism", "lang": "Spanish", "word": "criticism", "trans": "crítica"}
{"type": "suffix", "affix": "ism", "lang": "Spanish", "word": "humanism", "trans": "humanismo"}
{"type": "suffix", "affix": "ism", "lang": "Spanish", "word": "patriotism", "trans": "patriotismo"}
{"type": "suffix", "affix": "ism", "lang": "Spanish", "word": "professionalism", "trans": "profesionalidad"}
{"type": "suffix", "affix": "ism", "lang": "Swedish", "word": "criticism", "trans": "kritik"}
{"type": "suffix", "affix": "ism", "lang": "Swedish", "word": "humanism", "trans": "humanism"}
{"type": "suffix", "affix": "ism", "lang": "Swedish", "word": "patriotism", "trans": "patriotism"}
{"type": "suffix", "affix": "ism", "lang": "Swedish", "word": "professionalism", "trans": "professionalism"}
{"type": "suffix", "affix": "ism", "lang": "Welsh", "word": "criticism", "trans": "beirniadaeth"}
{"type": "suffix", "affix": "ism", "lang": "Welsh", "word": "humanism", "trans": "dyneiddiaeth"}
{"type": "suffix", "affix": "ism", "lang": "Welsh", "word": "patriotism", "trans": "gwladgarwch"}
{"type": "suffix", "affix": "ism", "lang": "Welsh", "word": "professionalism", "trans": "proffesiynoldeb"}"""


def parse_translation_lines(text):
    """Parse newline-separated JSON objects into a list of translation rows.

    Each line must be a JSON object with at least 'word' and 'trans' string
    fields; both are stripped of surrounding whitespace. Rows whose
    translation contains a space are dropped.

    Blank lines are ignored. A line that is not valid JSON, or lacks the
    expected fields, is reported on stdout and skipped.

    Returns:
        list of dicts, in input order.
    """
    rows = []
    for line in text.split('\n'):
        if not line.strip():
            continue
        try:
            rec = json.loads(line)
            rec['word'] = rec['word'].strip()
            rec['trans'] = rec['trans'].strip()
        except (ValueError, KeyError, TypeError, AttributeError) as ex:
            # Skip the bad line; carrying on would reuse the previous row
            # (or hit an undefined name on the very first line).
            print(ex, line)
            continue
        if ' ' not in rec['trans']:    # only include single words, not phrases, because this will mess up the tokenizer
            rows.append(rec)
    return rows


def append_rows_to_csv(path, rows, encoding='latin-1'):
    """Append dict rows to a csv file, writing the header only for a new file.

    The column order is taken from the keys of the first row. The header is
    written only when `path` does not exist yet or is empty, so appending to
    an existing file does not plant a second header line in the data.

    A row that cannot be written (a character the encoding cannot represent,
    or a key that is not one of the columns) is reported on stdout and
    skipped.

    Returns:
        int: number of rows that were skipped.
    """
    if not rows:
        return 0
    fieldnames = list(rows[0].keys())
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    failed = 0
    with open(path, 'a', newline='', encoding=encoding) as out:
        writer = csv.DictWriter(out, fieldnames)
        if needs_header:
            writer.writeheader()
        for row in rows:
            try:
                writer.writerow(row)
            except ValueError as ex:
                # UnicodeEncodeError is a ValueError subclass; DictWriter
                # raises plain ValueError for unexpected keys.
                failed += 1
                print('\tskipped (' + type(ex).__name__ + '):', row.get('lang'), row.get('word'))
    return failed


trans2 = parse_translation_lines(s)

# Save to csv
# latin-1 is kept because the cell that loads translated.csv reads it as
# latin-1. It cannot represent every character (e.g. the Latvian "ā"), so
# rows containing such characters are skipped and counted.
errct = append_rows_to_csv('translated.csv', trans2, encoding='latin-1')
print(errct, 'errors writing to csv')


2 errors writing to csv
