# Old translation / tokenizer notebook: data_exploration_text
This notebook explores a more efficient translation method (using ctranslate2 or OpenNMT). Tokenization is done with BERT tokenizers.

In [1]:
import data
import pandas as pd
import numpy as np
import time
import math

In [13]:
# Number of topics per declared language code.
data.topics["language"].value_counts()

en     36161
es     13910
pt      4177
ar      3701
fr      3701
sw      2895
bg      2867
gu      2320
bn      2176
hi      1786
it       866
zh       862
mr       300
fil      247
as       167
my       135
km       121
kn       119
te        93
or        70
ur        66
ta        60
pnb       51
pl        43
tr        40
ru        34
mul        4
Name: language, dtype: int64

In [14]:
# Number of content items per declared language code.
data.contents["language"].value_counts()

en     65939
es     30844
fr     10682
pt     10435
ar      7418
bg      6050
hi      4042
zh      3849
gu      3677
bn      2513
sw      1942
it      1300
mr       999
as       641
fil      516
km       505
kn       501
or       326
pl       319
te       285
ur       245
tr       225
ta       216
my       206
ru       188
pnb      184
Name: language, dtype: int64

In [4]:
# Language code counts over contents and topics taken together.
pd.concat([data.contents["language"], data.topics["language"]]).value_counts()

en     102100
es      44754
pt      14612
fr      14383
ar      11119
bg       8917
gu       5997
hi       5828
zh       4711
bn       4689
sw       4307
it       2166
mr       1299
as        808
fil       763
km        626
kn        620
swa       530
or        396
te        378
pl        362
my        341
ur        311
ta        276
tr        265
pnb       235
ru        222
mul         4
Name: language, dtype: int64

In [33]:
# NOTE(review): `has_langs` is only assigned in cells further down this notebook,
# so this cell fails on a fresh kernel unless one of those cells has been run first.
list(has_langs) + ["en"]

['gu',
 'zh',
 'km',
 'it',
 'ru',
 'sw',
 'mul',
 'ta',
 'tr',
 'mr',
 'my',
 'or',
 'kn',
 'ur',
 'bn',
 'fil',
 'te',
 'pl',
 'pnb',
 'ar',
 'en',
 'bg',
 'hi',
 'as',
 'swa',
 'fr',
 'es',
 'pt',
 'en']

In [11]:
# Compare the languages used by topics with the languages we can translate.
language_counts = data.topics["language"].value_counts()
has_langs = set(language_counts.index)
print(has_langs)
# English needs no translation model, so it always counts as available.
available_langs = set(list_available_langs() + ["en"])
print(available_langs)
# A set is not a valid pandas indexer (deprecated, then an error in pandas 2.x),
# so the set operations are converted to lists before selecting from the counts.
missing_langs = language_counts[list(has_langs.difference(available_langs))].sort_values(ascending = False)
display(missing_langs)
print(missing_langs.sum())
# NOTE: `has_langs` is rebound here from a set of codes to a Series of counts;
# the name is kept so that the cell leaves the same variables behind as before.
has_langs = language_counts[list(has_langs.intersection(available_langs))].sort_values(ascending = False)
display(has_langs)
print(has_langs.sum())

{'hi', 'tr', 'es', 'pnb', 'fil', 'zh', 'it', 'en', 'ta', 'bg', 'my', 'or', 'as', 'kn', 'km', 'bn', 'ur', 'mul', 'mr', 'pl', 'te', 'pt', 'swa', 'gu', 'fr', 'ar', 'sw', 'ru'}
{'hi', 'es', 'ja', 'sv', 'zh', 'it', 'en', 'bg', 'as', 'id', 'de', 'bn', 'hu', 'mr', 'pl', 'nl', 'pt', 'fi', 'gu', 'ar', 'fr', 'sw', 'ru', 'ko'}


  missing_langs = data.topics["language"].value_counts()[has_langs.difference(available_langs)].sort_values(ascending = False)


fil    247
my     135
km     121
kn     119
te      93
or      70
ur      66
ta      60
pnb     51
tr      40
swa     35
mul      4
Name: language, dtype: int64

1041


en    36161
es    13910
pt     4177
ar     3701
fr     3701
bg     2867
sw     2860
gu     2320
bn     2176
hi     1786
it      866
zh      862
mr      300
as      167
pl       43
ru       34
Name: language, dtype: int64

75931


In [12]:
# Languages available in langdetect 1.0.9.
# NOTE(review): this list has no "zh" entry (the detector reports Chinese as
# zh-cn / zh-tw), which is presumably why "zh" shows up in the difference below.
detection_available_langs = {
    "af", "ar", "bg", "bn", "ca", "cs", "cy", "da", "de", "el", "en",
    "es", "et", "fa", "fi", "fr", "gu", "he", "hi", "hr", "hu", "id",
    "it", "ja", "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ne", "nl",
    "no", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv",
    "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi",
}
# Translatable languages that language detection cannot recognise.
undetectable_langs = available_langs.difference(detection_available_langs)
print(undetectable_langs)

{'as', 'zh'}


# Translation

In [2]:
import ctranslate2
import sentencepiece as spm
import os
import config_translation
import langdetect

# Fix langdetect's seed so that repeated detections of the same text give the same answer.
langdetect.DetectorFactory.seed = 0

def get_language_model_folder(lang):
    """Return the folder of the ``lang``-to-English CTranslate2 model.

    The path is assembled by plain string concatenation starting from
    ``config_translation.resources_path`` and always ends with a ``/``.
    """
    models_root = config_translation.resources_path + "ct2_models_from_opus/"
    model_dir_name = lang + "-en/"
    return models_root + model_dir_name

def list_available_langs():
    """List the source languages that have a ``<lang>-en`` model folder.

    Returns:
        A sorted list of language codes, one per entry named ``<lang>-en``
        inside ``<resources_path>ct2_models_from_opus/``.

    The code is obtained by stripping the ``-en`` suffix rather than taking
    the first two characters, so three-letter codes such as ``fil`` are not
    truncated (to ``fi``) and unrelated entries in the directory are ignored.
    """
    models_root = config_translation.resources_path + "ct2_models_from_opus/"
    suffix = "-en"
    return sorted(
        entry[: -len(suffix)]
        for entry in os.listdir(models_root)
        # Require a non-empty language code in front of the suffix.
        if entry.endswith(suffix) and len(entry) > len(suffix)
    )

def load_model(lang):
    """Load the tokenizer and translator for translating ``lang`` into English.

    Returns:
        A ``(tokenizer, translator)`` tuple: a SentencePiece processor loaded
        from the model folder's ``source.spm`` and a ``ctranslate2.Translator``
        opened on the same folder.

    Raises:
        Exception: if the model folder does not contain ``source.spm``.
    """
    model_dir = get_language_model_folder(lang)
    spm_path = model_dir + "source.spm"
    if os.path.isfile(spm_path):
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(spm_path)
        return tokenizer, ctranslate2.Translator(model_dir)
    raise Exception("Language does not exist")
    
def translate_sentences(sentences, translate_info):
    """Translate a batch of sentences with a loaded model.

    Args:
        sentences: list of source-language strings.
        translate_info: the ``(tokenizer, translator)`` pair returned by
            ``load_model``.

    Returns:
        A list with one translated string per input sentence, built from the
        first hypothesis of each translation result.
    """
    sp_processor, ct2_translator = translate_info
    pieces = sp_processor.encode(sentences, out_type=str)
    best_hypotheses = []
    for outcome in ct2_translator.translate_batch(pieces):
        best_hypotheses.append(outcome.hypotheses[0])
    decoded = sp_processor.decode(best_hypotheses)
    # Turn any SentencePiece word-boundary markers left after decoding into spaces.
    return [sentence.replace("▁"," ").strip() for sentence in decoded]

def detect_language(text):
    """Tell whether ``text`` is confidently English.

    Args:
        text: the string to classify.

    Returns:
        True when langdetect assigns English a probability above 0.9,
        False otherwise, including when detection fails for any reason.
    """
    try:
        # Re-seed on every call so that repeated detections agree.
        langdetect.DetectorFactory.seed = 0
        candidates = langdetect.detect_langs(text)
        # Map every candidate language to its own probability (the previous
        # version stored only the first candidate, once per loop iteration).
        probs = {candidate.lang: candidate.prob for candidate in candidates}
        # Note: Chinese is split into zh-tw and zh-cn by the detector. The two
        # are not merged here because only the English probability is used.
        return probs.get("en", 0.0) > 0.9
    except Exception:
        # Best effort: input the detector cannot handle counts as "not English".
        return False

def detect_languages(sentences):
    """Return a list of bools, one per sentence, indicating whether it is English."""
    return list(map(detect_language, sentences))

def obtain_actual_languages(dframe, k):
    """Work out the languages actually used by row ``k`` of ``dframe``.

    Rows declared as English are trusted as-is. For any other declared
    language, a title or description that is a string and is detected as
    English is reported as ``"en"``; otherwise the declared language is kept.

    Returns:
        A ``(description_language, title_language)`` tuple.
    """
    row = dframe.iloc[k]
    title_text = row["title"]
    description_text = row["description"]
    declared = row["language"]
    if declared == "en":
        return declared, declared

    def effective_language(value):
        # Only real strings are sent to the detector; anything else keeps
        # the declared language.
        if type(value) is str and detect_language(value):
            return "en"
        return declared

    title_lang = effective_language(title_text)
    description_lang = effective_language(description_text)
    return description_lang, title_lang

def obtain_language_info(dframe):
    """Compute the actual description/title language for every row of ``dframe``.

    Returns:
        A DataFrame sharing ``dframe``'s index, with the columns
        ``description_lang`` and ``title_lang`` filled row by row from
        ``obtain_actual_languages``.
    """
    n_rows = len(dframe)
    columns = {
        "description_lang": np.zeros(shape = (n_rows), dtype = "object"),
        "title_lang": np.zeros(shape = (n_rows), dtype = "object"),
    }
    for pos in range(n_rows):
        columns["description_lang"][pos], columns["title_lang"][pos] = obtain_actual_languages(dframe, pos)
    return pd.DataFrame(data = columns, index = dframe.index)

## Detect "actual" languages of text

In [3]:
import translate_helpers

# Fold the alternative Swahili code "swa" into "sw" in both tables.
for frame in (data.topics, data.contents):
    frame.loc[frame["language"] == "swa", "language"] = "sw"

# Detect the language actually used by every title / description (slow).
ctime = time.time()
topics_lang = translate_helpers.obtain_language_info(data.topics)
contents_lang = translate_helpers.obtain_language_info(data.contents)
ctime = time.time() - ctime
print("Obtaining language info: ",ctime)

# How many descriptions / titles differ from the declared language.
lang_sources = ((topics_lang, data.topics), (contents_lang, data.contents))
for detected, source in lang_sources:
    for column in ("description_lang", "title_lang"):
        display((detected[column] != source["language"]).sum())

# Layout of the stacked text table:
# [topic descriptions | topic titles | content descriptions | content titles]
n_topics = len(topics_lang)
n_contents = len(contents_lang)
topic_title_start = n_topics
content_description_start = 2 * n_topics
content_title_start = content_description_start + n_contents
total_rows = content_title_start + n_contents

topics_lang["description_idx"] = np.arange(topic_title_start)
topics_lang["title_idx"] = np.arange(topic_title_start, content_description_start)
contents_lang["description_idx"] = np.arange(content_description_start, content_title_start)
contents_lang["title_idx"] = np.arange(content_title_start, total_rows)

# Missing texts have no language at all.
for detected, source in lang_sources:
    detected.loc[source["description"].isnull(), "description_lang"] = "none"
    detected.loc[source["title"].isnull(), "title_lang"] = "none"

topics_lang.to_csv("detected_topics_lang.csv")
contents_lang.to_csv("detected_contents_lang.csv")

# Stack all texts and their languages into two flat object arrays.
mtext = np.empty(shape = (total_rows), dtype = "object")
mlang = np.empty(shape = (total_rows), dtype = "object")

segments = (
    (0, topics_lang["description_lang"], data.topics["description"]),
    (topic_title_start, topics_lang["title_lang"], data.topics["title"]),
    (content_description_start, contents_lang["description_lang"], data.contents["description"]),
    (content_title_start, contents_lang["title_lang"], data.contents["title"]),
)
for start, langs, texts in segments:
    stop = start + len(langs)
    mlang[start:stop] = langs
    mtext[start:stop] = texts

total_text = pd.DataFrame(index = np.arange(total_rows), data = {"lang":mlang, "text":mtext})

total_text.to_csv("total_text_before_translate.csv")

1217.622895002365


In [18]:
# Same coverage check as for the declared languages, but on the detected
# title and description languages of the topics.
total_topic_langs = pd.concat([topics_lang["title_lang"],topics_lang["description_lang"]]).value_counts()
has_langs = set(total_topic_langs.index)
print(has_langs)
# English needs no translation model, so it always counts as available.
available_langs = set(list_available_langs() + ["en"])
print(available_langs)
# A set is not a valid pandas indexer (deprecated, then an error in pandas 2.x),
# so the set operations are converted to lists before selecting from the counts.
missing_langs = total_topic_langs[list(has_langs.difference(available_langs))].sort_values(ascending = False)
display(missing_langs)
print(missing_langs.sum())
# NOTE: `has_langs` is rebound here from a set of codes to a Series of counts;
# the name is kept so that the cell leaves the same variables behind as before.
has_langs = total_topic_langs[list(has_langs.intersection(available_langs))].sort_values(ascending = False)
display(has_langs)
print(has_langs.sum())

{'gu', 'pl', 'ur', 'pt', 'bn', 'mr', 'kn', 'pnb', 'tr', 'mul', 'fr', 'my', 'swa', 'te', 'zh', 'hi', 'bg', 'ar', 'ta', 'en', 'es', 'it', 'or', 'ru', 'km', 'sw', 'fil', 'as'}
{'gu', 'pl', 'bn', 'mr', 'pt', 'fr', 'id', 'hu', 'zh', 'hi', 'ar', 'bg', 'nl', 'ja', 'en', 'de', 'es', 'it', 'ru', 'ko', 'sw', 'sv', 'fi', 'as'}


  missing_langs = total_topic_langs[has_langs.difference(available_langs)].sort_values(ascending = False)


fil    417
my     269
km     241
kn     156
te     113
or      98
tr      79
ur      77
ta      74
pnb     64
swa     58
mul      8
dtype: int64

1654


en    77803
es    27523
pt     8151
fr     7266
ar     6997
bg     5728
gu     4441
bn     4274
hi     3084
sw     2936
zh     1696
it     1580
mr      379
as      281
pl       83
ru       68
dtype: int64

152290


## Test translation here

In [4]:
batch_size = 40
# Upper bound on the number of batches translated per language. 2 keeps this
# cell a quick smoke test; set to None to translate every sentence.
max_batches_per_language = 2
languages = total_text.lang.value_counts().index

langs_to_translate = list(pd.Index(translate_helpers.list_available_langs()).intersection(languages))
# Rows that never get translated (language not in langs_to_translate, or
# beyond the batch cap) keep NaN in the result column.
translated_results = np.empty(shape = len(total_text), dtype = "object")
translated_results[:] = np.nan


ctime = time.time()
for language in langs_to_translate:
    translate_info = translate_helpers.load_model(language)
    lang_idx = total_text.loc[total_text["lang"] == language].index
    length = len(lang_idx)

    n_batches = math.ceil(length / batch_size)
    if max_batches_per_language is not None:
        n_batches = min(n_batches, max_batches_per_language)

    for k in range(n_batches):
        low = k * batch_size
        high = min((k+1) * batch_size, length)
        text_idx = lang_idx[low:high]
        sentences = list(total_text.loc[text_idx, "text"])
        translated_sentences = translate_helpers.translate_sentences(sentences, translate_info)
        translated_results[np.array(text_idx, dtype = np.int64)] = translated_sentences

    # Drop the reference to the model before loading the next language.
    del translate_info
    print("Translated ",language)
total_text["text_translate"] = translated_results
ctime = time.time() - ctime
print("Translation time used: ",ctime)
total_text.to_csv("total_text.csv")

# Split the stacked translations back into their four original segments:
# [topic descriptions | topic titles | content descriptions | content titles]
n_topics = len(topics_lang)
n_contents = len(contents_lang)
topics_lang["description"] = np.array(total_text.loc[np.arange(n_topics), "text_translate"])
topics_lang["title"] = np.array(total_text.loc[np.arange(n_topics, 2 * n_topics), "text_translate"])
contents_lang["description"] = np.array(total_text.loc[np.arange(2 * n_topics, n_contents + 2 * n_topics), "text_translate"])
contents_lang["title"] = np.array(total_text.loc[np.arange(n_contents + 2 * n_topics, 2 * n_contents + 2 * n_topics), "text_translate"])

topics_lang.to_csv("topics_translated.csv")
contents_lang.to_csv("contents_translated.csv")

Translated  ar
Translated  as
Translated  bg
Translated  bn
Translated  es
Translated  fr
Translated  gu
Translated  hi
Translated  it
Translated  mr
Translated  pl
Translated  pt
Translated  ru
Translated  sw
Translated  zh
Translation time used:  61.385201930999756


In [9]:
# Eyeball the first 80 Arabic texts next to their translations.
sub = total_text.loc[total_text["lang"] == "ar"]
txt = sub.iloc[0:80]["text"]
trans = sub.iloc[0:80]["text_translate"]
for original_text, translated_text in zip(txt, trans):
    print("Original :  ", original_text)
    print("Translate:  ", translated_text)

Original :   5acb7211ecf6d9049f561089
Translate:   5acb7211ecf6d9049f561089
Original :   مفهوم الحفرية
Translate:   The concept of excavation
Original :   5a60ac4b3d99e104fb62ce19
Translate:   5a60ac4b3d99e104fb62ce19
Original :   5a4c843e7dd197090857f29c
Translate:   5a4c843e7d197090857f29c
Original :   5ae17d4e6b9064043d25f7ee
Translate:   5ae17d4e6b9064043d25f7ee
Original :   5acb6cff6b9064043d877401
Translate:   5acb6cff6b90640443d877401
Original :   5addd9508b01ea04997272a2
Translate:   5add9508b01ea04997272a2
Original :   اتعلم طرق زراعة الأسطح وابتدي مشروعك أو جمّل سطح بيتك
Translate:   You know how to grow roofs, start your project, or you can get your roof together.
Original :   5a60a2e80ed49f0498cb213c
Translate:   5a60a2e80ed49f0498cb213c
Original :   5acc8bbeecf6d904a0288e7d
Translate:   5acc8bbeecf6d904a0288e7d
Original :   في الدرس ده هنتعرف على الدي إن إيه وإزاي بيتترجم لبروتين وكمان هنتعرف على الـ RNA ودوره في عملية الترجمة
Translate:   In this course, we recognize DNA 

In [17]:
# Heuristic for spotting id-like strings: no spaces, and more than seven
# digits as well as more than seven letters.
mstr = "5af43ee999d5e6049fefa291"

digit_count = sum(map(str.isdigit, mstr))
alpha_count = sum(map(str.isalpha, mstr))
" " not in mstr and digit_count > 7 and alpha_count > 7

True

In [54]:
# Times the translation of the first 10 texts of `lang_idx`.
# NOTE(review): relies on `translate_info` and `lang_idx` already existing in the
# kernel; presumably obtained via translate_helpers.load_model("zh") - confirm.
ctime = time.time()
sample_idx = lang_idx[np.arange(0, 10)]
sample_sentences = list(total_text.loc[sample_idx, "text"])
translate_helpers.translate_sentences(sample_sentences, translate_info)
ctime = time.time() - ctime
print("Translation time used: ",ctime)

Translation time used:  16.97045397758484


In [35]:
# Row labels (as int64) of the batch selected by `low`/`high`; `lang_idx`, `low`
# and `high` are whatever the translation loop cell last left in the kernel.
np.array(lang_idx[np.arange(low, high)], dtype = np.int64)

array([64330, 64410, 64577, 64691, 64777, 64846, 64906, 64931, 65419,
       65429, 65562, 65863, 65865, 66050, 66319], dtype=int64)

In [49]:
# Source texts of the batch selected by `low`/`high`; `lang_idx`, `low` and
# `high` are whatever the translation loop cell last left in the kernel.
list(total_text.loc[lang_idx[np.arange(low, high)], "text"])

['你的动脉、静脉、毛细血管和小静脉可以容下约5升血液.你说这有啥用？它携带了呼吸作用必须的氧气以及全身上下的血脂,荷尔蒙等其他物质.在受到创伤导致失血的情况下,对医生来说区别不同的血型至关重要.我们将了解到血液系统的复杂性.',
 '掌握求平行四边形，三角形，梯形以及简单的组合图形的面积的方法。',
 '理解圆的概念，会求圆的周长及面积',
 '认识生活中的小数与小数的读法，掌握一位小数的含义与写法，掌握一位小数的大小比较，计算一位小数的加、减法',
 '学习加、减的含义.',
 '没有肌肉，我们不能做太多的东西。本教程探索肌肉细胞是什么和它们如何合作以运作我们的身体。',
 '学习含有变量的等式与不等式. 这些教程着重于解方程和了解不等式的解.',
 '用20以内的加减法解决生活中的简单问题',
 '掌握计算长方体和正方体的表面积的方法。',
 '理解并掌握分数除法']