# Google Translate

In [1]:
import urllib

import requests
from nltk import sent_tokenize

In [2]:
# Endpoint all translation requests are sent to: the `/m` page, whose
# response embeds the translation in a `result-container` <div>.
BASE_URL = "https://translate.google.com/m"
# BASE_URL = "https://translate.google.com/"
# Maximum number of characters of `text` sent in a single request.
CHAR_LIMIT = 5000

In [3]:
from typing import Sequence

In [4]:
# Example inputs: translate one English sentence into German.
source_lang_code = "en"
target_lang_code = "de"
text = "What you see is what you get."

# Append the query string (`sl` = source language, `tl` = target language,
# `q` = text to translate) to BASE_URL.
# NOTE(review): `text` is interpolated as-is, without percent-encoding, so the
# resulting URL contains raw spaces; characters such as `&` or `#` inside the
# text would change the meaning of the query string.
query_url = urllib.parse.urljoin(
    BASE_URL,
    f"?sl={source_lang_code}&tl={target_lang_code}&q={text}"
)
query_url

'https://translate.google.com/m?sl=en&tl=de&q=What you see is what you get.'

## The `requests` Package

In [5]:
response = requests.get(query_url)
response

<Response [200]>

In [6]:
type(response)

requests.models.Response

In [7]:
response.status_code

200

In [8]:
html_str = response.text

In [9]:
print(html_str[:500], "\n\n...\n\n", html_str[-500:])

<!DOCTYPE html><html dir="ltr" lang="en-US"><head><title>Google Translate</title><meta name="google" content="notranslate"><meta name="viewport" content="width=device-width,initial-scale=1"><link rel="icon" href="//ssl.gstatic.com/translate/favicon.ico" sizes="64x64"><style nonce="fGjWyIZ0p7tpgBJ41EESFg">
    body {
      font-family: 'Arial', sans-serif;
      margin: 0;
    }
    a:link,
    a:visited,
    a:active {
      color: #1a73e8; /* blue 600 */
      text-decoration: none;
    }
    . 

...

 ="Translate" class="translate-button"></div></form></div><div class="result-container">Was Sie sehen, ist, was Sie bekommen.</div><div class="links-container"><ul><li><a href="https://www.google.com/m?hl=en-US">Google home</a></li><li><a href="https://www.google.com/tools/feedback/survey/xhtml?productId=95112&hl=en-US">Send feedback</a></li><li><a href="https://www.google.com/intl/en-US/policies">Privacy and terms</a></li><li><a href="./full">Switch to full site</a></li></ul></div></bod

## Parse HTML

In [11]:
from lxml import html

In [12]:
tree = html.fromstring(html_str)

In [13]:
type(tree)

lxml.html.HtmlElement

In [14]:
result_list = tree.xpath("//div[@class='result-container']")

In [15]:
result_list[0]

<Element div at 0x7b4fe088a9e0>

In [16]:
result_list[0].text_content()

'Was Sie sehen, ist, was Sie bekommen.'

## Write The Above into (Synchronous) Functions

In [18]:
from typing import TypeAlias

#Response: TypeAlias = 
#type Response = requests.models.Response | aiohttp.ClientResponse

def short_sync_google_translate(
    text: str,
    *,
    source_lang_code: str,
    target_lang_code: str
) -> str:
    """
    Translate a short text with a single blocking HTTP request.

    Only the first CHAR_LIMIT characters of `text` are sent; anything
    beyond that is silently cut off, so longer texts must be split
    into chunks by the caller.

    Parameters
    ----------
    text : the text to translate.
    source_lang_code : language code of `text`, e.g. "en".
    target_lang_code : language code to translate into, e.g. "de".

    Returns
    -------
    The translated text, or "" when the response contains no
    `result-container` element. When the HTTP status is not 200,
    the string "Status code: <code>" is returned instead.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, or when the server does not answer
        within the timeout.
    """
    # Let `requests` build the query string: it percent-encodes every
    # value, so characters such as `&`, `#`, `+` or `%` inside `text`
    # can no longer corrupt or truncate the `q` parameter.
    response = requests.get(
        BASE_URL,
        params={
            "sl": source_lang_code,
            "tl": target_lang_code,
            "q": text[:CHAR_LIMIT],
        },
        # Without a timeout an unresponsive server blocks the kernel forever.
        timeout=30,
    )
    if response.status_code != 200:
        return f"Status code: {response.status_code}"

    return extract_translation(response.text)


def extract_translation(html_str: str):
    """
    Pull the translated text out of a result page.

    Returns the text content of the first <div> whose class attribute
    is exactly 'result-container', or "" when the page has no such
    element.
    """
    matches = html.fromstring(html_str).xpath("//div[@class='result-container']")
    return matches[0].text_content() if matches else ""

In [19]:
short_sync_google_translate(
    text,
    source_lang_code="en",
    target_lang_code="de",
)

'Was Sie sehen, ist, was Sie bekommen.'

In [20]:
sent_tokenize("")

[]

In [None]:
def sync_google_translate(
    text: str,
    *,
    source_lang_code: str,
    target_lang_code: str,
) -> str:
    """
    Translate `text` of any length, one blocking request at a time.

    A text of at most CHAR_LIMIT characters goes out as a single
    request. A longer text is cut into chunks by `split_into_chunks`,
    each chunk is translated in order, and the partial translations
    are joined with a single space.
    """
    lang_codes = {
        "source_lang_code": source_lang_code,
        "target_lang_code": target_lang_code,
    }

    if len(text) > CHAR_LIMIT:
        translated_parts = [
            short_sync_google_translate(part, **lang_codes)
            for part in split_into_chunks(text)
        ]
        return " ".join(translated_parts)

    return short_sync_google_translate(text, **lang_codes)

def split_into_chunks(text: str, chunk_char_limit: int = CHAR_LIMIT) -> list[str]:
    """
    Split `text` into chunks that each fit into one request.

    Google translate accepts as many as CHAR_LIMIT characters
    in a single request.

    sent_tokenize's return value is a list of sentences.

    We concatenate sentences into chunks as close to `chunk_char_limit`
    as possible to reduce the number of requests.

    Returns
    -------
    The chunks, in the order they appear in `text`.

    Raises
    ------
    ValueError
        If a single sentence is longer than `chunk_char_limit`
        (raised by `seq_concat`).
    """
    sentences = sent_tokenize(text)
    # `seq_concat` is a generator; materialise it so the value really is
    # the `list[str]` promised by the signature (reusable, has a len()).
    return list(seq_concat(sentences, chunk_char_limit))

In [None]:
# def seq_concat(
#     sentences: Sequence[str],
#     chunk_char_limit: int = CHAR_LIMIT
# ) -> list[str]:
#     n_chars = 0
#     start_index = 0
#     end_index = 0
#     chunks = []
#     for i, sent in enumerate(sentences):
#         if len(sent) > chunk_char_limit:
#             raise ValueError(
#                 f"sentences[{i}] contains {len(sent)} > {chunk_char_limit = } chars."
#             )
#         n_chars += len(sent) + 1
#         if n_chars > chunk_char_limit:
#             end_index = i
#             chunk = " ".join(sentences[start_index : end_index])
#             chunks.append(chunk)
#             start_index = i
#             n_chars = 0

#     # sentences as a single chunk not even reachs CHAR_LIMIT
#     if len(chunks) == 0:
#         chunks = [" ".join(sentences)]
#         return chunks

#     # Avoid overlook of last chunk
#     if end_index < len(sentences) - 1:
#         chunk = " ".join(sentences[start_index :])
#         chunks.append(chunk)
#         return chunks

In [26]:
from typing import Generator

def seq_concat(
    sentences: Sequence[str],
    chunk_char_limit: int = CHAR_LIMIT
) -> Generator[str, None, None]:
    """
    Greedily pack consecutive sentences into space-joined chunks.

    Sentences are appended to the current chunk, separated by a single
    space, for as long as the chunk stays within `chunk_char_limit`
    characters; the sentence that would overflow it starts the next
    chunk. Every sentence ends up in exactly one chunk, in the original
    order, and no chunk is empty or longer than `chunk_char_limit`.

    Parameters
    ----------
    sentences : the sentences to pack.
    chunk_char_limit : maximum length of a chunk, joining spaces included.

    Yields
    ------
    The chunks, one at a time. Nothing is yielded for empty input.

    Raises
    ------
    ValueError
        When iteration reaches a sentence that alone is longer than
        `chunk_char_limit`.
    """
    pending: list[str] = []  # sentences of the chunk being built
    pending_len = 0          # len(" ".join(pending))

    for i, sent in enumerate(sentences):
        if len(sent) > chunk_char_limit:
            raise ValueError(
                f"sentences[{i}] contains {len(sent)} > {chunk_char_limit = } chars."
            )

        # A joining space is only needed when the chunk already has content.
        added_len = len(sent) + (1 if pending else 0)

        if pending and pending_len + added_len > chunk_char_limit:
            # `sent` does not fit: emit the chunk and start the next one
            # with `sent`, counting its length right away.
            yield " ".join(pending)
            pending = [sent]
            pending_len = len(sent)
        else:
            pending.append(sent)
            pending_len += added_len

    # Whatever is left forms the last chunk, so the final sentence can
    # never be lost.
    if pending:
        yield " ".join(pending)

## Long Text `animal_farm.txt`
Text downloaded from <https://gutenberg.net.au/ebooks01/0100011.txt>.

In [42]:
# Read the whole book into memory as one string. The path is relative to the
# notebook's working directory and the platform's default text encoding is used.
# NOTE(review): the saved outputs of the later "Testing" and translation cells
# (a single 10446-character sentence; a German fairy-tale text) do not look
# like they were produced from this file -- re-run the notebook top to bottom
# to confirm.
with open("animal_farm.txt", "r") as f:
    long_text = f.read()

In [47]:
n_chars = 1000
print(long_text[:n_chars], "\n\n...\n\n", long_text[-n_chars:])



Project Gutenberg Australia



Title:      Animal Farm
Author:     George Orwell (pseudonym of Eric Blair) (1903-1950)
* A Project Gutenberg of Australia eBook *
eBook No.:  0100011.txt
Language:   English
Date first posted:          August 2001
Date most recently updated: March 2008

Project Gutenberg of Australia eBooks are created from printed editions
which are in the public domain in Australia, unless a copyright notice
is included. We do NOT keep any eBooks in compliance with a particular
paper edition.

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing this
file.

This eBook is made available at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg of Australia License which may be viewed online at
gutenberg.net.au/licence.html

To contact Project Gutenberg of Australia go to gutenberg.net.au


Title:      Ani

In [29]:
# Testing
g = seq_concat([long_text])
for i in g:
    pass

ValueError: sentences[0] contains 10446 > chunk_char_limit = 5000 chars.

In [30]:
sync_google_translate(long_text, source_lang_code="en", target_lang_code="de")

'Vor vielen Jahren lebte ein Kaiser, der so sehr auf neue Kleider stand, dass er sein ganzes Geld für Kleidung ausgab. Er kümmerte sich nicht im Geringsten um seine Soldaten, und er ging weder gern ins Theater noch auf die Jagd, es sei denn, er hatte Gelegenheit, seine neuen Kleider zu zeigen. Er trug für jede Stunde des Tages einen anderen Anzug, und wie von jedem anderen König oder Kaiser sagt man gewöhnlich: „Er sitzt im Rat“, so sagte man von ihm immer: „Der Kaiser sitzt in seiner Garderobe.“ In der großen Stadt, die seine Hauptstadt war, verging die Zeit fröhlich; jeden Tag kamen Fremde an den Hof. Eines Tages tauchten zwei Schurken auf, die sich Weber nannten. Sie gaben vor, Stoffe in den schönsten Farben und mit den aufwendigsten Mustern zu weben, und die daraus hergestellten Kleider sollten die wunderbare Eigenschaft haben, für jeden unsichtbar zu bleiben, der für sein Amt ungeeignet war oder einen außergewöhnlich einfachen Charakter hatte. „Das müssen wirklich prächtige Kleide

## Async Counterpart

We need to write
- [ ] an `async_google_translate` function
- [ ] a `short_async_google_translate` function

In [31]:
import asyncio
import aiohttp

In [32]:
import nest_asyncio
nest_asyncio.apply()

In [33]:
async def short_async_google_translate(
    text: str,
    *,
    source_lang_code: str,
    target_lang_code: str,
) -> str:
    """
    Translate a short text with a single non-blocking HTTP request.

    Only the first CHAR_LIMIT characters of `text` are sent; anything
    beyond that is silently cut off, so longer texts must be split
    into chunks by the caller.

    Parameters
    ----------
    text : the text to translate.
    source_lang_code : language code of `text`, e.g. "en".
    target_lang_code : language code to translate into, e.g. "de".

    Returns
    -------
    The translated text, or "" when the response contains no
    `result-container` element.

    Raises
    ------
    aiohttp.ClientResponseError
        When the response status is 400 or higher
        (via `response.raise_for_status()`).
    """
    # Hand the query to aiohttp as a mapping so that every value is
    # percent-encoded; characters such as `&`, `#`, `+` or `%` inside
    # `text` can no longer corrupt or truncate the `q` parameter.
    query_params = {
        "sl": source_lang_code,
        "tl": target_lang_code,
        "q": text[:CHAR_LIMIT],
    }
    async with aiohttp.ClientSession() as session:
        async with session.get(BASE_URL, params=query_params) as response:
            response.raise_for_status()
            html_str = await response.text()
    return extract_translation(html_str)

In [34]:
asyncio.run(short_async_google_translate(
    "A person who never made a mistake never tried anything new.",
    source_lang_code="en",
    target_lang_code="de",
))

'Eine Person, die nie einen Fehler gemacht hat, hat nie etwas Neues ausprobiert.'

In [35]:
async def async_google_translate(
    text: str,
    *,
    source_lang_code: str,
    target_lang_code: str,
) -> str:
    """
    Translate `text` of any length, sending the chunk requests concurrently.

    A text of at most CHAR_LIMIT characters is translated with one
    request. A longer text is cut into chunks by `split_into_chunks`;
    one task per chunk is started inside an `asyncio.TaskGroup`, and
    once all tasks have finished their results are joined, in chunk
    order, with a single space.
    """
    lang_codes = {
        "source_lang_code": source_lang_code,
        "target_lang_code": target_lang_code,
    }

    if len(text) <= CHAR_LIMIT:
        return await short_async_google_translate(text, **lang_codes)

    chunks = split_into_chunks(text)
    # Leaving the `async with` block waits for every task to complete.
    async with asyncio.TaskGroup() as tg:
        tasks = [
            tg.create_task(short_async_google_translate(chunk, **lang_codes))
            for chunk in chunks
        ]

    translated_parts = [task.result() for task in tasks]
    return " ".join(translated_parts)

In [36]:
asyncio.run(
    async_google_translate(
        long_text,
        source_lang_code="en",
        target_lang_code="de",
    )
)

'Vor vielen Jahren lebte ein Kaiser, der so sehr auf neue Kleider stand, dass er sein ganzes Geld für Kleidung ausgab. Er kümmerte sich nicht im Geringsten um seine Soldaten, und er ging weder gern ins Theater noch auf die Jagd, es sei denn, er hatte Gelegenheit, seine neuen Kleider zu zeigen. Er trug für jede Stunde des Tages einen anderen Anzug, und wie von jedem anderen König oder Kaiser sagt man gewöhnlich: „Er sitzt im Rat“, so sagte man von ihm immer: „Der Kaiser sitzt in seiner Garderobe.“ In der großen Stadt, die seine Hauptstadt war, verging die Zeit fröhlich; jeden Tag kamen Fremde an den Hof. Eines Tages tauchten zwei Schurken auf, die sich Weber nannten. Sie gaben vor, Stoffe in den schönsten Farben und mit den aufwendigsten Mustern zu weben, und die daraus hergestellten Kleider sollten die wunderbare Eigenschaft haben, für jeden unsichtbar zu bleiben, der für sein Amt ungeeignet war oder einen außergewöhnlich einfachen Charakter hatte. „Das müssen wirklich prächtige Kleide

## Time Consumption Comparison

In [48]:
sum(1 for _ in split_into_chunks(long_text))

34

In [56]:
%%time
sync_google_translate(
    long_text,
    source_lang_code="en",
    target_lang_code="de",
);

CPU times: user 1.3 s, sys: 76.4 ms, total: 1.38 s
Wall time: 10.6 s


In [55]:
%%time
asyncio.run(
    async_google_translate(
        long_text,
        source_lang_code="en",
        target_lang_code="de",
    )
);

CPU times: user 574 ms, sys: 88.1 ms, total: 662 ms
Wall time: 1.15 s


If `long_text` splits into many chunks, it's not recommended to use
`timeit`: it re-runs the cell repeatedly, and every run sends one
request per chunk.

## Rate Limiting (using Semaphore)