In [2]:
import csv
import logging
import requests
from pathlib import Path
from pprint import pprint
from decouple import config
from random import choice
from bs4 import BeautifulSoup
from xml.etree import ElementTree

In [3]:
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0'
}
azure_access_token = ''

In [4]:
def reverso_request(start, length):
    base_url = f"https://context.reverso.net/bst-web-user/user/favourites/shared"
    
    params = {
        'userName': 'lmirandam07',
        'start': start,
        'length': length,
        'order': 10
    }

    try:
        request = requests.get(base_url, params=params, headers=headers)
        request.raise_for_status()
        
        return request.json()
    except requests.exceptions.HTTPError as e:
        print(e)

In [5]:
# Function to drop the duplicated words from the JSON
def words_filter(words, csv_headers): 
    csv_file = Path("./words_list.csv")
    csv_file.touch(exist_ok=True)
    empty_file = False

    with open(csv_file, 'r') as file: 
      reader = list(csv.reader(file))

      if len(reader) == 1:
        return words

      if len(reader) < 1:
        empty_file = True
      elif len(reader) >=2:
        last_word_de = reader[-1][0]

    with open(csv_file, 'w') as file: 
      writer = csv.writer()
      if empty_file:
        writer.writerow(csv_headers.keys())
        return words

      last_updated_idx = words.index(list(filter(lambda w: w['srcText'] == last_word_de if w['srcLang'] == 'de' else w['trgText'] == last_word_de, words))[0])

      return words[:last_updated_idx]


In [6]:
def get_word_tag(de_word):
    tags = ('adjetivo', 'sustantivo', 'adverbio', 'verbo')
    reverso_url = f"https://context.reverso.net/traduccion/aleman-espanol/{de_word}"

    try:
        req = requests.get(reverso_url, headers=headers)
        soup = BeautifulSoup(req.text, "html.parser")
        word_tag = soup.select("#pos-filters button")[0].text.strip().lower() or ""

        if word_tag not in tags:
            print(de_word, word_tag)
            return ""
        
        return word_tag
    except requests.exceptions.HTTPError as e:
        print(e)

In [7]:
def get_noun_article(de_word):
    leo_url = f"https://dict.leo.org/alem%C3%A1n-espa%C3%B1ol/{de_word}"

    try:
        req = requests.get(leo_url, headers=headers)
        soup = BeautifulSoup(req.text, "html.parser")        
        de_noun = soup.select("#section-subst td[lang='de'] samp")

        de_article = de_noun[0].text.split(' ')[0] or ''
        de_plural = de_noun[0].find('small').text or ''
        de_word = f"{de_article} {de_word} - {de_plural}"
        return de_word
        
    except requests.exceptions.HTTPError as e:
        print(e)


In [8]:
def get_access_token(sub_key, region):
    fetch_token_url = f"https://{region}.api.cognitive.microsoft.com/sts/v1.0/issueToken"
    headers = {
        'Ocp-Apim-Subscription-Key': sub_key
    }

    try:
        response = requests.post(fetch_token_url, headers=headers)
        return str(response.text)
    except requests.exceptions.HTTPError as e:
        print(e)

In [9]:
def get_sentence_audio(de_sentence):
    AZURE_API_KEY = config('AZURE_API_KEY')
    AZURE_REGION = config('AZURE_REGION')
    global azure_access_token

    if not azure_access_token:
        azure_access_token = get_access_token(AZURE_API_KEY, AZURE_REGION)

    try:
        langs_and_voices = {
            'de-DE': ('de-DE-ConradNeural', 'de-DE-KatjaNeural'),
            'de-AT': ('de-AT-JonasNeural', 'de-AT-IngridNeural'),
            'de-CH': ('de-CH-LeniNeural', 'de-CH-JanNeural')
        }
        # From the list of voices in german in the API, randomly select one
        lang_choice = choice(list(langs_and_voices.keys()))
        voice_choice = choice(langs_and_voices[lang_choice])

        rate = 0
        pitch = 0
        
        azure_api_url = f'https://{AZURE_REGION}.tts.speech.microsoft.com/cognitiveservices/v1'
        headers = {
            'Authorization': f'Bearer {azure_access_token}',
            'Content-Type': 'application/ssml+xml',
            'X-Microsoft-OutputFormat': 'audio-24khz-96kbitrate-mono-mp3'
        }

        xml_body = ElementTree.Element('speak', version='1.0')
        xml_body.set('{http://www.w3.org/XML/1998/namespace}lang', lang_choice)

        voice = ElementTree.SubElement(xml_body, 'voice')
        voice.set('{http://www.w3.org/XML/1998/namespace}lang', lang_choice)
        voice.set(
            'name', voice_choice)

        prosody = ElementTree.SubElement(voice, 'prosody')
        prosody.set('rate', f'{rate}%')
        prosody.set('pitch', f'{pitch}%')
        prosody.text = de_sentence

        body = ElementTree.tostring(xml_body)

        response = requests.post(azure_api_url, headers=headers, data=body)
        return response.content

    except Exception as e:
        print(e)

In [10]:
print(get_sentence_audio('Ich heiße Luis'))

0441a9c3fd394ffb8b1281fb32f90fb8 southcentralus
b'\xff\xf3\xa4\xc4\x00\x00\x00\x03H\x00\x00\x00\x00LAME3.100UUUUUUUUUUULAME3.100UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU\xff\xf3\xa4\xc4\x00\x00\x00\x03H\x00\x00\x00\x00LAME3.100UUUUUUUUUUULAME3.100UUUUUUUU\x81\x96a\xc6\xcc[\t\xc18\x1e\x82\x10\x94\'\xe0\x1a\x00p,\x0c#|#\xe0\xe7\x135\xb2\xfe\x17\xe0g\t\x19/]\t j\x040X\xd8\x89@j\x03P.\tB\xfe=a \x17\x04 \xef\x16\xf2\x16\\\xdc\x95\x88y\xa6\x85\x9fP\x80\x91A@\xac6\xffPP\x80\x91\xba\n\x01\x80\xc3\x05\xc11Y&u\xc2\xe1\xb6\xf5@LO9\xc35\x00\xa06\xb0 \x18r6\xd4@\x91\x00\xa0\x90\xb9>\xdc\xe1\x04\t\x82`\x999@\xc0m\xe4\x04\x90 ap\xb8\x10t\x03\x81m\x10\n\x18\xf3\xa4\x0cz\x86\\ \xa0\xa0\xc0\\\r\xb7\t\xf5\xc13}D\x08\x05m\xa8(\x04\x0f\x11\x93\xc9\xe4\x04\x96\x17\r\x9c$Q\xb2\x00@\xf0\\OH\x1c\x81\x06.\x17\

In [11]:
def get_words_list(word, words_dict):
    words_dict_c = words_dict.copy()
    # If src lang is german, then return the first order otherwise change the order
    language_order = lambda lang: ('src', 'trg') if lang =='de' else ('trg', 'src')

    de_order, es_order = language_order(word['srcLang'])
    words_dict_c['de_word'] = word[f'{de_order}Text']
    words_dict_c['es_word'] = word[f'{es_order}Text']
    # To remove <em> tags 
    words_dict_c['de_sentence'] = BeautifulSoup(word[f'{de_order}Context']).get_text()
    words_dict_c['es_sentence'] = BeautifulSoup(word[f'{es_order}Context']).get_text()

    words_dict_c['tags'] = get_word_tag(words_dict_c['de_word'])

    if words_dict_c['tags'] == 'sustantivo':
        words_dict_c['de_word'] = get_noun_article(words_dict_c['de_word'])

    words_dict['de_audio'] = get_sentence_audio(words_dict['de_sentence'])
    return words_dict_c

In [12]:
def main():
    # logging.basicConfig(filename='scraper.log', level=logging.INFO)
    words_dict = {
        'de_word': '',
        'de_sentence': '',
        'es_word': '',
        'es_sentence': '',
        'de_audio': '',
        'tags': ''
    }
    start = 0
    length = 10
    data = reverso_request(start, length)
    words_results = data['results']
    num_total_results = data['numTotalResults']
    if num_total_results > length:
        # Starts the requests in the end of the previus and make another with all the remaining words
        start = length
        length = num_total_results - length
        data = reverso_request(start, length)
        words_results.extend(data['results'])

    filtered_words = words_filter(words_results, words_dict)

    words_dict_list = [get_words_list(f_w, words_dict) for f_w in filtered_words]
    print(len(words_dict_list))
    pprint(words_dict_list)

In [11]:
if __name__ == '__main__':
   main()

60
[{'de_audio': '',
  'de_sentence': 'Wie du siehst, ist hier alles tipptopp.',
  'de_word': 'tipptopp',
  'es_sentence': 'Como verás, todo está impecable.',
  'es_word': 'impecable',
  'tags': 'adjetivo'},
 {'de_audio': '',
  'de_sentence': 'Tom kaufte Mary einen teuren Regenschirm.',
  'de_word': 'der Regenschirm - pl.: die Regenschirme',
  'es_sentence': 'Tom le compró a María un paraguas caro.',
  'es_word': 'paraguas',
  'tags': 'sustantivo'},
 {'de_audio': '',
  'de_sentence': 'Habe ihnen meinen Fernseher für $300 verkauft.',
  'de_word': 'der Fernseher - pl.: die Fernseher',
  'es_sentence': 'Les vendí el televisor por 300 u$s.',
  'es_word': 'televisor',
  'tags': 'sustantivo'},
 {'de_audio': '',
  'de_sentence': 'Das war alles in meinem Kühlschrank.',
  'de_word': 'der Kühlschrank - pl.: die Kühlschränke',
  'es_sentence': 'Eso era todo lo que había en mi nevera.',
  'es_word': 'nevera',
  'tags': 'sustantivo'},
 {'de_audio': '',
  'de_sentence': 'Versuch du es mit einer Mund