In [7]:
import csv
import logging
import requests
from pathlib import Path
from pprint import pprint
from bs4 import BeautifulSoup
from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, SpeechSynthesisOutputFormat
from azure.cognitiveservices.speech.audio import AudioOutputConfig

In [8]:
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0'
}

In [9]:
def reverso_request(start, length):
    base_url = f"https://context.reverso.net/bst-web-user/user/favourites/shared"
    
    params = {
        'userName': 'lmirandam07',
        'start': start,
        'length': length,
        'order': 10
    }

    try:
        request = requests.get(base_url, params=params, headers=headers)
        request.raise_for_status()
        
        return request.json()
    except requests.exceptions.HTTPError as e:
        print(e)

In [83]:
# Function to drop the duplicated words from the JSON
def words_filter(words, csv_header): 
    csv_file = Path("./words_list.csv")
    csv_file.touch(exist_ok=True)
    empty_file = False

    with open(csv_file, 'r') as file: 
      reader = list(csv.reader(file))

      if len(reader) == 1:
        return words

      if len(reader) < 1:
        empty_file = True
      elif len(reader) >=2:
        last_word_de = reader[-1][0]

    with open(csv_file, 'w') as file: 
      writer = csv.writer()
      if empty_file:
        writer.writerow(csv_header.keys())
        return words
      #  TODO: Filter repeated words
      last_updated_idx = words.index(list(filter(lambda w: w['srcText'] == last_word_de if w['srcLang'] == 'de' else w['trgText'] == last_word_de, words))[0])

      return words[:last_updated_idx]


In [84]:
def get_word_tag(de_word):
    tags = {
        'adj.': 'adjetivos',
        'v.': 'verbos',
        'n.': 'sustantivos',
        'adv.': 'adverbios'
    }
    reverso_url = f"https://context.reverso.net/traduccion/aleman-espanol/{de_word}"

    try:
        req = requests.get(reverso_url, headers=headers)
        soup = BeautifulSoup(req.text, "html.parser")
        word_tag = soup.select("#pos-filters button")[0]["data-pos"]   

        return tags[word_tag]
    except requests.exceptions.HTTPError as e:
        print(e)

In [106]:
def get_noun_article(de_word, es_word):
    leo_url = f"https://dict.leo.org/alem%C3%A1n-espa%C3%B1ol/{de_word}"

    try:
        req = requests.get(leo_url, headers=headers)
        soup = BeautifulSoup(req.text, "html.parser")        
        es_noun = soup.select("#section-subst td[lang='es'] samp")
        de_noun = soup.select("#section-subst td[lang='de'] samp")
        
        for es_row, de_row in zip(es_noun, de_noun):
            # If the spanish word from reverso its the same as the one on the Leo row
            if es_row.text.split(' ')[1] == es_word:
                # Get the article for the noun
                de_article = de_row.text.split(' ')[0] or ''
                # Get the plural form of the noun
                de_plural = de_row.find('small').text or ''

                return (de_article, de_plural)            
        
    except requests.exceptions.HTTPError as e:
        print(e)
get_noun_article('Wetter','Clima')

In [100]:
def main():
    # logging.basicConfig(filename='scraper.log', level=logging.INFO)
    words_dict = {
        'de_word': '',
        'de_sentence': '',
        'es_word': '',
        'es_sentence': '',
        'de_audio': '',
        'tags': ''
    }

    start = 0
    length = 10
    data = reverso_request(start, length)
    words_results = data['results']
    num_total_results = data['numTotalResults']
    if num_total_results > length:
        # Starts the requests in the end of the previus and make another with all the remaining words
        start = length
        length = num_total_results - length
        data = reverso_request(start, length)
        words_results.extend(data['results'])

    filtered_words = words_filter(words_results, words_dict)

    for word in filtered_words:
        if word['srcLang'] == 'de':    
            # words_dict['de_word'] = word['srcText'] if word['srcLang'] == 'de' else word['trgText']
            words_dict['de_word'] = word['srcText']
            words_dict['es_word'] = word['trgText']
            # To remove <em> tags 
            words_dict['de_sentence'] = BeautifulSoup(word['srcContext']).get_text()
            words_dict['es_sentence'] = BeautifulSoup(word['trgContext']).get_text()

        else:
            words_dict['de_word'] = word['trgText']
            words_dict['es_word'] = word['srcText']

            words_dict['de_sentence'] = BeautifulSoup(word['trgContext']).get_text()
            words_dict['es_sentence'] = BeautifulSoup(word['srcContext']).get_text()

        words_dict['tags'] = get_word_tag(words_dict['de_word'])

        if words_dict['tags'] == 'sustantivos':
            article, plural = get_noun_article(words_dict['de_word'], words_dict['es_word'])
            words_dict['de_word'] = f"{article} {words_dict['de_word']} {plural}"
        pprint(words_dict)


In [101]:
if __name__ == '__main__':
   main()

{'de_audio': '',
 'de_sentence': 'Das ist meine Zeit, zu glänzen.',
 'de_word': 'glänzen',
 'es_sentence': 'Este es mi momento de brillar.',
 'es_word': 'brillar',
 'tags': 'verbos'}
{'de_audio': '',
 'de_sentence': 'Mir gefällt der Gedanke nicht besonders.',
 'de_word': 'der Gedanke pl.: die Gedanken',
 'es_sentence': 'No estoy seguro que me guste mucho la idea.',
 'es_word': 'idea',
 'tags': 'sustantivos'}
{'de_audio': '',
 'de_sentence': 'Gebt Gwen zumindest einen fairen Prozess.',
 'de_word': 'zumindest',
 'es_sentence': 'Al menos dele a Gwen un juicio justo.',
 'es_word': 'al menos',
 'tags': 'adverbios'}
{'de_audio': '',
 'de_sentence': 'Eigentlich bist du genau pünktlich, Harvey.',
 'de_word': 'pünktlich',
 'es_sentence': 'En realidad, Harvey, llegas justo a tiempo.',
 'es_word': 'a tiempo',
 'tags': 'adjetivos'}
{'de_audio': '',
 'de_sentence': 'Oh, ja, es war so spannend.',
 'de_word': 'spannend',
 'es_sentence': 'Oh, sí, fue tan emocionante.',
 'es_word': 'emocionante',
 'tag

TypeError: cannot unpack non-iterable NoneType object