In [7]:
import csv
import logging
import requests
from pathlib import Path
from pprint import pprint
from bs4 import BeautifulSoup
from azure.cognitiveservices.speech import AudioDataStream, SpeechConfig, SpeechSynthesizer, SpeechSynthesisOutputFormat
from azure.cognitiveservices.speech.audio import AudioOutputConfig

In [8]:
# Browser-like User-Agent passed as `headers=` to the requests.get calls in this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
}

In [9]:
def reverso_request(start, length, timeout=10):
    """Fetch one slice of the shared Reverso Context favourites list.

    Args:
        start: Value sent as the ``start`` query parameter (offset of the
            first favourite, as used by ``main`` for paging).
        length: Value sent as the ``length`` query parameter (how many
            favourites to request).
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without it a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON body, or ``None`` when the server answers with an
        HTTP error status (the error is printed).
    """
    base_url = "https://context.reverso.net/bst-web-user/user/favourites/shared"

    params = {
        'userName': 'lmirandam07',
        'start': start,
        'length': length,
        # NOTE(review): meaning of order=10 is not visible here -- presumably a sort mode; confirm.
        'order': 10
    }

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(e)
        # Explicit so callers can see that a failed request yields None.
        return None

    return response.json()

In [10]:
# Function to drop the duplicated words from the JSON
def words_filter(words, csv_header):
    """Return the favourites that come before the last word saved in ``words_list.csv``.

    The CSV file is created if it does not exist. Its first row is the header
    and the first column of every following row is the German word.

    Args:
        words: List of favourite dicts with at least the keys ``srcLang``,
            ``srcText`` and ``trgText``.
            NOTE(review): assumes the list is ordered newest first, so that
            everything before the last saved word is new -- confirm against
            the API ordering.
        csv_header: Mapping whose keys are written as the CSV header row when
            the file is empty.

    Returns:
        ``words`` unchanged when the file is empty, holds only the header, or
        its last saved word is not present in ``words``; otherwise the slice
        of ``words`` that precedes the last saved word.
    """
    csv_file = Path("./words_list.csv")
    csv_file.touch(exist_ok=True)

    # csv.reader is a one-shot iterator (no len(), no indexing), so materialise
    # the rows first. Blank lines are skipped so they do not count as rows.
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        rows = [row for row in csv.reader(file) if row]

    if not rows:
        # Only a brand-new file is opened for writing; opening an existing
        # file with 'w' would wipe every previously saved word.
        with open(csv_file, 'w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(csv_header.keys())
        return words

    if len(rows) == 1:
        # Header only: nothing has been saved yet, so every word is new.
        return words

    last_word_de = rows[-1][0]
    for idx, word in enumerate(words):
        # The German text is on the source side or the target side depending
        # on the direction in which the favourite was saved.
        de_text = word['srcText'] if word['srcLang'] == 'de' else word['trgText']
        if de_text == last_word_de:
            return words[:idx]

    # Last saved word is not in the fetched list: treat everything as new
    # instead of failing with an IndexError.
    return words


In [21]:
def get_word_tag(de_word, timeout=10):
    """Look up the part of speech of a German word on Reverso Context.

    The German->Spanish Reverso page for ``de_word`` is downloaded and the
    ``data-pos`` attribute of the first button inside ``#pos-filters`` is
    mapped to a Spanish tag name.

    Args:
        de_word: German word appended to the Reverso URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        One of ``'adjetivos'``, ``'verbos'``, ``'sustantivos'`` or
        ``'adverbios'``; ``None`` when the request fails with an HTTP error,
        the page has no part-of-speech button, or the value is not in the
        mapping.
    """
    tags = {
        'adj.': 'adjetivos',
        'v.': 'verbos',
        'n.': 'sustantivos',
        'adv.': 'adverbios'
    }
    reverso_url = f"https://context.reverso.net/traduccion/aleman-espanol/{de_word}"

    try:
        req = requests.get(reverso_url, headers=headers, timeout=timeout)
        # Without this call requests never raises HTTPError, so the handler
        # below could not run and an error page would be parsed instead.
        req.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(e)
        return None

    soup = BeautifulSoup(req.text, "html.parser")
    pos_buttons = soup.select("#pos-filters button")
    if not pos_buttons:
        # Page has no part-of-speech filter for this word.
        return None

    # .get() on both the tag and the mapping: a missing attribute or an
    # unmapped part of speech yields None instead of a KeyError.
    return tags.get(pos_buttons[0].get("data-pos"))

'sustantivos'

In [73]:
def get_noun_article(de_word, es_word, timeout=10):
    """Find the article and plural form of a German noun on dict.leo.org.

    The Spanish and German cells of the ``#section-subst`` table are walked in
    parallel. The first row whose Spanish cell has ``es_word`` as its second
    space-separated token is used.

    Args:
        de_word: German noun appended to the LEO URL.
        es_word: Spanish translation used to pick the matching table row.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A ``(de_article, de_plural)`` tuple, where ``de_plural`` is the text of
        the row's ``<small>`` element (empty string when the row has none), or
        ``None`` when the request fails with an HTTP error or no row matches.
    """
    leo_url = f"https://dict.leo.org/alem%C3%A1n-espa%C3%B1ol/{de_word}"

    try:
        req = requests.get(leo_url, headers=headers, timeout=timeout)
        # Needed for the HTTPError handler below to ever trigger.
        req.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(e)
        return None

    soup = BeautifulSoup(req.text, "html.parser")
    es_noun = soup.select("#section-subst td[lang='es'] samp")
    de_noun = soup.select("#section-subst td[lang='de'] samp")

    for es_row, de_row in zip(es_noun, de_noun):
        es_tokens = es_row.text.split(' ')
        # Rows with a single token have no word after the article; skip them
        # instead of raising IndexError.
        if len(es_tokens) < 2 or es_tokens[1] != es_word:
            continue

        # First token of the German cell is the article.
        de_article = de_row.text.split(' ')[0]
        # The plural form is in a <small> element, which may be absent.
        plural_tag = de_row.find('small')
        de_plural = plural_tag.text if plural_tag is not None else ''

        return (de_article, de_plural)

    return None

# Demo call: look up article and plural of 'Gedanke', matching the row by its Spanish translation 'idea'.
get_noun_article('Gedanke', 'idea')

('der', 'pl.: die Gedanken')

In [76]:
def main():
    """Download the Reverso favourites and build one flash-card row per new word.

    All favourites are fetched (a first request of 10, then one request for
    the remainder), reduced to the not-yet-saved ones with ``words_filter``
    and enriched with a part-of-speech tag. Nouns additionally get their
    article and plural form from ``get_noun_article``.

    Returns:
        A list of dicts with the keys ``de_word``, ``de_sentence``,
        ``es_word``, ``es_sentence``, ``de_audio`` and ``tags``. The list is
        empty when the first request fails.
        NOTE(review): ``de_audio`` is never filled and nothing is written to
        the CSV yet -- both look like pending work.
    """
    # logging.basicConfig(filename='scraper.log', level=logging.INFO)
    words_dict = {
        'de_word': '',
        'de_sentence': '',
        'es_word': '',
        'es_sentence': '',
        'de_audio': '',
        'tags': ''
    }

    start = 0
    length = 10
    data = reverso_request(start, length)
    if data is None:
        # reverso_request already printed the HTTP error.
        return []

    words_results = data['results']
    num_total_results = data['numTotalResults']
    if num_total_results > length:
        # Start where the first request ended and fetch all remaining words at once.
        start = length
        length = num_total_results - length
        data = reverso_request(start, length)
        if data is not None:
            words_results.extend(data['results'])

    filtered_words = words_filter(words_results, words_dict)

    rows = []
    for word in filtered_words:
        # Fresh copy per word so earlier rows are not overwritten.
        row = dict(words_dict)

        # A favourite can be saved in either direction, so pick the German and
        # Spanish sides according to the source language.
        if word['srcLang'] == 'de':
            de_text, de_context = word['srcText'], word['srcContext']
            es_text, es_context = word['trgText'], word['trgContext']
        else:
            de_text, de_context = word['trgText'], word['trgContext']
            es_text, es_context = word['srcText'], word['srcContext']

        row['de_word'] = de_text
        # get_text() strips the <em> tags from the sentences; the parser is
        # named explicitly so the result does not depend on what is installed.
        row['de_sentence'] = BeautifulSoup(de_context, "html.parser").get_text()
        row['es_word'] = es_text
        row['es_sentence'] = BeautifulSoup(es_context, "html.parser").get_text()
        row['tags'] = get_word_tag(row['de_word'])

        if row['tags'] == 'sustantivos':
            noun_info = get_noun_article(row['de_word'], row['es_word'])
            # get_noun_article returns None when no dictionary row matches.
            if noun_info is not None:
                article, plural = noun_info
                # Separate the parts with spaces: "der Gedanke pl.: die Gedanken".
                row['de_word'] = f"{article} {row['de_word']} {plural}".strip()

        rows.append(row)

    return rows


In [18]:
# Entry point: run the scraper when this module is the top-level script (or notebook kernel).
if __name__ == '__main__':
    main()

14
[{'comment': '',
  'creationDate': '2021-04-15T17:12:41Z',
  'document': '',
  'documentTitle': '',
  'domain': None,
  'favType': None,
  'hash': 'a1d3f743f3938e96795f0b870f546d83',
  'historyID': None,
  'id': 26248847,
  'lastEdit': '2021-04-15T17:12:41Z',
  'removed': False,
  'source': 0,
  'srcContext': 'Ich glaube, dass es war nur eine <em>Erkältung</em>.',
  'srcLang': 'de',
  'srcSegment': '',
  'srcText': 'Erkältung',
  'trgContext': 'Pensé que solo era un <em>resfriado</em>.',
  'trgLang': 'es',
  'trgText': 'resfriado',
  'trgTextEdited': None,
  'userID': 4291697},
 {'comment': '',
  'creationDate': '2021-04-15T17:09:06Z',
  'document': '',
  'documentTitle': '',
  'domain': None,
  'favType': None,
  'hash': 'f85dbcb404e9b279eb945a8b5124806c',
  'historyID': None,
  'id': 26248790,
  'lastEdit': '2021-04-15T17:09:06Z',
  'removed': False,
  'source': 0,
  'srcContext': 'Komm mir nicht zu nahe. Ich bin <em>erkältet</em>.',
  'srcLang': 'de',
  'srcSegment': '',
  'srcTe