In [2]:
import csv
import logging
import requests
from pathlib import Path
from pprint import pprint
from decouple import config
from bs4 import BeautifulSoup
from xml.etree import ElementTree

In [3]:
# HTTP headers passed to every requests.get() call in this notebook;
# the User-Agent is a desktop Firefox string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) '
        'Gecko/20100101 Firefox/87.0'
    ),
}

In [4]:
def reverso_request(start, length, user_name='lmirandam07', timeout=10):
    """Fetch one slice of a user's shared Reverso Context favourites.

    Args:
        start: Value sent as the ``start`` query parameter (offset of the
            first favourite requested).
        length: Value sent as the ``length`` query parameter (how many
            favourites are requested).
        user_name: Reverso account whose shared favourites are read.
        timeout: Seconds to wait on the server before giving up, so a stalled
            connection cannot hang the notebook forever.

    Returns:
        The decoded JSON payload, or None when the server answers with an
        HTTP error status (the error is printed).

    Raises:
        requests.exceptions.RequestException: for failures other than an HTTP
            error status (connection errors, timeouts); these are not caught.
    """
    base_url = "https://context.reverso.net/bst-web-user/user/favourites/shared"

    params = {
        'userName': user_name,
        'start': start,
        'length': length,
        # NOTE(review): sort-order code kept as found; its meaning is not
        # visible in this notebook.
        'order': 10,
    }

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(e)
        return None

    return response.json()

In [5]:
# Function to drop the already-saved (duplicated) words from the JSON results
def words_filter(words, csv_headers, csv_path="./words_list.csv"):
    """Return only the words that come before the last word saved in the CSV.

    The CSV is created when missing. Its last row's first column is taken as
    the most recently saved German word; every entry of ``words`` that appears
    before that word is returned.

    Args:
        words: List of favourite dicts with ``srcLang``, ``srcText`` and
            ``trgText`` keys.
        csv_headers: Mapping whose keys are written as the header row when the
            CSV is still empty.
        csv_path: Location of the CSV file.

    Returns:
        ``words`` itself when the CSV is empty, holds only the header row, or
        its last saved word is not present in ``words``; otherwise the slice
        of ``words`` preceding the last saved word.

    NOTE(review): rows are matched on the raw ``srcText``/``trgText``. If the
    CSV stores the decorated noun form built by ``get_noun_article``, nouns
    will never match -- confirm against the code that writes the CSV.
    """
    csv_file = Path(csv_path)
    csv_file.touch(exist_ok=True)

    # newline='' is what the csv module expects; blank lines are skipped so a
    # trailing empty line cannot be mistaken for the last saved row.
    with open(csv_file, 'r', newline='') as file:
        rows = [row for row in csv.reader(file) if row]

    if not rows:
        # Only a brand-new file is opened for writing, so existing data is
        # never truncated.
        with open(csv_file, 'w', newline='') as file:
            csv.writer(file).writerow(csv_headers.keys())
        return words

    if len(rows) == 1:
        # Header only: nothing has been saved yet.
        return words

    last_word_de = rows[-1][0]
    for idx, word in enumerate(words):
        # The German text sits on the source or target side depending on the
        # direction the favourite was saved in.
        de_text = word['srcText'] if word['srcLang'] == 'de' else word['trgText']
        if de_text == last_word_de:
            return words[:idx]

    # Last saved word not found in the fetched list: treat every word as new
    # instead of failing with an IndexError.
    return words


In [6]:
def get_word_tag(de_word, timeout=10):
    """Look up a German word on Reverso Context and return its word-class tag.

    The tag is the lowercased text of the first button found under the
    ``#pos-filters`` element of the translation page.

    Args:
        de_word: German word (or phrase) to look up.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        One of 'adjetivo', 'sustantivo', 'adverbio', 'verbo', or "" when the
        request fails with an HTTP error status, the page has no such button,
        or the button text is not one of those tags.
    """
    tags = ('adjetivo', 'sustantivo', 'adverbio', 'verbo')
    reverso_url = f"https://context.reverso.net/traduccion/aleman-espanol/{de_word}"

    try:
        response = requests.get(reverso_url, headers=headers, timeout=timeout)
        # Without this call an HTTPError is never raised and the handler
        # below would be dead code.
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(e)
        return ""

    soup = BeautifulSoup(response.text, "html.parser")
    pos_buttons = soup.select("#pos-filters button")
    if not pos_buttons:
        # No part-of-speech filter on the page: nothing to classify.
        return ""

    word_tag = pos_buttons[0].text.strip().lower()
    if word_tag not in tags:
        # Surface unrecognised tags so they can be inspected.
        print(de_word, word_tag)
        return ""

    return word_tag

In [7]:
def get_noun_article(de_word, timeout=10):
    """Decorate a German noun with the article and plural found on LEO.

    The first ``samp`` element under ``#section-subst td[lang='de']`` is used:
    its first space-separated token is taken as the article and the text of
    its ``small`` child as the plural.

    Args:
        de_word: German noun to look up.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        ``"<article> <de_word> - <plural>"`` when both parts are found, the
        same string without the missing part otherwise, and ``de_word``
        unchanged when the request fails with an HTTP error status or the
        page has no matching entry.
    """
    leo_url = f"https://dict.leo.org/alem%C3%A1n-espa%C3%B1ol/{de_word}"

    try:
        response = requests.get(leo_url, headers=headers, timeout=timeout)
        # Without this call an HTTPError is never raised and the handler
        # below would be dead code.
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(e)
        # Keep the plain word rather than handing None back to the caller.
        return de_word

    soup = BeautifulSoup(response.text, "html.parser")
    noun_entries = soup.select("#section-subst td[lang='de'] samp")
    if not noun_entries:
        return de_word

    first_entry = noun_entries[0]
    de_article = first_entry.text.split(' ')[0]
    plural_tag = first_entry.find('small')
    de_plural = plural_tag.text if plural_tag is not None else ''

    labelled = f"{de_article} {de_word}" if de_article else de_word
    return f"{labelled} - {de_plural}" if de_plural else labelled


In [8]:
def get_sentence_audio(de_sentence):
    """Unfinished helper, presumably meant to fetch audio for a sentence.

    Only the preparation exists so far: the Azure key and location are read
    through ``config`` and the token-endpoint URL and subscription header are
    built. No request is sent, ``de_sentence`` is not used yet, and the
    function returns None.
    """
    api_key = config('AZURE_API_KEY')
    location = config('AZURE_LOCATION')
    # Local to this function; does not touch the module-level ``headers``.
    token_headers = {'Ocp-Apim-Subscription-Key': api_key}
    token_url = f'https://{location}.api.cognitive.microsoft.com/sts/v1.0/issueToken'

    

In [26]:
def get_words_list(word, words_dict):
    """Build one output record from a Reverso favourite.

    Args:
        word: Favourite dict with ``srcLang`` plus ``srcText``/``trgText`` and
            ``srcContext``/``trgContext`` entries.
        words_dict: Template dict; it is copied, never modified.

    Returns:
        A copy of ``words_dict`` with ``de_word``, ``es_word``,
        ``de_sentence``, ``es_sentence`` and ``tags`` filled in. For words
        tagged 'sustantivo', ``de_word`` is replaced by the result of
        ``get_noun_article``.
    """
    words_dict_c = words_dict.copy()
    # German is on the source side when srcLang is 'de', otherwise on the target side.
    de_order, es_order = ('src', 'trg') if word['srcLang'] == 'de' else ('trg', 'src')

    words_dict_c['de_word'] = word[f'{de_order}Text']
    words_dict_c['es_word'] = word[f'{es_order}Text']
    # Parsing the context strips the <em> tags. The parser is named explicitly
    # so the result does not depend on which parsers are installed and no
    # "no parser was explicitly specified" warning is emitted.
    words_dict_c['de_sentence'] = BeautifulSoup(word[f'{de_order}Context'], "html.parser").get_text()
    words_dict_c['es_sentence'] = BeautifulSoup(word[f'{es_order}Context'], "html.parser").get_text()

    words_dict_c['tags'] = get_word_tag(words_dict_c['de_word'])

    if words_dict_c['tags'] == 'sustantivo':
        with_article = get_noun_article(words_dict_c['de_word'])
        # Keep the plain word if the article lookup produced nothing.
        if with_article:
            words_dict_c['de_word'] = with_article

    # TODO: 'de_audio' keeps its template value; audio generation is not wired in yet.
    return words_dict_c

In [27]:
def main():
    """Fetch all shared favourites, drop the already-saved ones and print the rest.

    The first request asks for 10 favourites and reads the total count from
    the response; when there are more, a second request asks for all the
    remaining ones. The combined list is passed through ``words_filter`` and
    each remaining entry is turned into a record with ``get_words_list``.
    Stops early, with a message, when a request returns no data.
    """
    # Template record; its keys also serve as the CSV header.
    words_dict = {
        'de_word': '',
        'de_sentence': '',
        'es_word': '',
        'es_sentence': '',
        'de_audio': '',
        'tags': ''
    }
    first_page_size = 10

    data = reverso_request(0, first_page_size)
    if data is None:
        # reverso_request returns None after printing an HTTP error.
        print("Could not fetch the favourites from Reverso; stopping.")
        return

    words_results = data['results']
    num_total_results = data['numTotalResults']
    if num_total_results > first_page_size:
        # Start where the previous request ended and ask for all the remaining words.
        remaining = reverso_request(first_page_size, num_total_results - first_page_size)
        if remaining is None:
            print("Could not fetch the remaining favourites from Reverso; stopping.")
            return
        words_results.extend(remaining['results'])

    filtered_words = words_filter(words_results, words_dict)

    words_dict_list = [get_words_list(f_w, words_dict) for f_w in filtered_words]
    print(len(words_dict_list))
    print(words_dict_list)

In [28]:
# Run the scraper only when this code is executed directly.
if __name__ == '__main__':
    main()

60
[{'de_word': 'die Garnele - pl.: die Garnelen', 'de_sentence': 'Wer würde nicht auch gerne eine Garnele essen?', 'es_word': 'camarón', 'es_sentence': '¿A quién no le gustaría comer un camarón?', 'de_audio': '', 'tags': 'sustantivo'}, {'de_word': 'der Zahn - pl.: die Zähne', 'de_sentence': 'Ich glaube, ein Zahn ist kaputt.', 'es_word': 'diente', 'es_sentence': 'Oh, creo que me rompí un diente.', 'de_audio': '', 'tags': 'sustantivo'}, {'de_word': 'nie', 'de_sentence': 'Ich habe Felix Bleibner nie vertraut.', 'es_word': 'nunca', 'es_sentence': 'Bueno, yo nunca me fié de Félix Bleibner.', 'de_audio': '', 'tags': 'adverbio'}, {'de_word': 'nicht mehr', 'de_sentence': 'Die Götter trauen dir nicht mehr.', 'es_word': 'ya no', 'es_sentence': 'Los dioses ya no pueden confiar en ti.', 'de_audio': '', 'tags': 'adverbio'}, {'de_word': 'gemütlich', 'de_sentence': "Ich find's hier sehr gemütlich.", 'es_word': 'cómodo', 'es_sentence': 'De hecho, estoy muy cómodo aquí.', 'de_audio': '', 'tags': 'adje