In [145]:
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

import time
import re
import os

from itertools import cycle

# Start a Firefox session. get_page_source() uses this module-level `browser`
# object directly, so this cell must run before any page is fetched.
browser = webdriver.Firefox()

——————————— Wait for browser to load ————————————

In [146]:
def string_to_list_of_pairs(vocab_list):
    '''Split a vocabulary string into [word, translation] pairs.

    The input is a run of words separated by any whitespace (spaces,
    tabs, newlines). Words are lower-cased, the stand-alone infinitive
    marker "to" is dropped, and the remaining words are grouped two at a
    time in their original order.

    Args:
        vocab_list: string such as "aller to go\\navoir to have".

    Returns:
        A list of two-element lists, e.g. [['aller', 'go'], ['avoir', 'have']].
        An empty or whitespace-only string gives an empty list.

    Raises:
        ValueError: if an odd number of words remains, so the last word
            has no partner.
    '''
    # str.split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so blank lines or a trailing newline do
    # not produce empty "words" that would shift the pairing.
    # Dropping "to" as a whole token (rather than the substring "to ")
    # leaves words that merely end in "to" (e.g. "veto") intact.
    words = [word for word in vocab_list.lower().split() if word != 'to']

    if len(words) % 2:
        raise ValueError(
            f"Expected an even number of words, got {len(words)}: "
            f"'{words[-1]}' has no partner."
        )

    return [[words[k], words[k + 1]] for k in range(0, len(words), 2)]


def get_page_source(word, webpage, timeout=10, settle_seconds=3):
    '''Search for a verb on a conjugation page and return the result HTML.

    Uses the module-level `browser` to open `webpage`, waits for an <h1>
    element to appear, types `word` into the search field named "q",
    submits it, and returns the page source after a short pause.

    Args:
        word: verb to look up.
        webpage: URL of the language-specific version of
            the-conjugation.com to search on.
        timeout: maximum number of seconds to wait for the <h1> element.
        settle_seconds: fixed pause after submitting the search, giving
            the result page time to load before its source is read.

    Returns:
        The browser's page source (a string) after the search.
    '''
    browser.get(webpage)

    # Check if the webpage loaded before proceeding.
    try:
        element_present = expected_conditions.presence_of_element_located((By.TAG_NAME, 'h1'))
        WebDriverWait(browser, timeout).until(element_present)
    except TimeoutException:
        # Best effort: report the timeout and still attempt the search below.
        print("Timed out waiting for page to load")
    else:
        # `else`, not `finally`: only report success when the wait succeeded.
        print("Page loaded")

    # Locate search field and input word.
    search_bar = browser.find_element(By.NAME, "q")
    webdriver.ActionChains(browser).click(search_bar).perform()
    search_bar.send_keys(word + Keys.RETURN)

    time.sleep(settle_seconds)
    return browser.page_source


def store_source_code(word, source_code, storage_dict):
    '''Cache a verb's page source under its word.

    Writes `source_code` into `storage_dict` with `word` as the key
    (replacing any earlier entry) and hands the same dictionary back.
    '''
    storage_dict[word] = source_code
    return storage_dict


def parse_source_for_conjugate(verb_tense, source_code):
    '''Extract the conjugations of one tense from a verb's page source.

    Looks for the first occurrence of `verb_tense` in `source_code`,
    takes the text that follows the next "tempscorps" marker up to the
    next "div", splits it on <br> line breaks and strips the <b> tags.

    Args:
        verb_tense: tense heading exactly as it appears in the page,
            e.g. "Imparfait". It is matched literally.
        source_code: HTML source string of the conjugation page.

    Returns:
        A list of conjugated phrases in page order. If the tense cannot
        be found, a message is printed and an empty list is returned.
    '''
    # re.escape keeps regex metacharacters in a tense name (e.g. parentheses)
    # from being interpreted as pattern syntax.
    pattern = fr'{re.escape(verb_tense)}.*?tempscorps.>(.+?)div'
    tense = re.findall(pattern, source_code)

    if not tense:
        # Report using only this function's own arguments, and return a
        # defined value so the caller can carry on with the other tenses.
        print(f"No conjugations found for tense '{verb_tense}'.")
        return []

    # Raw phrases, still carrying "<b>" and a trailing "</" fragment left
    # over from the closing tag that the pattern cut in half.
    messy_conjugates = re.split(r'</b><br>|<br>', tense[0])

    return [
        re.sub("<b>", "", verb_phrase)
        for verb_phrase in messy_conjugates
        if verb_phrase != "</"
    ]


def sort_terms_into_dict(conjugated_lists):
    '''Pair each translation with the conjugations that belong to it.

    Args:
        conjugated_lists: two-element list. Element 0 holds one list of
            conjugations per tense in the first language; element 1
            holds the same for the second language. Both are flattened
            in order.

    Returns:
        A dict keyed by the flattened second-language phrases. Each
        first-language phrase is appended, as " <phrase>.", to the key
        at the same position, wrapping around to the first key whenever
        the keys run out. Returns an empty dict when there are no
        second-language phrases to use as keys.
    '''
    first_flatlist = [conj for sublist in conjugated_lists[0] for conj in sublist]
    second_flatlist = [conj for sublist in conjugated_lists[1] for conj in sublist]

    # Each key needs its own string; dict.fromkeys with a mutable default
    # would share one object between all keys.
    paired_terms = {key: '' for key in second_flatlist}

    # Nothing to pair against: cycling an empty list would raise
    # StopIteration on the first next().
    if not paired_terms:
        return paired_terms

    for term, key in zip(first_flatlist, cycle(second_flatlist)):
        paired_terms[key] += ' ' + term + '.'
    return paired_terms


def format_into_quizlet_upload(paired_terms, output):
    '''Append dictionary entries to a quizlet.com import string.

    Every entry becomes "<value>, <key>;" (value first, then key), in
    dictionary order, and the pieces are added to the end of `output`.
    The extended string is returned.
    '''
    cards = "".join(
        f"{translation}, {term};" for term, translation in paired_terms.items()
    )
    return output + cards

In [147]:
# Search pages, one per language. The main loop looks up pair[i] on
# webpages[i], so the order here must match the word order in each
# vocabulary pair.
french_page = "https://www.the-conjugation.com/french/"
english_page = "https://www.the-conjugation.com/"
webpages = [french_page, english_page]

# Earlier tense selections, kept for reference:
# temps_de_verbe = ["Indicatif", "Passé simple", "Passé composé", "Imparfait", "Futur simple", "Subjonctif"]
# verb_tenses = ["Simple present", "Simple past", "Future"]

# Tense headings to extract for each language, index-aligned with `webpages`
# (verb_tenses[i] is applied to the page fetched from webpages[i]).
verb_tenses = [["Passé composé", "Imparfait", "Passé simple"], ["Simple past"]]

In [148]:
# Vocabulary to conjugate, as whitespace-separated word pairs. Here each line
# holds a French verb followed by its English translation, matching the
# [french_page, english_page] order of `webpages`.
vocab_list = '''aller go
avoir have'''

* Run cell below to empty the source code storage

In [149]:
# Maps each looked-up word to the page source fetched for it. The main cell
# checks this dict first and only drives the browser for words not yet stored.
source_storage = {}

* Main

In [162]:
# Wall-clock timer for the whole run (reported at the bottom of the cell).
starttime = time.time()

output = ""

pairs_list = string_to_list_of_pairs(vocab_list)

for pair in pairs_list:
    # One bucket per language; each bucket collects one list of
    # conjugations per requested tense.
    conjugated_lists = [[], []]
    for i, word in enumerate(pair):
        # Reuse stored page source when available; otherwise fetch and store it.
        if word in source_storage:
            source_code = source_storage[word]
        else:
            source_code = get_page_source(word, webpages[i])
            store_source_code(word, source_code, source_storage)
        for tense in verb_tenses[i]:
            conjugates = parse_source_for_conjugate(tense, source_code)
            conjugated_lists[i].append(conjugates)
    # Pair up the two languages and add this verb's cards to the output.
    term_and_def_dict = sort_terms_into_dict(conjugated_lists)
    output = format_into_quizlet_upload(term_and_def_dict, output)

print(output)

totaltime = time.time() - starttime
print(f"\n****** {totaltime//60:.0f}m {totaltime%60:.3f}s ******")

 je vais. je suis allé., I went; tu vas. tu es allé., you  went; il va. il est allé., he went; nous allons. nous sommes allés., we went; vous allez. vous êtes allés., you went; ils vont. ils sont allés., they went; j'ai. j'ai eu., I had; tu as. tu as eu., you  had; il a. il a eu., he had; nous avons. nous avons eu., we had; vous avez. vous avez eu., you had; ils ont. ils ont eu., they had;

****** 0m 0.006s ******
