In [None]:
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

import time
import re
import os
import sqlite3

from itertools import cycle

# Open (or create, if it does not exist yet) the SQLite file used to cache
# downloaded page sources between runs.
conn = sqlite3.connect("French_English_Vocab.db")
# Start the Firefox session that get_page_source() drives through Selenium.
browser = webdriver.Firefox()

In [None]:
# Cell can be commented out, or skipped, after running one time.
# Re-running it is harmless: "if not exists" turns the statement into a no-op.

# Create the cursor in this cell instead of relying on a `cur` variable left
# over from an earlier kernel session, so the cell works after
# Restart Kernel -> Run All.
cur = conn.cursor()
cur.execute('''create table if not exists source_code
            (id integer primary key, word text, page_source text);
            ''')

<sqlite3.Cursor at 0x7feb74b4a6c0>

——————————— Wait for the Firefox window to finish loading before running the cells below ————————————

In [None]:
def string_to_list_of_pairs(vocab_list):
    '''Split a vocab string into [french, english] verb pairs.

    vocab_list is a string of single-word verbs separated by any amount of
    whitespace (spaces, tabs, newlines), alternating between a verb and its
    translation.  The word "to" of an English infinitive ("to speak") is
    dropped.  Everything is lower-cased.

    Returns a list of two-item lists, e.g. [['parler', 'speak'], ...].
    An empty or whitespace-only string yields an empty list.

    Raises ValueError if the number of words is odd, because the last verb
    would have no partner.
    '''
    lowered = vocab_list.lower()
    # Only strip "to" when it is a standalone word.  A plain
    # str.replace('to ', '') would also eat the end of any word ending in
    # "to" and glue it to the following word.
    without_to = re.sub(r'\bto\s+', '', lowered)
    # str.split() with no argument collapses runs of whitespace, so double
    # spaces or blank lines cannot introduce empty "words".
    words = without_to.split()

    if len(words) % 2:
        raise ValueError(
            f"Expected an even number of words (verb/translation pairs), "
            f"got {len(words)}: last unpaired word is {words[-1]!r}"
        )

    return [[words[k], words[k + 1]] for k in range(0, len(words), 2)]


def get_page_source(word, webpage, timeout=10, settle_seconds=3):
    '''Search the-conjugation.com for a verb and return the result page HTML.

    word           -- verb to type into the site's search field.
    webpage        -- URL of the language version of the-conjugation.com
                      to load before searching.
    timeout        -- seconds to wait for the landing page's <h1> element
                      to appear.
    settle_seconds -- fixed pause after submitting the search, giving the
                      result page time to load before its source is read.

    Uses the module-level `browser` Selenium driver.  Returns the page
    source as a string.  A load timeout is only reported, not raised; the
    search is still attempted afterwards.
    '''
    browser.get(webpage)

    # Check if webpage loaded before proceeding.
    try:
        element_present = expected_conditions.presence_of_element_located((By.TAG_NAME, 'h1'))
        WebDriverWait(browser, timeout).until(element_present)
    except TimeoutException:
        print("Timed out waiting for page to load")
    else:
        # `else`, not `finally`: only report success when the wait did not
        # time out.
        print("Page loaded")

    # Locate search field and input word.
    search_bar = browser.find_element(By.NAME, "q")
    webdriver.ActionChains(browser).click(search_bar).perform()
    search_bar.send_keys(word + Keys.RETURN)

    time.sleep(settle_seconds)
    return browser.page_source


def store_source_code(word, source_code, conn):
    '''Store the HTML source of a conjugated verb for offline use.

    word        -- the verb the page belongs to.
    source_code -- the page's HTML as a string.
    conn        -- open sqlite3 connection containing the source_code table.

    Inserts one row and commits.  The row id is one more than the largest
    existing id (0 for an empty table).
    '''
    cur = conn.cursor()
    # Derive the id from max(id) rather than count(*): once any row has been
    # deleted, count(*) points at an id that is still in use and the insert
    # fails with a primary-key conflict.
    (max_id,) = cur.execute('''select max(id) from source_code''').fetchone()
    next_id = 0 if max_id is None else max_id + 1
    cur.execute('''INSERT INTO source_code VALUES (?, ?, ?)''', (next_id, word, source_code))
    conn.commit()


def fetch_all_words(conn):
    '''Return a list of every word that has a row in the source_code table.'''
    cursor = conn.cursor()
    cursor.execute('''select word from source_code''')
    # Each row is a 1-tuple; unpack it to get the bare word.
    words = [word for (word,) in cursor.fetchall()]
    conn.commit()
    return words


def retrieve_source_code(word, conn):
    '''Retrieve the stored page source for a vocab word from the database.

    Returns the page_source column of the first row whose word matches.
    '''
    cursor = conn.cursor()
    cursor.execute('''select page_source from source_code where (word= ?)''', (word,))
    first_row = cursor.fetchone()
    stored_html = first_row[0]
    conn.commit()
    return stored_html


def parse_source_for_conjugate(verb_tense, source_code):
    '''Extract the conjugations of one tense from a verb's page source.

    verb_tense  -- tense name to look for in the page, matched literally.
    source_code -- HTML source string holding all conjugations of a verb.

    Returns a list of conjugation strings with the <b> markup removed.
    If the tense cannot be found, a message is printed and an empty list
    is returned.
    '''
    # re.escape: the tense name is literal text, so characters such as
    # parentheses must not be interpreted as regex syntax.
    # findall yields a list of captured strings; only the first is used.
    tense = re.findall(fr'{re.escape(verb_tense)}.*?tempscorps.>(.+?)div', source_code)
    if not tense:
        # Report the tense that was requested; do not rely on variables
        # from the calling cell, and always return a defined value.
        print(f"IndexError: no conjugations found for tense {verb_tense!r}.")
        return []

    # list of raw words
    messy_conjugates = re.split(r'</b><br>|<br>', tense[0])
    # list of cleaned up words; "</" is the leftover start of the closing
    # tag that the capture group stops in front of.
    return [
        verb_phrase.replace("<b>", "")
        for verb_phrase in messy_conjugates
        if verb_phrase != "</"
    ]


def sort_terms_into_dict(conjugated_lists):
    '''Pair the conjugations of one language with their translations.

    conjugated_lists is a two-item list: item 0 holds the lists of
    conjugations to be translated, item 1 the lists of translations.
    Both are flattened.  The translations become the dictionary keys and
    are reused cyclically, so every term of item 0 is appended (as
    " term.") to the string stored under its translation.

    Returns the resulting dictionary.
    '''
    terms = []
    for tense_terms in conjugated_lists[0]:
        terms.extend(tense_terms)

    translations = []
    for tense_translations in conjugated_lists[1]:
        translations.extend(tense_translations)

    # Every key starts with its own empty string to append to.
    paired_terms = {translation: '' for translation in translations}

    translation_cycle = cycle(translations)
    for term in terms:
        translation = next(translation_cycle)
        paired_terms[translation] = paired_terms[translation] + ' ' + term + '.'
    return paired_terms


def format_into_quizlet_upload(paired_terms, output):
    '''Append a dictionary of terms to a quizlet.com upload string.

    paired_terms maps each translation to its string of terms.  For every
    entry, "<terms>, <translation>;" is appended to output, after removing
    one leading space from the terms.  Returns the extended string.
    '''
    for translation, joined_terms in paired_terms.items():
        trimmed_terms = re.sub("^ ", "", joined_terms)
        output += f"{trimmed_terms}, {translation};"
    return output

In [None]:
# Pages of the-conjugation.com that get_page_source() loads before searching.
french_page = "https://www.the-conjugation.com/french/"
english_page = "https://www.the-conjugation.com/"
# Order matters: the main loop indexes webpages, verb_tenses and each
# [french, english] vocab pair with the same position (0 = French, 1 = English).
webpages = [french_page, english_page]

# Alternative tense selections kept for reference.
# temps_de_verbe = ["Indicatif", "Passé simple", "Passé composé", "Imparfait", "Futur simple", "Subjonctif"]
# verb_tenses = ["Simple present", "Simple past", "Future"]

# verb_tenses = [["Passé composé", "Imparfait", "Passé simple"], ["Simple past"]]

# Active selection: verb_tenses[0] holds the tense names searched for in the
# French page source, verb_tenses[1] those searched for in the English one.
verb_tenses = [["Conditionnel"], ["Conditional"]]

# verb_tenses = [["Passé composé", "Imparfait", "Passé simple", "Indicatif", "Futur simple", "Conditionnel", "Subjonctif"], ["Simple present"]]



In [None]:
# vocab_list = '''aller go
# avoir have'''

# One verb per line: the French infinitive, a space, then its English
# translation.  string_to_list_of_pairs() turns this string into
# [french, english] pairs.
vocab_list = '''parler speak
finir finish
entendre hear
être be
avoir have
aller go
faire make
prendre take
venir come
vouloir want
mettre put
savoir know
devoir have
voir see
dire say
pouvoir can
donner give
tenir hold'''

# vocab_list = '''être be
# devoir have'''

* Main

In [None]:
# Build the quizlet upload string for every verb pair in vocab_list.
# Page sources are fetched with the browser only for words that are not yet
# cached in the database.
starttime = time.time()

output = ""

print(vocab_list)

pairs_list = string_to_list_of_pairs(vocab_list)
print(pairs_list)

try:
    for pair in pairs_list:
        # conjugated_lists[i] collects one list of conjugations per tense
        # for pair[i]; position i also selects webpages[i] and verb_tenses[i].
        conjugated_lists = [[], []]
        for i, word in enumerate(pair):
            # Re-read the stored words each time: a word stored earlier in
            # this run must be found in the cache.
            stored_words = fetch_all_words(conn)
            if word not in stored_words:
                source_code = get_page_source(word, webpages[i])
                store_source_code(word, source_code, conn)
            else:
                source_code = retrieve_source_code(word, conn)
            for tense in verb_tenses[i]:
                # with tense and source, extract list of
                # conjugations
                conjugates = parse_source_for_conjugate(tense, source_code)
                conjugated_lists[i].append(conjugates)

        term_and_def_dict = sort_terms_into_dict(conjugated_lists)
        output = format_into_quizlet_upload(term_and_def_dict, output)
finally:
    # Release the database connection and the Firefox session even when a
    # lookup fails part-way through.  Re-run the setup cell before running
    # this cell again.
    conn.close()
    browser.quit()

print(output)

totaltime = time.time() - starttime
print(f"\n****** {totaltime//60:.0f}m {totaltime%60:.3f}s ******")

parler speak
finir finish
entendre hear
être be
avoir have
aller go
faire make
prendre take
venir come
vouloir want
mettre put
savoir know
devoir have
voir see
dire say
pouvoir can
donner give
tenir hold
[['parler', 'speak'], ['finir', 'finish'], ['entendre', 'hear'], ['être', 'be'], ['avoir', 'have'], ['aller', 'go'], ['faire', 'make'], ['prendre', 'take'], ['venir', 'come'], ['vouloir', 'want'], ['mettre', 'put'], ['savoir', 'know'], ['devoir', 'have'], ['voir', 'see'], ['dire', 'say'], ['pouvoir', 'can'], ['donner', 'give'], ['tenir', 'hold']]
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
je parlerais., I would/should speak;tu parlerais. vous parleriez., you would speak;il parlerait., he would speak;nous parlerions., we would/should speak;ils parleraient., they would speak;je finirais., I would/should finish;tu finirais. vous finiriez., you would finish;il finirait., he would finish;nous finirions., we would/s