In [1]:
#!pip install beautifulsoup4
#!pip install requests
#!conda install pandas
#!pip install tqdm

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [8]:
def clean_text(text):
    """Normalise a piece of cell text.

    Every '/' is removed, surrounding whitespace is trimmed, and the result
    is lower-cased, in that order.
    """
    without_slashes = text.replace('/', '')
    trimmed = without_slashes.strip()
    return trimmed.lower()


def process_mode(row):
    """Return the cleaned text of the first <td> cell of ``row``."""
    first_cell = row.td
    return clean_text(first_cell.text)


def process_times(row):
    """Return a tuple with the cleaned text of every <td> cell in ``row``."""
    cleaned = []
    for cell in row.find_all('td'):
        cleaned.append(clean_text(cell.text))
    return tuple(cleaned)


def process_conj(row, mode, times, infinitive):
    """Turn one table row into three conjugation records.

    The first six <td> cells of ``row`` are read as three consecutive
    (pess, conj) pairs; the k-th pair is labelled with ``times[k]``.
    Cell text is stored as-is (it is not passed through ``clean_text``).

    Returns a list of three dicts with the keys ``infinitive``, ``pess``,
    ``conj``, ``time`` and ``mode``.
    """
    cells = row.find_all('td')

    return [
        {
            'infinitive': infinitive,
            'pess': cells[2 * k].text,
            'conj': cells[2 * k + 1].text,
            'time': times[k],
            'mode': mode,
        }
        for k in range(3)
    ]



def get_indicativo(infinitive: str, table) -> list:
    """Collect conjugation records from a parsed conjugation table.

    Rows are addressed by their position among the table's ``<tr>`` elements:
    row 3 supplies the mode name, rows 4 and 12 supply tense headers, and rows
    5-10 and 14-19 supply conjugations, each filed under the most recently
    read tense header.

    Parameters
    ----------
    infinitive : str
        Verb stored in the ``infinitive`` field of every record.
    table : object exposing ``find_all``, or None
        The parsed table element, e.g. the result of
        ``soup.find(class_="conj")``. ``None`` means the element was not
        found on the page.

    Returns
    -------
    list of dict
        The records produced by ``process_conj`` for every conjugation row.
        Empty when ``table`` is ``None``.
    """
    # BeautifulSoup's find() yields None when nothing matches; without this
    # guard a single page lacking the table would raise AttributeError and
    # abort a whole scraping run.
    if table is None:
        import warnings

        warnings.warn(f"no conjugation table found for {infinitive!r}; skipping")
        return []

    verbs = []

    for i, row in enumerate(table.find_all('tr')):
        if i == 3:
            mode = process_mode(row)
        if i == 4 or i == 12:
            # A new header row replaces the tense labels used by the rows below it.
            times = process_times(row)
        if 5 <= i <= 10 or 14 <= i <= 19:
            verbs.extend(process_conj(row, mode, times, infinitive))

    return verbs

def get_conjugations(verb, timeout=10):
    """Download the conjuga-me.net page for ``verb`` and parse its conjugations.

    Parameters
    ----------
    verb : str
        Infinitive, interpolated into the page URL.
    timeout : float or tuple, optional
        Passed to ``requests.get`` as its ``timeout`` argument (seconds).
        Defaults to 10.

    Returns
    -------
    list of dict
        Whatever ``get_indicativo`` extracts from the first element with
        class ``conj`` on the page.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server responds with a 4xx or 5xx status.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    # requests waits indefinitely by default; a timeout stops one stalled
    # connection from hanging a loop over many verbs.
    response = requests.get(f"http://www.conjuga-me.net/verbo-{verb}", timeout=timeout)
    # Fail on error responses instead of parsing an error page as conjugations.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return get_indicativo(verb, soup.find(class_="conj"))

In [9]:
def clean_verb(verb):
    """Strip square brackets from ``verb``, then lower-case and trim it."""
    unbracketed = verb
    for bracket in ('[', ']'):
        unbracketed = unbracketed.replace(bracket, '')
    return unbracketed.lower().strip()

# Alternative source, kept for reference: a table on the clipboard whose
# `verb` column is normalised with clean_verb.
#verbs = pd.read_clipboard().assign(verb=lambda df: df.verb.apply(clean_verb)).verb.values
# Load the `verbs` column of the saved list as a NumPy array, then preview
# its first entry.
verbs = pd.read_csv('top_1000_verbs_pt.csv')['verbs'].values
verbs[:1]

array(['ser'], dtype=object)

In [10]:
# One-element slice of the verb list.
# NOTE(review): no visible cell reads `parsed_verbs` — confirm it is still needed.
parsed_verbs = verbs[:1]

In [13]:
# Scrape the first 100 verbs, one HTTP request per verb, accumulating every
# conjugation record in a single flat list. tqdm renders the progress bar.
parsed = []

for verb in tqdm(verbs[:100]):
    parsed += get_conjugations(verb)


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:00<01:24,  1.16it/s][A
  2%|▏         | 2/100 [00:01<01:26,  1.14it/s][A
  3%|▎         | 3/100 [00:04<02:23,  1.48s/it][A
  4%|▍         | 4/100 [00:05<02:04,  1.29s/it][A
  5%|▌         | 5/100 [00:06<01:50,  1.17s/it][A
  6%|▌         | 6/100 [00:07<01:42,  1.09s/it][A
  7%|▋         | 7/100 [00:08<01:36,  1.04s/it][A
  8%|▊         | 8/100 [00:09<01:41,  1.10s/it][A
  9%|▉         | 9/100 [00:11<02:01,  1.34s/it][A
 10%|█         | 10/100 [00:12<01:46,  1.18s/it][A
 11%|█         | 11/100 [00:13<01:36,  1.09s/it][A
 12%|█▏        | 12/100 [00:14<01:33,  1.07s/it][A
 13%|█▎        | 13/100 [00:14<01:27,  1.00s/it][A
 14%|█▍        | 14/100 [00:15<01:21,  1.06it/s][A
 15%|█▌        | 15/100 [00:16<01:16,  1.11it/s][A
 16%|█▌        | 16/100 [00:17<01:16,  1.10it/s][A
 17%|█▋        | 17/100 [00:18<01:14,  1.12it/s][A
 18%|█▊        | 18/100 [00:19<01:18,  1.05it/s][A
 19%|█▉        | 19/100 [00:2

In [14]:
# Save the scraped records to CSV (one row per record, index column omitted).
pd.DataFrame(parsed).to_csv('top_100_pt.csv', index=False)

In [13]:
# Display the first 100 verbs — the same slice the scraping loop iterates over.
verbs[:100]

array(['ser', 'ter', 'estar', 'poder', 'fazer', 'ir', 'haver', 'dizer',
       'dar', 'ver', 'saber', 'querer', 'ficar', 'dever', 'passar', 'vir',
       'chegar', 'falar', 'deixar', 'encontrar', 'levar', 'começar',
       'partir', 'pensar', 'parecer', 'apresentar', 'olhar', 'tornar',
       'sair', 'voltar', 'conseguir', 'achar', 'existir', 'sentir',
       'entrar', 'chamar', 'conhecer', 'considerar', 'pôr', 'continuar',
       'viver', 'ouvir', 'tomar', 'acabar', 'receber', 'perder', 'andar',
       'trabalhar', 'criar', 'pedir', 'seguir', 'contar', 'acontecer',
       'afirmar', 'tratar', 'esperar', 'gostar', 'usar', 'manter',
       'realizar', 'abrir', 'escrever', 'permitir', 'ocorrer', 'mostrar',
       'lembrar', 'trazer', 'procurar', 'morrer', 'tentar', 'formar',
       'aparecer', 'incluir', 'cair', 'correr', 'ganhar', 'surgir',
       'nascer', 'pagar', 'representar', 'entender', 'produzir', 'ler',
       'precisar', 'perguntar', 'constituir', 'colocar', 'possuir',
       '

In [14]:
# Ten decks: the first holds the first ten entries of `verbs`; the remaining
# nine start out as separate empty lists.
decks = [list(verbs[:10])] + [[] for _ in range(9)]

In [18]:
# Persist the verb list as a one-column CSV named `verbs`. This is the same
# file the loading cell reads with pd.read_csv, so running it rewrites that input.
pd.DataFrame({'verbs': verbs}).to_csv('top_1000_verbs_pt.csv', index=False)