In [1]:
#!pip install beautifulsoup4
#!pip install requests
#!conda install pandas
#!pip install tqdm

In [2]:
import os

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [3]:
def clean_text(text):
    """Normalise a label string.

    Every '/' character is removed, leading/trailing whitespace is trimmed
    and the result is lower-cased.
    """
    without_slashes = text.replace('/', '')
    trimmed = without_slashes.strip()
    return trimmed.lower()


def process_mode(row):
    """Return the text of ``row.td`` after normalising it with ``clean_text``."""
    label_cell = row.td
    return clean_text(label_cell.text)


def process_times(row):
    """Return a tuple holding the cleaned text of every ``td`` cell in ``row``.

    Cells are taken in the order ``row.find_all('td')`` yields them and each
    text is normalised with ``clean_text``.
    """
    labels = []
    for cell in row.find_all('td'):
        labels.append(clean_text(cell.text))
    return tuple(labels)


def process_conj(row, mode, times, infinitive):
    """Turn one table row into three conjugation records.

    The row's ``td`` cells are read as three consecutive (person, form)
    pairs: cells 0/1, 2/3 and 4/5. Pair ``k`` is labelled with ``times[k]``.

    Args:
        row: object exposing ``find_all('td')``; each cell needs ``.text``.
        mode: mode label stored unchanged in every record.
        times: indexable with at least three tense labels.
        infinitive: infinitive stored unchanged in every record.

    Returns:
        A list of three dicts with keys ``infinitive``, ``pess``, ``conj``,
        ``time`` and ``mode``. ``pess`` is the last whitespace-separated
        token of the person cell; ``conj`` is the raw text of the form cell.
    """
    cells = row.find_all('td')
    records = []

    for slot in range(3):
        person_idx = 2 * slot
        records.append({
            'infinitive': infinitive,
            'pess': cells[person_idx].text.split()[-1],
            'conj': cells[person_idx + 1].text,
            'time': times[slot],
            'mode': mode,
        })

    return records






def process_inf_per_conj(row, mode, times, infinitive):
    """Build a single record from the third ``td`` cell (index 2) of ``row``.

    Presumably this is the personal-infinitive column of the imperative
    section; confirm against the site layout.

    Args:
        row: object exposing ``find_all('td')``; cell 2 needs ``.text``.
        mode: mode label stored unchanged in the record.
        times: indexable; ``times[2]`` becomes the record's ``time``.
        infinitive: infinitive stored unchanged in the record.

    Returns:
        A one-element list with a dict whose ``pess`` is the last
        whitespace-separated token of the cell text and whose ``conj`` is
        the second non-empty word obtained by splitting on single spaces.
    """
    cells = row.find_all('td')
    target = cells[2]

    person = target.text.split()[-1]
    # Split on ' ' only and discard empty pieces, so runs of spaces collapse
    # while other whitespace characters stay attached to their word.
    words = [piece for piece in target.text.split(' ') if piece]

    record = {
        'infinitive': infinitive,
        'pess': person,
        'conj': words[1],
        'time': times[2],
        'mode': mode,
    }
    return [record]


def process_imp_neg_conj(row, mode, times, infinitive):
    """Build a single record from the second ``td`` cell (index 1) of ``row``.

    Presumably this is the negative-imperative column; confirm against the
    site layout.

    Args:
        row: object exposing ``find_all('td')``; cell 1 needs ``.text``.
        mode: mode label stored unchanged in the record.
        times: indexable; ``times[1]`` becomes the record's ``time``.
        infinitive: infinitive stored unchanged in the record.

    Returns:
        A one-element list with a dict whose ``pess`` is the last
        whitespace-separated token of the cell text and whose ``conj`` is
        the second non-empty word obtained by splitting on single spaces
        (the word after the leading one).
    """
    cells = row.find_all('td')
    target = cells[1]

    person = target.text.split()[-1]
    # Split on ' ' only and discard empty pieces, so runs of spaces collapse.
    words = [piece for piece in target.text.split(' ') if piece]

    record = {
        'infinitive': infinitive,
        'pess': person,
        'conj': words[1],
        'time': times[1],
        'mode': mode,
    }
    return [record]


def process_imp_af_conj(row, mode, times, infinitive):
    """Build a single record from the first ``td`` cell (index 0) of ``row``.

    Presumably this is the affirmative-imperative column; confirm against
    the site layout.

    Args:
        row: object exposing ``find_all('td')``; cell 0 needs ``.text``.
        mode: mode label stored unchanged in the record.
        times: indexable; ``times[0]`` becomes the record's ``time``.
        infinitive: infinitive stored unchanged in the record.

    Returns:
        A one-element list with a dict whose ``conj`` is the first and whose
        ``pess`` is the last whitespace-separated token of the cell text.
    """
    first_cell = row.find_all('td')[0]

    record = {
        'infinitive': infinitive,
        'pess': first_cell.text.split()[-1],
        'conj': first_cell.text.split()[0],
        'time': times[0],
        'mode': mode,
    }
    return [record]


def get_gerundio(verb, soup):
    """Return a one-record list holding the gerund of ``verb``.

    The form is the stripped text after the last ':' in the element that
    ``soup.find(class_='gerund')`` returns. ``time`` and ``pess`` are None.
    If ``find`` returns None, the attribute access raises AttributeError.
    """
    label = soup.find(class_='gerund').text
    gerund_form = label.split(':')[-1].strip()

    return [{
        'infinitive': verb,
        'mode': 'gerundio',
        'time': None,
        'pess': None,
        'conj': gerund_form,
    }]


def get_participio(verb, soup):
    """Return a one-record list holding the participle of ``verb``.

    The form is the stripped text after the last ':' in the element that
    ``soup.find(class_='partpass')`` returns. ``time`` and ``pess`` are None.
    If ``find`` returns None, the attribute access raises AttributeError.
    """
    label = soup.find(class_='partpass').text
    participle_form = label.split(':')[-1].strip()

    return [{
        'infinitive': verb,
        'mode': 'participio',
        'time': None,
        'pess': None,
        'conj': participle_form,
    }]


def get_indicativo(infinitive:str, table:list) -> list:
    """Collect records from the fixed row positions of the indicative section.

    ``table`` must expose ``find_all('tr')`` (despite the ``list``
    annotation). Rows are addressed by position:

    * row 3 provides the mode label;
    * rows 4 and 12 provide the tense labels for the rows that follow;
    * rows 5-10 and 14-19 are conjugation rows handled by ``process_conj``.

    Returns the concatenated list of record dicts.
    """
    records = []

    for index, row in enumerate(table.find_all('tr')):
        if index == 3:
            mode = process_mode(row)
        elif index in (4, 12):
            times = process_times(row)
        elif index in range(5, 11) or index in range(14, 20):
            records.extend(process_conj(row, mode, times, infinitive))

    return records


def get_subjuntivo(infinitive:str, table:list) -> list:
    """Collect records from the fixed row positions of the subjunctive section.

    ``table`` must expose ``find_all('tr')`` (despite the ``list``
    annotation). Rows are addressed by position:

    * rows 3 and 21 set the mode label (row 21 overrides row 3 before any
      conjugation row is processed);
    * row 22 provides the tense labels;
    * rows 23-28 are conjugation rows handled by ``process_conj``.

    Returns the concatenated list of record dicts.
    """
    records = []

    for index, row in enumerate(table.find_all('tr')):
        if index in (3, 21):
            mode = process_mode(row)
        elif index == 22:
            times = process_times(row)
        elif index in range(23, 29):
            records.extend(process_conj(row, mode, times, infinitive))

    return records


def get_imperativo(infinitive:str, table:list):
    """Collect records from the fixed row positions of the imperative section.

    ``table`` must expose ``find_all('tr')`` (despite the ``list``
    annotation). Rows are addressed by position:

    * row 30 provides the mode label and row 31 the tense labels;
    * rows 32-37 each yield a ``process_inf_per_conj`` record;
    * rows 33-37 additionally yield ``process_imp_af_conj`` and
      ``process_imp_neg_conj`` records (row 32 is skipped for those two,
      presumably because the imperative has no first-person singular;
      verify against the site).

    Returns the concatenated list of record dicts.
    """
    records = []

    for index, row in enumerate(table.find_all('tr')):
        if index == 30:
            mode = process_mode(row)
        elif index == 31:
            times = process_times(row)
        elif index in range(32, 38):
            parsers = [process_inf_per_conj]
            if index != 32:
                parsers += [process_imp_af_conj, process_imp_neg_conj]
            for parser in parsers:
                records.extend(parser(row, mode, times, infinitive))

    return records


def get_conjugations(verb, timeout=10):
    """Download and parse the conjugation page of ``verb`` from conjuga-me.net.

    Args:
        verb: infinitive, interpolated into the page URL.
        timeout: seconds passed to ``requests.get``. Without it a stalled
            connection would block forever and freeze a bulk download.

    Returns:
        The concatenated records for gerund, participle, indicative,
        subjunctive and imperative, in that order. An empty list is returned
        (after printing a message) when parsing raises AttributeError, e.g.
        because an expected element is missing from the page.

    Raises:
        requests.exceptions.RequestException: network failures, including
            timeouts, are not swallowed and propagate to the caller.
    """
    try:
        r = requests.get(f"http://www.conjuga-me.net/verbo-{verb}", timeout=timeout)

        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find(class_="conj")

        gerundio = get_gerundio(verb, soup)
        participio = get_participio(verb, soup)

        indicativo = get_indicativo(verb, table)
        subjuntivo = get_subjuntivo(verb, table)
        imperativo = get_imperativo(verb, table)

        return gerundio + participio + indicativo + subjuntivo + imperativo

    except AttributeError:
        # soup.find(...) returned None for one of the expected elements, so
        # the page does not have the conjugation layout.
        print(f"verb '{verb}' not found")
        return []

In [4]:
def simple_test(verb):
    """Smoke test: every record parsed for ``verb`` has a known ``pess`` value.

    Allowed values are the six person labels plus None (used by the gerund
    and participle records). Raises AssertionError otherwise.
    """
    allowed_persons = {None, 'eu', 'tu', 'ele/ela', 'nós', 'vós', 'eles/elas'}

    checks = []
    for record in get_conjugations(verb):
        checks.append(record['pess'] in allowed_persons)

    assert all(checks)


simple_test('ouvir')

In [5]:
# Read the verb list, drop rows with missing values and keep the `verbs`
# column as an array.
verbs = (
    pd.read_csv('top_1000_verbs_pt.csv')
    .dropna()
    .verbs
    .values
)

In [6]:
# Exclusive upper bound for the batch end index: batches end at 100, 200, ..., 1000.
first = 1001

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first batch is written.
os.makedirs('conjugations', exist_ok=True)

for i in range(100, first, 100):
    print(f"downloading {i}")

    # Records of the current batch only; reset for every batch of 100 verbs.
    parsed = []

    for verb in tqdm(verbs[i - 100:i]):
        parsed.extend(get_conjugations(verb))

    # One CSV per batch, named after the batch's end index.
    pd.DataFrame(parsed).to_csv(f'conjugations/conjugations_{i}_pt.csv', index=False)

downloading 100


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 200


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 300


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 400


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 500


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 600


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 700


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 800


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 900


HBox(children=(IntProgress(value=0), HTML(value='')))


downloading 1000


HBox(children=(IntProgress(value=0, max=97), HTML(value='')))




In [18]:
# Preview the records parsed for 'ouvir' as a DataFrame (one row per record).
# Note: this calls get_conjugations again, which issues a new HTTP request.
pd.DataFrame(get_conjugations('ouvir'))

Unnamed: 0,conj,infinitive,mode,pess,time
0,ouvindo,ouvir,gerundio,,
1,ouvido,ouvir,participio,,
2,ouço ≈ oiço,ouvir,indicativo,eu,presente
3,ouvi,ouvir,indicativo,eu,pretérito perfeito
4,ouvia,ouvir,indicativo,eu,pretérito imperfeito
5,ouves,ouvir,indicativo,tu,presente
6,ouviste,ouvir,indicativo,tu,pretérito perfeito
7,ouvias,ouvir,indicativo,tu,pretérito imperfeito
8,ouve,ouvir,indicativo,ele/ela,presente
9,ouviu,ouvir,indicativo,ele/ela,pretérito perfeito


In [14]:
# Inspect `parsed`, the list of record dicts as last assigned by the download
# loop (it is reset for every batch, so only one batch is held here).
parsed

[{'conj': 'creio',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'eu',
  'time': 'presente'},
 {'conj': 'cri',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'eu',
  'time': 'pretérito perfeito'},
 {'conj': 'cria',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'eu',
  'time': 'pretérito imperfeito'},
 {'conj': 'crês',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'tu',
  'time': 'presente'},
 {'conj': 'creste',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'tu',
  'time': 'pretérito perfeito'},
 {'conj': 'crias',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'tu',
  'time': 'pretérito imperfeito'},
 {'conj': 'crê',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'ele/ela',
  'time': 'presente'},
 {'conj': 'creu',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'ele/ela',
  'time': 'pretérito perfeito'},
 {'conj': 'cria',
  'infinitive': 'crer',
  'mode': 'indicativo',
  'pess': 'ele/ela',
  'time

In [7]:
# Verb used by the ad-hoc scraping cells below. This rebinds `verb`, which the
# download loop also uses as its loop variable.
verb = 'ser'

In [8]:
# Fetch and parse the page for `verb` by hand, mirroring the first steps of
# get_conjugations. The timeout keeps a stalled connection from blocking the
# kernel indefinitely.
r = requests.get(f"http://www.conjuga-me.net/verbo-{verb}", timeout=10)

soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find(class_="conj")

'sendo'

In [13]:
# Stripped text after the last ':' in the element with class 'partpass';
# the same expression get_participio uses to extract the participle.
soup.find(class_='partpass').text.split(':')[-1].strip()

'sido'